""" Document management endpoints for the Virtual Board Member AI System. """ import asyncio import logging from typing import List, Optional, Dict, Any from pathlib import Path import uuid from datetime import datetime from fastapi import APIRouter, Depends, HTTPException, UploadFile, File, Form, BackgroundTasks, Query from fastapi.responses import JSONResponse from sqlalchemy.orm import Session from sqlalchemy import and_, or_ from app.core.database import get_db from app.core.auth import get_current_user, get_current_tenant from app.models.document import Document, DocumentType, DocumentTag, DocumentVersion from app.models.user import User from app.models.tenant import Tenant from app.services.document_processor import DocumentProcessor from app.services.vector_service import VectorService from app.services.storage_service import StorageService from app.services.document_organization import DocumentOrganizationService logger = logging.getLogger(__name__) router = APIRouter() @router.post("/upload") async def upload_document( background_tasks: BackgroundTasks, file: UploadFile = File(...), title: str = Form(...), description: Optional[str] = Form(None), document_type: DocumentType = Form(DocumentType.OTHER), tags: Optional[str] = Form(None), # Comma-separated tag names db: Session = Depends(get_db), current_user: User = Depends(get_current_user), current_tenant: Tenant = Depends(get_current_tenant) ): """ Upload and process a single document with multi-tenant support. """ try: # Validate file if not file.filename: raise HTTPException(status_code=400, detail="No file provided") # Check file size (50MB limit) if file.size and file.size > 50 * 1024 * 1024: # 50MB raise HTTPException(status_code=400, detail="File too large. Maximum size is 50MB") # Create document record document = Document( id=uuid.uuid4(), title=title, description=description, document_type=document_type, filename=file.filename, file_path="", # Will be set after saving file_size=0, # Will be updated after storage mime_type=file.content_type or "application/octet-stream", uploaded_by=current_user.id, organization_id=current_tenant.id, processing_status="pending" ) db.add(document) db.commit() db.refresh(document) # Save file using storage service storage_service = StorageService(current_tenant) storage_result = await storage_service.upload_file(file, str(document.id)) # Update document with storage information document.file_path = storage_result["file_path"] document.file_size = storage_result["file_size"] document.document_metadata = { "storage_url": storage_result["storage_url"], "checksum": storage_result["checksum"], "uploaded_at": storage_result["uploaded_at"] } db.commit() # Process tags if tags: tag_names = [tag.strip() for tag in tags.split(",") if tag.strip()] await _process_document_tags(db, document, tag_names, current_tenant) # Start background processing background_tasks.add_task( _process_document_background, document.id, str(file_path), current_tenant.id ) return { "message": "Document uploaded successfully", "document_id": str(document.id), "status": "processing" } except Exception as e: logger.error(f"Error uploading document: {str(e)}") raise HTTPException(status_code=500, detail="Failed to upload document") @router.post("/upload/batch") async def upload_documents_batch( background_tasks: BackgroundTasks, files: List[UploadFile] = File(...), titles: List[str] = Form(...), descriptions: Optional[List[str]] = Form(None), document_types: Optional[List[DocumentType]] = Form(None), db: Session = Depends(get_db), current_user: User = Depends(get_current_user), current_tenant: Tenant = Depends(get_current_tenant) ): """ Upload and process multiple documents (up to 50 files) with multi-tenant support. """ try: if len(files) > 50: raise HTTPException(status_code=400, detail="Maximum 50 files allowed per batch") if len(files) != len(titles): raise HTTPException(status_code=400, detail="Number of files must match number of titles") documents = [] for i, file in enumerate(files): # Validate file if not file.filename: continue # Check file size if file.size and file.size > 50 * 1024 * 1024: # 50MB continue # Create document record document_type = document_types[i] if document_types and i < len(document_types) else DocumentType.OTHER description = descriptions[i] if descriptions and i < len(descriptions) else None document = Document( id=uuid.uuid4(), title=titles[i], description=description, document_type=document_type, filename=file.filename, file_path="", file_size=0, # Will be updated after storage mime_type=file.content_type or "application/octet-stream", uploaded_by=current_user.id, organization_id=current_tenant.id, processing_status="pending" ) db.add(document) documents.append((document, file)) db.commit() # Save files using storage service and start processing storage_service = StorageService(current_tenant) for document, file in documents: # Upload file to storage storage_result = await storage_service.upload_file(file, str(document.id)) # Update document with storage information document.file_path = storage_result["file_path"] document.file_size = storage_result["file_size"] document.document_metadata = { "storage_url": storage_result["storage_url"], "checksum": storage_result["checksum"], "uploaded_at": storage_result["uploaded_at"] } # Start background processing background_tasks.add_task( _process_document_background, document.id, document.file_path, current_tenant.id ) db.commit() return { "message": f"Uploaded {len(documents)} documents successfully", "document_ids": [str(doc.id) for doc, _ in documents], "status": "processing" } except Exception as e: logger.error(f"Error uploading documents batch: {str(e)}") raise HTTPException(status_code=500, detail="Failed to upload documents") @router.get("/") async def list_documents( skip: int = Query(0, ge=0), limit: int = Query(100, ge=1, le=1000), document_type: Optional[DocumentType] = Query(None), search: Optional[str] = Query(None), tags: Optional[str] = Query(None), # Comma-separated tag names status: Optional[str] = Query(None), db: Session = Depends(get_db), current_user: User = Depends(get_current_user), current_tenant: Tenant = Depends(get_current_tenant) ): """ List documents with filtering and search capabilities. """ try: query = db.query(Document).filter(Document.organization_id == current_tenant.id) # Apply filters if document_type: query = query.filter(Document.document_type == document_type) if status: query = query.filter(Document.processing_status == status) if search: search_filter = or_( Document.title.ilike(f"%{search}%"), Document.description.ilike(f"%{search}%"), Document.filename.ilike(f"%{search}%") ) query = query.filter(search_filter) if tags: tag_names = [tag.strip() for tag in tags.split(",") if tag.strip()] # This is a simplified tag filter - in production, you'd use a proper join for tag_name in tag_names: query = query.join(Document.tags).filter(DocumentTag.name.ilike(f"%{tag_name}%")) # Apply pagination total = query.count() documents = query.offset(skip).limit(limit).all() return { "documents": [ { "id": str(doc.id), "title": doc.title, "description": doc.description, "document_type": doc.document_type, "filename": doc.filename, "file_size": doc.file_size, "processing_status": doc.processing_status, "created_at": doc.created_at.isoformat(), "updated_at": doc.updated_at.isoformat(), "tags": [{"id": str(tag.id), "name": tag.name} for tag in doc.tags] } for doc in documents ], "total": total, "skip": skip, "limit": limit } except Exception as e: logger.error(f"Error listing documents: {str(e)}") raise HTTPException(status_code=500, detail="Failed to list documents") @router.get("/{document_id}") async def get_document( document_id: str, db: Session = Depends(get_db), current_user: User = Depends(get_current_user), current_tenant: Tenant = Depends(get_current_tenant) ): """ Get document details by ID. """ try: document = db.query(Document).filter( and_( Document.id == document_id, Document.organization_id == current_tenant.id ) ).first() if not document: raise HTTPException(status_code=404, detail="Document not found") return { "id": str(document.id), "title": document.title, "description": document.description, "document_type": document.document_type, "filename": document.filename, "file_size": document.file_size, "mime_type": document.mime_type, "processing_status": document.processing_status, "processing_error": document.processing_error, "extracted_text": document.extracted_text, "document_metadata": document.document_metadata, "source_system": document.source_system, "created_at": document.created_at.isoformat(), "updated_at": document.updated_at.isoformat(), "tags": [{"id": str(tag.id), "name": tag.name} for tag in document.tags], "versions": [ { "id": str(version.id), "version_number": version.version_number, "filename": version.filename, "created_at": version.created_at.isoformat() } for version in document.versions ] } except HTTPException: raise except Exception as e: logger.error(f"Error getting document {document_id}: {str(e)}") raise HTTPException(status_code=500, detail="Failed to get document") @router.delete("/{document_id}") async def delete_document( document_id: str, db: Session = Depends(get_db), current_user: User = Depends(get_current_user), current_tenant: Tenant = Depends(get_current_tenant) ): """ Delete a document and its associated files. """ try: document = db.query(Document).filter( and_( Document.id == document_id, Document.organization_id == current_tenant.id ) ).first() if not document: raise HTTPException(status_code=404, detail="Document not found") # Delete file from storage if document.file_path: try: storage_service = StorageService(current_tenant) await storage_service.delete_file(document.file_path) except Exception as e: logger.warning(f"Could not delete file {document.file_path}: {str(e)}") # Delete from database (cascade will handle related records) db.delete(document) db.commit() return {"message": "Document deleted successfully"} except HTTPException: raise except Exception as e: logger.error(f"Error deleting document {document_id}: {str(e)}") raise HTTPException(status_code=500, detail="Failed to delete document") @router.post("/{document_id}/tags") async def add_document_tags( document_id: str, tag_names: List[str], db: Session = Depends(get_db), current_user: User = Depends(get_current_user), current_tenant: Tenant = Depends(get_current_tenant) ): """ Add tags to a document. """ try: document = db.query(Document).filter( and_( Document.id == document_id, Document.organization_id == current_tenant.id ) ).first() if not document: raise HTTPException(status_code=404, detail="Document not found") await _process_document_tags(db, document, tag_names, current_tenant) return {"message": "Tags added successfully"} except HTTPException: raise except Exception as e: logger.error(f"Error adding tags to document {document_id}: {str(e)}") raise HTTPException(status_code=500, detail="Failed to add tags") @router.post("/folders") async def create_folder( folder_path: str = Form(...), description: Optional[str] = Form(None), db: Session = Depends(get_db), current_user: User = Depends(get_current_user), current_tenant: Tenant = Depends(get_current_tenant) ): """ Create a new folder in the document hierarchy. """ try: organization_service = DocumentOrganizationService(current_tenant) folder = await organization_service.create_folder_structure(db, folder_path, description) return { "message": "Folder created successfully", "folder": folder } except Exception as e: logger.error(f"Error creating folder {folder_path}: {str(e)}") raise HTTPException(status_code=500, detail="Failed to create folder") @router.get("/folders") async def get_folder_structure( root_path: str = Query(""), db: Session = Depends(get_db), current_user: User = Depends(get_current_user), current_tenant: Tenant = Depends(get_current_tenant) ): """ Get the complete folder structure. """ try: organization_service = DocumentOrganizationService(current_tenant) structure = await organization_service.get_folder_structure(db, root_path) return structure except Exception as e: logger.error(f"Error getting folder structure: {str(e)}") raise HTTPException(status_code=500, detail="Failed to get folder structure") @router.get("/folders/{folder_path:path}/documents") async def get_documents_in_folder( folder_path: str, skip: int = Query(0, ge=0), limit: int = Query(100, ge=1, le=1000), db: Session = Depends(get_db), current_user: User = Depends(get_current_user), current_tenant: Tenant = Depends(get_current_tenant) ): """ Get all documents in a specific folder. """ try: organization_service = DocumentOrganizationService(current_tenant) documents = await organization_service.get_documents_in_folder(db, folder_path, skip, limit) return documents except Exception as e: logger.error(f"Error getting documents in folder {folder_path}: {str(e)}") raise HTTPException(status_code=500, detail="Failed to get documents in folder") @router.put("/{document_id}/move") async def move_document_to_folder( document_id: str, folder_path: str = Form(...), db: Session = Depends(get_db), current_user: User = Depends(get_current_user), current_tenant: Tenant = Depends(get_current_tenant) ): """ Move a document to a specific folder. """ try: organization_service = DocumentOrganizationService(current_tenant) success = await organization_service.move_document_to_folder(db, document_id, folder_path) if success: return {"message": "Document moved successfully"} else: raise HTTPException(status_code=404, detail="Document not found") except HTTPException: raise except Exception as e: logger.error(f"Error moving document {document_id} to folder {folder_path}: {str(e)}") raise HTTPException(status_code=500, detail="Failed to move document") @router.get("/tags/popular") async def get_popular_tags( limit: int = Query(20, ge=1, le=100), db: Session = Depends(get_db), current_user: User = Depends(get_current_user), current_tenant: Tenant = Depends(get_current_tenant) ): """ Get the most popular tags. """ try: organization_service = DocumentOrganizationService(current_tenant) tags = await organization_service.get_popular_tags(db, limit) return {"tags": tags} except Exception as e: logger.error(f"Error getting popular tags: {str(e)}") raise HTTPException(status_code=500, detail="Failed to get popular tags") @router.get("/tags/{tag_names}") async def get_documents_by_tags( tag_names: str, skip: int = Query(0, ge=0), limit: int = Query(100, ge=1, le=1000), db: Session = Depends(get_db), current_user: User = Depends(get_current_user), current_tenant: Tenant = Depends(get_current_tenant) ): """ Get documents that have specific tags. """ try: tag_list = [tag.strip() for tag in tag_names.split(",") if tag.strip()] organization_service = DocumentOrganizationService(current_tenant) documents = await organization_service.get_documents_by_tags(db, tag_list, skip, limit) return documents except Exception as e: logger.error(f"Error getting documents by tags {tag_names}: {str(e)}") raise HTTPException(status_code=500, detail="Failed to get documents by tags") async def _process_document_background(document_id: str, file_path: str, tenant_id: str): """ Background task to process a document. """ try: from app.core.database import SessionLocal db = SessionLocal() # Get document and tenant document = db.query(Document).filter(Document.id == document_id).first() tenant = db.query(Tenant).filter(Tenant.id == tenant_id).first() if not document or not tenant: logger.error(f"Document {document_id} or tenant {tenant_id} not found") return # Update status to processing document.processing_status = "processing" db.commit() # Get file from storage storage_service = StorageService(tenant) file_content = await storage_service.download_file(document.file_path) # Create temporary file for processing temp_file_path = Path(f"/tmp/{document.id}_{document.filename}") with open(temp_file_path, "wb") as f: f.write(file_content) # Process document processor = DocumentProcessor(tenant) result = await processor.process_document(temp_file_path, document) # Clean up temporary file temp_file_path.unlink(missing_ok=True) # Update document with extracted content document.extracted_text = "\n".join(result.get('text_content', [])) document.document_metadata = { 'tables': result.get('tables', []), 'charts': result.get('charts', []), 'images': result.get('images', []), 'structure': result.get('structure', {}), 'pages': result.get('metadata', {}).get('pages', 0), 'processing_timestamp': datetime.utcnow().isoformat() } # Auto-categorize and extract metadata organization_service = DocumentOrganizationService(tenant) categories = await organization_service.auto_categorize_document(db, document) additional_metadata = await organization_service.extract_metadata(document) # Update document metadata with additional information document.document_metadata.update(additional_metadata) document.document_metadata['auto_categories'] = categories # Add auto-generated tags based on categories if categories: await organization_service.add_tags_to_document(db, str(document.id), categories) document.processing_status = "completed" # Generate embeddings and store in vector database vector_service = VectorService(tenant) await vector_service.index_document(document, result) db.commit() logger.info(f"Successfully processed document {document_id}") except Exception as e: logger.error(f"Error processing document {document_id}: {str(e)}") # Update document status to failed try: document.processing_status = "failed" document.processing_error = str(e) db.commit() except: pass finally: db.close() async def _process_document_tags(db: Session, document: Document, tag_names: List[str], tenant: Tenant): """ Process and add tags to a document. """ for tag_name in tag_names: # Find or create tag tag = db.query(DocumentTag).filter( and_( DocumentTag.name == tag_name, # In a real implementation, you'd have tenant_id in DocumentTag ) ).first() if not tag: tag = DocumentTag( id=uuid.uuid4(), name=tag_name, description=f"Auto-generated tag: {tag_name}" ) db.add(tag) db.commit() db.refresh(tag) # Add tag to document if not already present if tag not in document.tags: document.tags.append(tag) db.commit()