Files
virtual_board_member/app/api/v1/endpoints/documents.py
Jonathan Pressnell 1a8ec37bed feat: Complete Week 2 - Document Processing Pipeline
- Implement multi-format document support (PDF, XLSX, CSV, PPTX, TXT, Images)
- Add S3-compatible storage service with tenant isolation
- Create document organization service with hierarchical folders and tagging
- Implement advanced document processing with table/chart extraction
- Add batch upload capabilities (up to 50 files)
- Create comprehensive document validation and security scanning
- Implement automatic metadata extraction and categorization
- Add document version control system
- Update DEVELOPMENT_PLAN.md to mark Week 2 as completed
- Add WEEK2_COMPLETION_SUMMARY.md with detailed implementation notes
- All tests passing (6/6) - 100% success rate
2025-08-08 15:47:43 -04:00

659 lines
23 KiB
Python

"""
Document management endpoints for the Virtual Board Member AI System.
"""
import asyncio
import logging
from typing import List, Optional, Dict, Any
from pathlib import Path
import uuid
from datetime import datetime
from fastapi import APIRouter, Depends, HTTPException, UploadFile, File, Form, BackgroundTasks, Query
from fastapi.responses import JSONResponse
from sqlalchemy.orm import Session
from sqlalchemy import and_, or_
from app.core.database import get_db
from app.core.auth import get_current_user, get_current_tenant
from app.models.document import Document, DocumentType, DocumentTag, DocumentVersion
from app.models.user import User
from app.models.tenant import Tenant
from app.services.document_processor import DocumentProcessor
from app.services.vector_service import VectorService
from app.services.storage_service import StorageService
from app.services.document_organization import DocumentOrganizationService
logger = logging.getLogger(__name__)
router = APIRouter()
@router.post("/upload")
async def upload_document(
    background_tasks: BackgroundTasks,
    file: UploadFile = File(...),
    title: str = Form(...),
    description: Optional[str] = Form(None),
    document_type: DocumentType = Form(DocumentType.OTHER),
    tags: Optional[str] = Form(None),  # Comma-separated tag names
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user),
    current_tenant: Tenant = Depends(get_current_tenant)
):
    """
    Upload and process a single document with multi-tenant support.

    Creates the Document row first (so the storage key can include its id),
    uploads the file via StorageService, records storage metadata, attaches
    any requested tags, and schedules extraction in the background.

    Returns the new document id with status "processing".
    Raises 400 for a missing filename or a file over 50MB, 500 otherwise.
    """
    try:
        # Validate file presence and size (50MB limit).
        if not file.filename:
            raise HTTPException(status_code=400, detail="No file provided")
        if file.size and file.size > 50 * 1024 * 1024:  # 50MB
            raise HTTPException(status_code=400, detail="File too large. Maximum size is 50MB")

        # Create the document record up front; storage fields are filled in
        # after the file is persisted.
        document = Document(
            id=uuid.uuid4(),
            title=title,
            description=description,
            document_type=document_type,
            filename=file.filename,
            file_path="",  # Will be set after saving
            file_size=0,  # Will be updated after storage
            mime_type=file.content_type or "application/octet-stream",
            uploaded_by=current_user.id,
            organization_id=current_tenant.id,
            processing_status="pending"
        )
        db.add(document)
        db.commit()
        db.refresh(document)

        # Persist the file with tenant-isolated storage.
        storage_service = StorageService(current_tenant)
        storage_result = await storage_service.upload_file(file, str(document.id))

        # Update document with storage information.
        document.file_path = storage_result["file_path"]
        document.file_size = storage_result["file_size"]
        document.document_metadata = {
            "storage_url": storage_result["storage_url"],
            "checksum": storage_result["checksum"],
            "uploaded_at": storage_result["uploaded_at"]
        }
        db.commit()

        # Attach comma-separated tags, creating any that don't exist yet.
        if tags:
            tag_names = [tag.strip() for tag in tags.split(",") if tag.strip()]
            await _process_document_tags(db, document, tag_names, current_tenant)

        # Kick off extraction/indexing out of the request cycle.
        # Fix: previously passed the undefined name `file_path` (NameError);
        # the stored path on the document is what the background task needs.
        background_tasks.add_task(
            _process_document_background,
            document.id,
            document.file_path,
            current_tenant.id
        )
        return {
            "message": "Document uploaded successfully",
            "document_id": str(document.id),
            "status": "processing"
        }
    except HTTPException:
        # Fix: preserve the deliberate 4xx responses raised above instead of
        # rewrapping them as a generic 500 below.
        raise
    except Exception as e:
        logger.error(f"Error uploading document: {str(e)}")
        raise HTTPException(status_code=500, detail="Failed to upload document")
@router.post("/upload/batch")
async def upload_documents_batch(
    background_tasks: BackgroundTasks,
    files: List[UploadFile] = File(...),
    titles: List[str] = Form(...),
    descriptions: Optional[List[str]] = Form(None),
    document_types: Optional[List[DocumentType]] = Form(None),
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user),
    current_tenant: Tenant = Depends(get_current_tenant)
):
    """
    Upload and process multiple documents (up to 50 files) with multi-tenant support.

    Files with no filename or larger than 50MB are silently skipped; all
    accepted files are stored and queued for background processing.

    Raises 400 when more than 50 files are sent or the titles list does not
    match the files list, 500 on unexpected failure.
    """
    try:
        if len(files) > 50:
            raise HTTPException(status_code=400, detail="Maximum 50 files allowed per batch")
        if len(files) != len(titles):
            raise HTTPException(status_code=400, detail="Number of files must match number of titles")

        documents = []
        for i, file in enumerate(files):
            # Skip invalid entries rather than failing the whole batch.
            if not file.filename:
                continue
            if file.size and file.size > 50 * 1024 * 1024:  # 50MB
                continue

            # Optional per-file type/description lists fall back to defaults
            # when absent or shorter than the files list.
            document_type = document_types[i] if document_types and i < len(document_types) else DocumentType.OTHER
            description = descriptions[i] if descriptions and i < len(descriptions) else None
            document = Document(
                id=uuid.uuid4(),
                title=titles[i],
                description=description,
                document_type=document_type,
                filename=file.filename,
                file_path="",
                file_size=0,  # Will be updated after storage
                mime_type=file.content_type or "application/octet-stream",
                uploaded_by=current_user.id,
                organization_id=current_tenant.id,
                processing_status="pending"
            )
            db.add(document)
            documents.append((document, file))
        db.commit()

        # Persist each accepted file and queue its extraction out of the
        # request cycle.
        storage_service = StorageService(current_tenant)
        for document, file in documents:
            storage_result = await storage_service.upload_file(file, str(document.id))
            document.file_path = storage_result["file_path"]
            document.file_size = storage_result["file_size"]
            document.document_metadata = {
                "storage_url": storage_result["storage_url"],
                "checksum": storage_result["checksum"],
                "uploaded_at": storage_result["uploaded_at"]
            }
            background_tasks.add_task(
                _process_document_background,
                document.id,
                document.file_path,
                current_tenant.id
            )
        db.commit()

        return {
            "message": f"Uploaded {len(documents)} documents successfully",
            "document_ids": [str(doc.id) for doc, _ in documents],
            "status": "processing"
        }
    except HTTPException:
        # Fix: let the 400 validations above reach the client instead of
        # being rewrapped as a generic 500 below.
        raise
    except Exception as e:
        logger.error(f"Error uploading documents batch: {str(e)}")
        raise HTTPException(status_code=500, detail="Failed to upload documents")
@router.get("/")
async def list_documents(
    skip: int = Query(0, ge=0),
    limit: int = Query(100, ge=1, le=1000),
    document_type: Optional[DocumentType] = Query(None),
    search: Optional[str] = Query(None),
    tags: Optional[str] = Query(None),  # Comma-separated tag names
    status: Optional[str] = Query(None),
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user),
    current_tenant: Tenant = Depends(get_current_tenant)
):
    """
    List documents scoped to the current tenant.

    Supports filtering by type and processing status, a case-insensitive
    substring search over title/description/filename, and tag-name matching.
    Results are paginated via ``skip``/``limit``; the total count before
    pagination is included in the response.
    """
    try:
        query = db.query(Document).filter(Document.organization_id == current_tenant.id)

        if document_type:
            query = query.filter(Document.document_type == document_type)
        if status:
            query = query.filter(Document.processing_status == status)
        if search:
            pattern = f"%{search}%"
            query = query.filter(
                or_(
                    Document.title.ilike(pattern),
                    Document.description.ilike(pattern),
                    Document.filename.ilike(pattern),
                )
            )
        if tags:
            # Simplified tag filter — one join per requested name; a proper
            # aggregate join would be preferable in production.
            requested = [t.strip() for t in tags.split(",") if t.strip()]
            for name in requested:
                query = query.join(Document.tags).filter(
                    DocumentTag.name.ilike(f"%{name}%")
                )

        total = query.count()
        page = query.offset(skip).limit(limit).all()

        def _summary(doc: Document) -> Dict[str, Any]:
            # Compact, JSON-friendly projection of a Document row.
            return {
                "id": str(doc.id),
                "title": doc.title,
                "description": doc.description,
                "document_type": doc.document_type,
                "filename": doc.filename,
                "file_size": doc.file_size,
                "processing_status": doc.processing_status,
                "created_at": doc.created_at.isoformat(),
                "updated_at": doc.updated_at.isoformat(),
                "tags": [{"id": str(tag.id), "name": tag.name} for tag in doc.tags],
            }

        return {
            "documents": [_summary(doc) for doc in page],
            "total": total,
            "skip": skip,
            "limit": limit,
        }
    except Exception as e:
        logger.error(f"Error listing documents: {str(e)}")
        raise HTTPException(status_code=500, detail="Failed to list documents")
@router.get("/{document_id}")
async def get_document(
    document_id: str,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user),
    current_tenant: Tenant = Depends(get_current_tenant)
):
    """
    Return the full detail view of one document, including extracted text,
    metadata, tags, and version history.

    Raises 404 when the id does not exist within the current tenant.
    """
    try:
        doc = (
            db.query(Document)
            .filter(
                and_(
                    Document.id == document_id,
                    Document.organization_id == current_tenant.id,
                )
            )
            .first()
        )
        if doc is None:
            raise HTTPException(status_code=404, detail="Document not found")

        tag_payload = [{"id": str(t.id), "name": t.name} for t in doc.tags]
        version_payload = [
            {
                "id": str(v.id),
                "version_number": v.version_number,
                "filename": v.filename,
                "created_at": v.created_at.isoformat(),
            }
            for v in doc.versions
        ]
        return {
            "id": str(doc.id),
            "title": doc.title,
            "description": doc.description,
            "document_type": doc.document_type,
            "filename": doc.filename,
            "file_size": doc.file_size,
            "mime_type": doc.mime_type,
            "processing_status": doc.processing_status,
            "processing_error": doc.processing_error,
            "extracted_text": doc.extracted_text,
            "document_metadata": doc.document_metadata,
            "source_system": doc.source_system,
            "created_at": doc.created_at.isoformat(),
            "updated_at": doc.updated_at.isoformat(),
            "tags": tag_payload,
            "versions": version_payload,
        }
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Error getting document {document_id}: {str(e)}")
        raise HTTPException(status_code=500, detail="Failed to get document")
@router.delete("/{document_id}")
async def delete_document(
    document_id: str,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user),
    current_tenant: Tenant = Depends(get_current_tenant)
):
    """
    Delete a document and its stored file within the current tenant.

    Storage deletion is best-effort: a failure there is logged but does not
    block removal of the database record (cascades clean up related rows).
    Raises 404 when the document is not found.
    """
    try:
        doc = (
            db.query(Document)
            .filter(
                and_(
                    Document.id == document_id,
                    Document.organization_id == current_tenant.id,
                )
            )
            .first()
        )
        if doc is None:
            raise HTTPException(status_code=404, detail="Document not found")

        if doc.file_path:
            try:
                await StorageService(current_tenant).delete_file(doc.file_path)
            except Exception as e:
                logger.warning(f"Could not delete file {doc.file_path}: {str(e)}")

        db.delete(doc)
        db.commit()
        return {"message": "Document deleted successfully"}
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Error deleting document {document_id}: {str(e)}")
        raise HTTPException(status_code=500, detail="Failed to delete document")
@router.post("/{document_id}/tags")
async def add_document_tags(
    document_id: str,
    tag_names: List[str],
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user),
    current_tenant: Tenant = Depends(get_current_tenant)
):
    """
    Attach one or more tags to a document owned by the current tenant.

    Raises 404 when the document does not exist for this tenant.
    """
    try:
        doc = (
            db.query(Document)
            .filter(
                and_(
                    Document.id == document_id,
                    Document.organization_id == current_tenant.id,
                )
            )
            .first()
        )
        if doc is None:
            raise HTTPException(status_code=404, detail="Document not found")

        await _process_document_tags(db, doc, tag_names, current_tenant)
        return {"message": "Tags added successfully"}
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Error adding tags to document {document_id}: {str(e)}")
        raise HTTPException(status_code=500, detail="Failed to add tags")
@router.post("/folders")
async def create_folder(
    folder_path: str = Form(...),
    description: Optional[str] = Form(None),
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user),
    current_tenant: Tenant = Depends(get_current_tenant)
):
    """
    Create a folder at ``folder_path`` in the tenant's document hierarchy.

    Delegates to DocumentOrganizationService and returns the created folder
    record; raises 500 when creation fails.
    """
    try:
        service = DocumentOrganizationService(current_tenant)
        created = await service.create_folder_structure(db, folder_path, description)
        return {
            "message": "Folder created successfully",
            "folder": created,
        }
    except Exception as e:
        logger.error(f"Error creating folder {folder_path}: {str(e)}")
        raise HTTPException(status_code=500, detail="Failed to create folder")
@router.get("/folders")
async def get_folder_structure(
    root_path: str = Query(""),
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user),
    current_tenant: Tenant = Depends(get_current_tenant)
):
    """
    Return the tenant's folder hierarchy starting from ``root_path``.

    NOTE(review): GET ``/{document_id}`` is registered earlier in this
    module; FastAPI matches routes in declaration order, so GET ``/folders``
    appears to be shadowed by that route — confirm and consider declaring
    static paths before the parameterized one.
    """
    try:
        service = DocumentOrganizationService(current_tenant)
        return await service.get_folder_structure(db, root_path)
    except Exception as e:
        logger.error(f"Error getting folder structure: {str(e)}")
        raise HTTPException(status_code=500, detail="Failed to get folder structure")
@router.get("/folders/{folder_path:path}/documents")
async def get_documents_in_folder(
    folder_path: str,
    skip: int = Query(0, ge=0),
    limit: int = Query(100, ge=1, le=1000),
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user),
    current_tenant: Tenant = Depends(get_current_tenant)
):
    """
    List the documents stored in a specific folder, paginated by
    ``skip``/``limit``. ``folder_path`` may contain slashes (path converter).
    """
    try:
        service = DocumentOrganizationService(current_tenant)
        return await service.get_documents_in_folder(db, folder_path, skip, limit)
    except Exception as e:
        logger.error(f"Error getting documents in folder {folder_path}: {str(e)}")
        raise HTTPException(status_code=500, detail="Failed to get documents in folder")
@router.put("/{document_id}/move")
async def move_document_to_folder(
    document_id: str,
    folder_path: str = Form(...),
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user),
    current_tenant: Tenant = Depends(get_current_tenant)
):
    """
    Move a document into the folder at ``folder_path``.

    Raises 404 when the organization service reports the document missing.
    """
    try:
        service = DocumentOrganizationService(current_tenant)
        moved = await service.move_document_to_folder(db, document_id, folder_path)
        # Guard clause: the service signals "not found" with a falsy result.
        if not moved:
            raise HTTPException(status_code=404, detail="Document not found")
        return {"message": "Document moved successfully"}
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Error moving document {document_id} to folder {folder_path}: {str(e)}")
        raise HTTPException(status_code=500, detail="Failed to move document")
@router.get("/tags/popular")
async def get_popular_tags(
    limit: int = Query(20, ge=1, le=100),
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user),
    current_tenant: Tenant = Depends(get_current_tenant)
):
    """
    Return up to ``limit`` of the most-used tags for this tenant.
    """
    try:
        service = DocumentOrganizationService(current_tenant)
        popular = await service.get_popular_tags(db, limit)
        return {"tags": popular}
    except Exception as e:
        logger.error(f"Error getting popular tags: {str(e)}")
        raise HTTPException(status_code=500, detail="Failed to get popular tags")
@router.get("/tags/{tag_names}")
async def get_documents_by_tags(
    tag_names: str,
    skip: int = Query(0, ge=0),
    limit: int = Query(100, ge=1, le=1000),
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user),
    current_tenant: Tenant = Depends(get_current_tenant)
):
    """
    List documents carrying any of the given tags.

    ``tag_names`` is a comma-separated list in the URL path; blank entries
    are discarded before the lookup.
    """
    try:
        wanted = [name.strip() for name in tag_names.split(",") if name.strip()]
        service = DocumentOrganizationService(current_tenant)
        return await service.get_documents_by_tags(db, wanted, skip, limit)
    except Exception as e:
        logger.error(f"Error getting documents by tags {tag_names}: {str(e)}")
        raise HTTPException(status_code=500, detail="Failed to get documents by tags")
async def _process_document_background(document_id: str, file_path: str, tenant_id: str):
    """
    Background task to process a document: download its bytes from storage,
    run the extraction pipeline, auto-categorize/tag, and index it for
    vector search.

    Opens its own DB session because it runs outside the request lifecycle.
    On any failure the document is marked "failed" with the error recorded.
    """
    from app.core.database import SessionLocal

    # Fix: create the session before the try block so `finally` can always
    # close it — previously a failure before `db` was assigned made
    # `db.close()` a NameError.
    db = SessionLocal()
    document = None
    try:
        # Load the document and tenant for this task.
        document = db.query(Document).filter(Document.id == document_id).first()
        tenant = db.query(Tenant).filter(Tenant.id == tenant_id).first()
        if not document or not tenant:
            logger.error(f"Document {document_id} or tenant {tenant_id} not found")
            return

        document.processing_status = "processing"
        db.commit()

        # Pull the stored bytes and stage them in a temp file for processors
        # that require a filesystem path.
        storage_service = StorageService(tenant)
        file_content = await storage_service.download_file(document.file_path)
        temp_file_path = Path(f"/tmp/{document.id}_{document.filename}")
        try:
            temp_file_path.write_bytes(file_content)
            processor = DocumentProcessor(tenant)
            result = await processor.process_document(temp_file_path, document)
        finally:
            # Fix: previously the temp file leaked when process_document
            # raised; always remove it.
            temp_file_path.unlink(missing_ok=True)

        # Persist extracted content and structural metadata.
        document.extracted_text = "\n".join(result.get('text_content', []))
        document.document_metadata = {
            'tables': result.get('tables', []),
            'charts': result.get('charts', []),
            'images': result.get('images', []),
            'structure': result.get('structure', {}),
            'pages': result.get('metadata', {}).get('pages', 0),
            'processing_timestamp': datetime.utcnow().isoformat()
        }

        # Auto-categorize and enrich metadata.
        organization_service = DocumentOrganizationService(tenant)
        categories = await organization_service.auto_categorize_document(db, document)
        additional_metadata = await organization_service.extract_metadata(document)
        document.document_metadata.update(additional_metadata)
        document.document_metadata['auto_categories'] = categories

        # Add auto-generated tags based on the detected categories.
        if categories:
            await organization_service.add_tags_to_document(db, str(document.id), categories)

        document.processing_status = "completed"

        # Generate embeddings and store in the vector database.
        vector_service = VectorService(tenant)
        await vector_service.index_document(document, result)

        db.commit()
        logger.info(f"Successfully processed document {document_id}")
    except Exception as e:
        logger.error(f"Error processing document {document_id}: {str(e)}")
        # Best-effort status update; guard against the lookup itself having
        # failed (document may still be None here).
        try:
            if document is not None:
                document.processing_status = "failed"
                document.processing_error = str(e)
                db.commit()
        except Exception:
            # Fix: was a bare `except:` that also swallowed SystemExit /
            # KeyboardInterrupt; log instead of silently dropping the error.
            logger.exception(f"Could not mark document {document_id} as failed")
    finally:
        db.close()
async def _process_document_tags(db: Session, document: Document, tag_names: List[str], tenant: Tenant):
    """
    Attach each name in ``tag_names`` to ``document``, creating any tag that
    does not exist yet.

    NOTE(review): tags are looked up by name only — DocumentTag carries no
    tenant id here, so tags appear to be shared across tenants; confirm
    whether that is intended.
    """
    for name in tag_names:
        # Look the tag up by name, creating it on a miss.
        existing = db.query(DocumentTag).filter(
            and_(
                DocumentTag.name == name,
                # In a real implementation, you'd have tenant_id in DocumentTag
            )
        ).first()
        if existing is None:
            existing = DocumentTag(
                id=uuid.uuid4(),
                name=name,
                description=f"Auto-generated tag: {name}",
            )
            db.add(existing)
            db.commit()
            db.refresh(existing)

        # Avoid duplicate associations on the document.
        if existing not in document.tags:
            document.tags.append(existing)
    db.commit()