From 95c92946de17fa0a31abf799529851e35d42fca0 Mon Sep 17 00:00:00 2001 From: Jon Date: Fri, 1 Aug 2025 11:13:03 -0400 Subject: [PATCH] fix(core): Overhaul and fix the end-to-end document processing pipeline --- backend/package.json | 2 +- backend/src/controllers/documentController.ts | 514 ++++++++++------ backend/src/index.ts | 127 +--- backend/src/middleware/errorHandler.ts | 19 + backend/src/middleware/upload.ts | 33 +- backend/src/models/VectorDatabaseModel.ts | 582 +----------------- backend/src/models/types.ts | 1 + backend/src/routes/documents.ts | 8 +- backend/src/services/fileStorageService.ts | 31 + cors.json | 23 + firebase.json | 6 + frontend/firebase.json | 4 + frontend/src/App.tsx | 13 - frontend/src/components/DocumentUpload.tsx | 60 +- frontend/src/services/documentService.ts | 142 ++++- storage.cors.json | 23 + storage.rules | 8 + 17 files changed, 695 insertions(+), 901 deletions(-) create mode 100644 cors.json create mode 100644 firebase.json create mode 100644 storage.cors.json create mode 100644 storage.rules diff --git a/backend/package.json b/backend/package.json index 5f56476..f05afd7 100644 --- a/backend/package.json +++ b/backend/package.json @@ -2,7 +2,7 @@ "name": "cim-processor-backend", "version": "1.0.0", "description": "Backend API for CIM Document Processor", - "main": "index.js", + "main": "dist/index.js", "scripts": { "dev": "ts-node-dev --respawn --transpile-only --max-old-space-size=8192 --expose-gc src/index.ts", "build": "tsc && node src/scripts/prepare-dist.js && cp .puppeteerrc.cjs dist/", diff --git a/backend/src/controllers/documentController.ts b/backend/src/controllers/documentController.ts index bf36b3e..093702d 100644 --- a/backend/src/controllers/documentController.ts +++ b/backend/src/controllers/documentController.ts @@ -7,10 +7,11 @@ import { uploadProgressService } from '../services/uploadProgressService'; import { uploadMonitoringService } from '../services/uploadMonitoringService'; export const documentController = { - async uploadDocument(req: Request, res: Response): Promise { - const startTime = Date.now(); - const structuredLogger = new StructuredLogger(req.correlationId); - + async getUploadUrl(req: Request, res: Response): Promise { + console.log('🎯🎯🎯 GET UPLOAD URL ENDPOINT HIT!'); + console.log('🎯 Method:', req.method); + console.log('🎯 URL:', req.url); + console.log('🎯 Headers:', JSON.stringify(req.headers, null, 2)); try { const userId = req.user?.uid; if (!userId) { @@ -21,206 +22,369 @@ export const documentController = { return; } - // Check if file was uploaded - if (!req.file) { - res.status(400).json({ - error: 'No file uploaded', - correlationId: req.correlationId + const { fileName, fileSize, contentType } = req.body; + + if (!fileName || !fileSize || !contentType) { + res.status(400).json({ + error: 'Missing required fields: fileName, fileSize, contentType', + correlationId: req.correlationId }); return; } - const file = req.file; - - // Track upload start - const uploadEventData: any = { - userId, - fileInfo: { - originalName: file.originalname, - size: file.size, - mimetype: file.mimetype, - }, - status: 'started', - stage: 'upload_initiated', - }; - - if (req.correlationId) { - uploadEventData.correlationId = req.correlationId; - } - - uploadMonitoringService.trackUploadEvent(uploadEventData); - - structuredLogger.uploadStart({ - originalName: file.originalname, - size: file.size, - mimetype: file.mimetype, - }, userId); - - // Always use optimized agentic RAG processing - no strategy selection needed - const processingStrategy = 'optimized_agentic_rag'; - - // Store file and get file path - const storageResult = await fileStorageService.storeFile(file, userId); - - if (!storageResult.success || !storageResult.fileInfo) { - const processingTime = Date.now() - startTime; - - // Track upload failure - const failureEventData: any = { - userId, - fileInfo: { - originalName: file.originalname, - size: file.size, - mimetype: file.mimetype, - }, - status: 'failed', - stage: 'file_storage', - error: { - message: storageResult.error || 'Failed to store file', - type: 'storage_error', - code: 'STORAGE_ERROR', - }, - processingTime, - }; - - if (req.correlationId) { - failureEventData.correlationId = req.correlationId; - } - - uploadMonitoringService.trackUploadEvent(failureEventData); - - structuredLogger.uploadError( - new Error(storageResult.error || 'Failed to store file'), - { - originalName: file.originalname, - size: file.size, - mimetype: file.mimetype, - }, - userId, - 'file_storage' - ); - - res.status(500).json({ - error: 'Failed to store file', - correlationId: req.correlationId + // Validate file type + if (contentType !== 'application/pdf') { + res.status(400).json({ + error: 'Only PDF files are supported', + correlationId: req.correlationId }); return; } - - // Create document record + + // Validate file size (max 50MB) + if (fileSize > 50 * 1024 * 1024) { + res.status(400).json({ + error: 'File size exceeds 50MB limit', + correlationId: req.correlationId + }); + return; + } + + // Generate unique file path + const timestamp = Date.now(); + const sanitizedFileName = fileName.replace(/[^a-zA-Z0-9.-]/g, '_'); + const filePath = `uploads/${userId}/${timestamp}_${sanitizedFileName}`; + + // Create document record first const document = await DocumentModel.create({ user_id: userId, - original_file_name: file.originalname, - file_path: storageResult.fileInfo.path, - file_size: file.size, - status: 'uploaded' + original_file_name: fileName, + file_path: filePath, + file_size: fileSize, + status: 'uploading' }); - // Always auto-process with optimized agentic RAG - try { - const jobId = await jobQueueService.addJob( - 'document_processing', - { - documentId: document.id, - userId: userId, - options: { strategy: processingStrategy } - }, - 0 // Normal priority - ); - logger.info('Document processing job queued with optimized agentic RAG', { - documentId: document.id, - jobId, - strategy: processingStrategy - }); - - // Update status to indicate it's queued for processing - await DocumentModel.updateById(document.id, { status: 'extracting_text' }); - } catch (error) { - logger.error('Failed to queue document processing job', { error, documentId: document.id }); - } + // Generate signed upload URL + const { fileStorageService } = await import('../services/fileStorageService'); + const uploadUrl = await fileStorageService.generateSignedUploadUrl(filePath, contentType); - // Track upload success - const processingTime = Date.now() - startTime; - const successEventData: any = { - userId, - fileInfo: { - originalName: file.originalname, - size: file.size, - mimetype: file.mimetype, - }, - status: 'success', - stage: 'upload_completed', - processingTime, - }; + console.log('✅ Generated upload URL for document:', document.id); - if (req.correlationId) { - successEventData.correlationId = req.correlationId; - } - - uploadMonitoringService.trackUploadEvent(successEventData); - - structuredLogger.uploadSuccess({ - originalName: file.originalname, - size: file.size, - mimetype: file.mimetype, - }, userId, processingTime); - - // Return document info - res.status(201).json({ - id: document.id, - name: document.original_file_name, - originalName: document.original_file_name, - status: 'extracting_text', - uploadedAt: document.created_at, - uploadedBy: userId, - fileSize: document.file_size, - processingStrategy: processingStrategy, + res.status(200).json({ + documentId: document.id, + uploadUrl: uploadUrl, + filePath: filePath, correlationId: req.correlationId || undefined }); } catch (error) { - const processingTime = Date.now() - startTime; + console.log('❌ Get upload URL error:', error); + logger.error('Get upload URL failed', { + error, + correlationId: req.correlationId + }); - // Track upload failure - const errorEventData: any = { - userId: req.user?.uid || 'unknown', - fileInfo: { - originalName: req.file?.originalname || 'unknown', - size: req.file?.size || 0, - mimetype: req.file?.mimetype || 'unknown', - }, - status: 'failed', - stage: 'upload_error', - error: { - message: error instanceof Error ? error.message : 'Unknown error', - type: 'upload_error', - }, - processingTime, - }; + res.status(500).json({ + error: 'Failed to generate upload URL', + message: error instanceof Error ? error.message : 'Unknown error', + correlationId: req.correlationId || undefined + }); + } + }, - if (req.correlationId) { - errorEventData.correlationId = req.correlationId; + async confirmUpload(req: Request, res: Response): Promise { + try { + const userId = req.user?.uid; + if (!userId) { + res.status(401).json({ + error: 'User not authenticated', + correlationId: req.correlationId + }); + return; } - uploadMonitoringService.trackUploadEvent(errorEventData); + const { id: documentId } = req.params; + if (!documentId) { + res.status(400).json({ + error: 'Document ID is required', + correlationId: req.correlationId + }); + return; + } - structuredLogger.uploadError( - error, - { - originalName: req.file?.originalname || 'unknown', - size: req.file?.size || 0, - mimetype: req.file?.mimetype || 'unknown', - }, - req.user?.uid || 'unknown', - 'upload_error' + // Get document record + const document = await DocumentModel.findById(documentId); + if (!document) { + res.status(404).json({ + error: 'Document not found', + correlationId: req.correlationId + }); + return; + } + + // Verify user owns document + if (document.user_id !== userId) { + res.status(403).json({ + error: 'Access denied', + correlationId: req.correlationId + }); + return; + } + + console.log('🔄 Starting Document AI processing for:', documentId); + + // Update status to processing + await DocumentModel.updateById(documentId, { + status: 'processing_llm' + }); + + // Acknowledge the request immediately + res.status(202).json({ + message: 'Upload confirmed, processing has started.', + documentId: documentId, + status: 'processing' + }); + + // Process in the background + (async () => { + try { + // Download file from Firebase Storage for Document AI processing + const { fileStorageService } = await import('../services/fileStorageService'); + + let fileBuffer: Buffer | null = null; + for (let i = 0; i < 3; i++) { + await new Promise(resolve => setTimeout(resolve, 2000)); // 2 second delay + fileBuffer = await fileStorageService.getFile(document.file_path); + if (fileBuffer) { + break; + } + } + + if (!fileBuffer) { + await DocumentModel.updateById(documentId, { + status: 'failed', + error_message: 'Failed to download uploaded file' + }); + return; + } + + // Process with Unified Document Processor + const { unifiedDocumentProcessor } = await import('../services/unifiedDocumentProcessor'); + + const result = await unifiedDocumentProcessor.processDocument( + documentId, + userId, + '', // Text is not needed for this strategy + { strategy: 'optimized_agentic_rag' } + ); + + if (result.success) { + // Update document with results + await DocumentModel.updateById(documentId, { + status: 'completed', + generated_summary: result.summary, + processing_completed_at: new Date() + }); + + // 🗑️ DELETE PDF after successful processing + try { + await fileStorageService.deleteFile(document.file_path); + console.log('✅ PDF deleted after successful processing:', document.file_path); + } catch (deleteError) { + console.log('⚠️ Failed to delete PDF file:', deleteError); + logger.warn('Failed to delete PDF after processing', { + filePath: document.file_path, + documentId, + error: deleteError + }); + } + + console.log('✅ Document AI processing completed successfully'); + } else { + await DocumentModel.updateById(documentId, { + status: 'failed', + error_message: result.error + }); + + // Also delete PDF on processing failure to avoid storage costs + try { + await fileStorageService.deleteFile(document.file_path); + console.log('🗑️ PDF deleted after processing failure'); + } catch (deleteError) { + console.log('⚠️ Failed to delete PDF file after error:', deleteError); + } + } + } catch (error) { + console.log('❌ Background processing error:', error); + logger.error('Background processing failed', { + error, + documentId + }); + await DocumentModel.updateById(documentId, { + status: 'failed', + error_message: 'Background processing failed' + }); + } + })(); + + } catch (error) { + console.log('❌ Confirm upload error:', error); + logger.error('Confirm upload failed', { + error, + correlationId: req.correlationId + }); + + res.status(500).json({ + error: 'Upload confirmation failed', + message: error instanceof Error ? error.message : 'Unknown error', + correlationId: req.correlationId || undefined + }); + } + }, + + async uploadDocument(req: Request, res: Response): Promise { + const startTime = Date.now(); + + // 🔍 COMPREHENSIVE DEBUG: Log everything about the request + console.log('🚀 ========================='); + console.log('🚀 DOCUMENT AI UPLOAD STARTED'); + console.log('🚀 Method:', req.method); + console.log('🚀 URL:', req.url); + console.log('🚀 Content-Type:', req.get('Content-Type')); + console.log('🚀 Content-Length:', req.get('Content-Length')); + console.log('🚀 Authorization header present:', !!req.get('Authorization')); + console.log('🚀 User from token:', req.user?.uid || 'NOT_FOUND'); + + // Debug body in detail + console.log('🚀 Has body:', !!req.body); + console.log('🚀 Body type:', typeof req.body); + console.log('🚀 Body constructor:', req.body?.constructor?.name); + console.log('🚀 Body length:', req.body?.length || 0); + console.log('🚀 Is Buffer?:', Buffer.isBuffer(req.body)); + + // Debug all headers + console.log('🚀 All headers:', JSON.stringify(req.headers, null, 2)); + + // Debug request properties + console.log('🚀 Request readable:', req.readable); + console.log('🚀 Request complete:', req.complete); + + // If body exists, show first few bytes + if (req.body && req.body.length > 0) { + const preview = req.body.slice(0, 100).toString('hex'); + console.log('🚀 Body preview (hex):', preview); + + // Try to see if it contains multipart boundary + const bodyStr = req.body.toString('utf8', 0, Math.min(500, req.body.length)); + console.log('🚀 Body preview (string):', bodyStr.substring(0, 200)); + } + + console.log('🚀 ========================='); + + try { + const userId = req.user?.uid; + if (!userId) { + console.log('❌ Authentication failed - no userId'); + res.status(401).json({ + error: 'User not authenticated', + correlationId: req.correlationId + }); + return; + } + + console.log('✅ Authentication successful for user:', userId); + + // Get raw body buffer for Document AI processing + const rawBody = req.body; + if (!rawBody || rawBody.length === 0) { + res.status(400).json({ + error: 'No file data received', + correlationId: req.correlationId, + debug: { + method: req.method, + contentType: req.get('Content-Type'), + contentLength: req.get('Content-Length'), + hasRawBody: !!rawBody, + rawBodySize: rawBody?.length || 0, + bodyType: typeof rawBody + } + }); + return; + } + + console.log('✅ Found raw body buffer:', rawBody.length, 'bytes'); + + // Create document record first + const document = await DocumentModel.create({ + user_id: userId, + original_file_name: 'uploaded-document.pdf', + file_path: '', + file_size: rawBody.length, + status: 'processing_llm' + }); + + console.log('✅ Document record created:', document.id); + + // Process with Document AI directly + const { DocumentAiGenkitProcessor } = await import('../services/documentAiGenkitProcessor'); + const processor = new DocumentAiGenkitProcessor(); + + console.log('✅ Starting Document AI processing...'); + const result = await processor.processDocument( + document.id, + userId, + rawBody, + 'uploaded-document.pdf', + 'application/pdf' ); + if (result.success) { + await DocumentModel.updateById(document.id, { + status: 'completed', + generated_summary: result.content, + processing_completed_at: new Date() + }); + + console.log('✅ Document AI processing completed successfully'); + + res.status(201).json({ + id: document.id, + name: 'uploaded-document.pdf', + originalName: 'uploaded-document.pdf', + status: 'completed', + uploadedAt: document.created_at, + uploadedBy: userId, + fileSize: rawBody.length, + summary: result.content, + correlationId: req.correlationId || undefined + }); + return; + } else { + console.log('❌ Document AI processing failed:', result.error); + await DocumentModel.updateById(document.id, { + status: 'failed', + error_message: result.error + }); + + res.status(500).json({ + error: 'Document processing failed', + message: result.error, + correlationId: req.correlationId || undefined + }); + return; + } + + } catch (error) { + console.log('❌ Upload error:', error); + logger.error('Upload document failed', { error, correlationId: req.correlationId }); + res.status(500).json({ error: 'Upload failed', + message: error instanceof Error ? error.message : 'Unknown error', correlationId: req.correlationId || undefined }); } @@ -552,4 +716,4 @@ export const documentController = { throw new Error('Failed to get document text'); } } -}; \ No newline at end of file +}; \ No newline at end of file diff --git a/backend/src/index.ts b/backend/src/index.ts index 410b752..0fdf3a3 100644 --- a/backend/src/index.ts +++ b/backend/src/index.ts @@ -18,20 +18,17 @@ import { notFoundHandler } from './middleware/notFoundHandler'; const app = express(); -// Enable trust proxy to ensure Express works correctly behind the proxy +// Add this middleware to log all incoming requests +app.use((req, res, next) => { + console.log(`Incoming request: ${req.method} ${req.path}`); + next(); +}); + +// Enable trust proxy to ensure Express works correctly behind a proxy app.set('trust proxy', 1); // Security middleware -app.use(helmet({ - contentSecurityPolicy: { - directives: { - defaultSrc: ["'self'"], - styleSrc: ["'self'", "'unsafe-inline'"], - scriptSrc: ["'self'"], - imgSrc: ["'self'", "data:", "https:"], - }, - }, -})); +app.use(helmet()); // CORS configuration const allowedOrigins = [ @@ -43,13 +40,10 @@ const allowedOrigins = [ app.use(cors({ origin: function (origin, callback) { - console.log('🌐 CORS request from origin:', origin); - if (!origin || allowedOrigins.indexOf(origin) !== -1) { - console.log('✅ CORS allowed for origin:', origin); callback(null, true); } else { - console.log('❌ CORS blocked origin:', origin); + logger.warn(`CORS blocked for origin: ${origin}`); callback(new Error('Not allowed by CORS')); } }, @@ -62,7 +56,7 @@ app.use(cors({ // Rate limiting const limiter = rateLimit({ windowMs: 15 * 60 * 1000, // 15 minutes - max: 1000, // limit each IP to 1000 requests per windowMs (increased for testing) + max: 1000, message: { error: 'Too many requests from this IP, please try again later.', }, @@ -72,27 +66,6 @@ const limiter = rateLimit({ app.use(limiter); -// Body parsing middleware - only for non-multipart requests -app.use((req, res, next) => { - if (req.headers['content-type'] && req.headers['content-type'].includes('multipart/form-data')) { - // Skip body parsing for multipart requests - let multer handle it - next(); - } else { - // Parse JSON and URL-encoded bodies for other requests - express.json({ limit: '10mb' })(req, res, next); - } -}); - -app.use((req, res, next) => { - if (req.headers['content-type'] && req.headers['content-type'].includes('multipart/form-data')) { - // Skip body parsing for multipart requests - let multer handle it - next(); - } else { - // Parse URL-encoded bodies for other requests - express.urlencoded({ extended: true, limit: '10mb' })(req, res, next); - } -}); - // Logging middleware app.use(morgan('combined', { stream: { @@ -100,17 +73,12 @@ app.use(morgan('combined', { }, })); -// Request debugging middleware -app.use((req, res, next) => { - console.log('📥 Incoming request:', req.method, req.url); - console.log('📥 Request headers:', Object.keys(req.headers)); - console.log('📥 Content-Type:', req.get('Content-Type')); - console.log('📥 Authorization:', req.get('Authorization') ? 'Present' : 'Missing'); - next(); -}); +// CRITICAL: Add body parsing BEFORE routes +app.use(express.json({ limit: '10mb' })); +app.use(express.urlencoded({ extended: true, limit: '10mb' })); // Health check endpoint -app.get('/health', (_req, res) => { // _req to fix TS6133 +app.get('/health', (_req, res) => { res.status(200).json({ status: 'ok', timestamp: new Date().toISOString(), @@ -119,53 +87,23 @@ app.get('/health', (_req, res) => { // _req to fix TS6133 }); }); -// Agentic RAG health check endpoints -app.get('/health/agentic-rag', async (_req, res) => { - try { - const { agenticRAGDatabaseService } = await import('./services/agenticRAGDatabaseService'); - const healthStatus = await agenticRAGDatabaseService.getHealthStatus(); - res.json(healthStatus); - } catch (error) { - logger.error('Agentic RAG health check failed', { error }); - res.status(500).json({ - error: 'Health check failed', - status: 'unhealthy', - timestamp: new Date().toISOString() - }); - } -}); - -app.get('/health/agentic-rag/metrics', async (_req, res) => { - try { - const { agenticRAGDatabaseService } = await import('./services/agenticRAGDatabaseService'); - const startDate = new Date(Date.now() - 30 * 24 * 60 * 60 * 1000); // 30 days ago - const metrics = await agenticRAGDatabaseService.generatePerformanceReport(startDate, new Date()); - res.json(metrics); - } catch (error) { - logger.error('Agentic RAG metrics retrieval failed', { error }); - res.status(500).json({ error: 'Metrics retrieval failed' }); - } -}); - -// API routes - remove the /api prefix as it's handled by Firebase +// API Routes app.use('/documents', documentRoutes); app.use('/vector', vectorRoutes); app.use('/monitoring', monitoringRoutes); import * as functions from 'firebase-functions'; +import { onRequest } from 'firebase-functions/v2/https'; // API root endpoint -app.get('/', (_req, res) => { // _req to fix TS6133 +app.get('/', (_req, res) => { res.json({ message: 'CIM Document Processor API', version: '1.0.0', endpoints: { - auth: '/auth', documents: '/documents', health: '/health', - agenticRagHealth: '/health/agentic-rag', - agenticRagMetrics: '/health/agentic-rag/metrics', monitoring: '/monitoring', }, }); @@ -177,26 +115,11 @@ app.use(notFoundHandler); // Global error handler (must be last) app.use(errorHandler); -// Initialize job queue service for document processing -import { jobQueueService } from './services/jobQueueService'; - -// Start the job queue service asynchronously to avoid blocking function startup -// Use a longer delay to ensure the function is fully initialized -setTimeout(() => { - try { - jobQueueService.start(); - logger.info('Job queue service started successfully'); - } catch (error) { - logger.error('Failed to start job queue service', { error }); - } -}, 5000); - -// Listen on a port when not in a Firebase Function environment or when PORT is explicitly set -if (!process.env['FUNCTION_TARGET'] || process.env['PORT']) { - const port = process.env['PORT'] || 5001; - app.listen(port, () => { - logger.info(`API server listening on port ${port}`); - }); -} - -export const api = functions.https.onRequest(app); \ No newline at end of file +// Configure Firebase Functions v2 for larger uploads +export const api = onRequest({ + timeoutSeconds: 540, // 9 minutes + memory: '2GiB', + cpu: 1, + maxInstances: 10, + cors: true +}, app); \ No newline at end of file diff --git a/backend/src/middleware/errorHandler.ts b/backend/src/middleware/errorHandler.ts index bee3298..7c90bf6 100644 --- a/backend/src/middleware/errorHandler.ts +++ b/backend/src/middleware/errorHandler.ts @@ -11,6 +11,18 @@ export const errorHandler = ( req: Request, res: Response ): void => { + console.log('💥💥💥 MAXIMUM DEBUG ERROR HANDLER HIT 💥💥💥'); + console.log('💥 Error name:', err.name); + console.log('💥 Error message:', err.message); + console.log('💥 Error code:', (err as any).code); + console.log('💥 Error type:', typeof err); + console.log('💥 Error constructor:', err.constructor.name); + console.log('💥 Error stack:', err.stack); + console.log('💥 Request URL:', req.url); + console.log('💥 Request method:', req.method); + console.log('💥 Full error object:', JSON.stringify(err, Object.getOwnPropertyNames(err), 2)); + console.log('💥💥💥 END ERROR DEBUG 💥💥💥'); + let error = { ...err }; error.message = err.message; @@ -53,6 +65,13 @@ export const errorHandler = ( error = { message, statusCode: 401 } as AppError; } + // Multer errors (check if multer is imported anywhere) + if (err.name === 'MulterError' || (err as any).code === 'UNEXPECTED_END_OF_FORM') { + console.log('🚨 MULTER ERROR CAUGHT:', err.message); + const message = `File upload failed: ${err.message}`; + error = { message, statusCode: 400 } as AppError; + } + // Default error const statusCode = error.statusCode || 500; const message = error.message || 'Server Error'; diff --git a/backend/src/middleware/upload.ts b/backend/src/middleware/upload.ts index ff5144d..1c54a73 100644 --- a/backend/src/middleware/upload.ts +++ b/backend/src/middleware/upload.ts @@ -13,9 +13,15 @@ if (!fs.existsSync(uploadDir)) { // File filter function const fileFilter = (req: Request, file: any, cb: multer.FileFilterCallback) => { - console.log('🔍 File filter called for:', file.originalname); + console.log('🔍 ===== FILE FILTER CALLED ====='); + console.log('🔍 File originalname:', file.originalname); console.log('🔍 File mimetype:', file.mimetype); console.log('🔍 File size:', file.size); + console.log('🔍 File encoding:', file.encoding); + console.log('🔍 File fieldname:', file.fieldname); + console.log('🔍 Request Content-Type:', req.get('Content-Type')); + console.log('🔍 Request Content-Length:', req.get('Content-Length')); + console.log('🔍 ==========================='); // Check file type - allow PDF and text files for testing const allowedTypes = ['application/pdf', 'text/plain', 'text/html']; @@ -68,6 +74,14 @@ const upload = multer({ // Error handling middleware for multer export const handleUploadError = (error: any, req: Request, res: Response, next: NextFunction): void => { + console.log('🚨 ============================='); + console.log('🚨 UPLOAD ERROR HANDLER CALLED'); + console.log('🚨 Error type:', error?.constructor?.name); + console.log('🚨 Error message:', error?.message); + console.log('🚨 Error code:', error?.code); + console.log('🚨 Is MulterError:', error instanceof multer.MulterError); + console.log('🚨 ============================='); + if (error instanceof multer.MulterError) { logger.error('Multer error during file upload:', { error: error.message, @@ -129,12 +143,14 @@ export const handleUploadError = (error: any, req: Request, res: Response, next: // Main upload middleware with timeout handling export const uploadMiddleware = (req: Request, res: Response, next: NextFunction) => { - console.log('📤 Upload middleware called'); + console.log('📤 ============================='); + console.log('📤 UPLOAD MIDDLEWARE CALLED'); console.log('📤 Request method:', req.method); console.log('📤 Request URL:', req.url); console.log('📤 Content-Type:', req.get('Content-Type')); console.log('📤 Content-Length:', req.get('Content-Length')); console.log('📤 User-Agent:', req.get('User-Agent')); + console.log('📤 ============================='); // Set a timeout for the upload const uploadTimeout = setTimeout(() => { @@ -155,12 +171,25 @@ export const uploadMiddleware = (req: Request, res: Response, next: NextFunction clearTimeout(uploadTimeout); if (err) { console.log('❌ Upload middleware error:', err); + console.log('❌ Error details:', { + name: err.name, + message: err.message, + code: err.code, + stack: err.stack?.split('\n')[0] + }); } else { console.log('✅ Upload middleware completed successfully'); + console.log('✅ File after multer processing:', { + hasFile: !!req.file, + filename: req.file?.originalname, + size: req.file?.size, + mimetype: req.file?.mimetype + }); } originalNext(err); }; + console.log('🔄 Calling multer.single("document")...'); upload.single('document')(req, res, next); }; diff --git a/backend/src/models/VectorDatabaseModel.ts b/backend/src/models/VectorDatabaseModel.ts index 573e58b..946bd27 100644 --- a/backend/src/models/VectorDatabaseModel.ts +++ b/backend/src/models/VectorDatabaseModel.ts @@ -1,6 +1,6 @@ import { v4 as uuidv4 } from 'uuid'; import { logger } from '../utils/logger'; -import pool from '../config/database'; +import { getSupabaseServiceClient } from '../config/supabase'; export interface DocumentChunk { id: string; @@ -15,577 +15,21 @@ export interface DocumentChunk { updatedAt: Date; } -export interface VectorSearchResult { - documentId: string; - similarityScore: number; - chunkContent: string; - metadata: Record; -} - -export interface DocumentSimilarity { - id: string; - sourceDocumentId: string; - targetDocumentId: string; - similarityScore: number; - similarityType: string; - metadata: Record; - createdAt: Date; -} - -export interface IndustryEmbedding { - id: string; - industryName: string; - industryDescription?: string; - embedding: number[]; - documentCount: number; - averageSimilarity?: number; - createdAt: Date; - updatedAt: Date; -} - export class VectorDatabaseModel { - /** - * Store document chunks with embeddings - */ static async storeDocumentChunks(chunks: Omit[]): Promise { - const client = await pool.connect(); - - try { - await client.query('BEGIN'); - - for (const chunk of chunks) { - // Ensure embedding is properly formatted for pgvector - const embeddingArray = Array.isArray(chunk.embedding) ? chunk.embedding : []; - - // Validate embedding dimensions (should be 1536 for text-embedding-3-small) - if (embeddingArray.length !== 1536) { - logger.warn(`Embedding dimension mismatch: expected 1536, got ${embeddingArray.length}`); - // Pad or truncate to 1536 dimensions if necessary - const paddedEmbedding = new Array(1536).fill(0); - embeddingArray.forEach((val, index) => { - if (index < 1536) paddedEmbedding[index] = val; - }); - } - - // Format embedding properly for pgvector - must be a JSON array string - const embeddingString = JSON.stringify(embeddingArray); - - await client.query(` - INSERT INTO document_chunks ( - id, document_id, content, metadata, embedding, - chunk_index, section, page_number - ) VALUES ($1, $2, $3, $4, $5::vector, $6, $7, $8) - ON CONFLICT (id) DO UPDATE SET - content = EXCLUDED.content, - metadata = EXCLUDED.metadata, - embedding = EXCLUDED.embedding, - section = EXCLUDED.section, - page_number = EXCLUDED.page_number, - updated_at = CURRENT_TIMESTAMP - `, [ - uuidv4(), - chunk.documentId, - chunk.content, - JSON.stringify(chunk.metadata), - embeddingString, // Pass as JSON string for pgvector - chunk.chunkIndex, - chunk.section, - chunk.pageNumber - ]); - } - - await client.query('COMMIT'); - logger.info(`Stored ${chunks.length} document chunks in vector database`); - } catch (error) { - await client.query('ROLLBACK'); + const supabase = getSupabaseServiceClient(); + const { data, error } = await supabase + .from('document_chunks') + .insert(chunks.map(chunk => ({ + ...chunk, + embedding: `[${chunk.embedding.join(',')}]` // Format for pgvector + }))); + + if (error) { logger.error('Failed to store document chunks', error); throw error; - } finally { - client.release(); } + + logger.info(`Stored ${chunks.length} document chunks in vector database`); } - - /** - * Search for similar content using vector similarity - */ - static async searchSimilarContent( - queryEmbedding: number[], - options: { - documentId?: string; - limit?: number; - similarityThreshold?: number; - filters?: Record; - } = {} - ): Promise { - const { - documentId, - limit = 10, - similarityThreshold = 0.7, - filters = {} - } = options; - - // Ensure embedding is properly formatted - const embeddingArray = Array.isArray(queryEmbedding) ? queryEmbedding : []; - - // Validate embedding dimensions - if (embeddingArray.length !== 1536) { - logger.warn(`Query embedding dimension mismatch: expected 1536, got ${embeddingArray.length}`); - // Pad or truncate to 1536 dimensions if necessary - const paddedEmbedding = new Array(1536).fill(0); - embeddingArray.forEach((val, index) => { - if (index < 1536) paddedEmbedding[index] = val; - }); - } - - let query = ` - SELECT - dc.document_id, - 1 - (dc.embedding <=> $1::vector) as similarity_score, - dc.content as chunk_content, - dc.metadata - FROM document_chunks dc - WHERE dc.embedding IS NOT NULL - `; - - const params: any[] = [embeddingArray]; - let paramIndex = 2; - - if (documentId) { - query += ` AND dc.document_id = $${paramIndex}`; - params.push(documentId); - paramIndex++; - } - - // Add metadata filters - Object.entries(filters).forEach(([key, value]) => { - query += ` AND dc.metadata->>'${key}' = $${paramIndex}`; - params.push(value); - paramIndex++; - }); - - query += ` - AND 1 - (dc.embedding <=> $1::vector) >= $${paramIndex} - ORDER BY dc.embedding <=> $1::vector - LIMIT $${paramIndex + 1} - `; - params.push(similarityThreshold, limit); - - try { - const result = await pool.query(query, params); - - return result.rows.map((row: any) => ({ - documentId: row.document_id, - similarityScore: parseFloat(row.similarity_score), - chunkContent: row.chunk_content, - metadata: row.metadata - })); - } catch (error) { - logger.error('Vector search failed', error); - throw error; - } - } - - /** - * Get document chunks by document ID - */ - static async getDocumentChunks(documentId: string): Promise { - try { - const result = await pool.query(` - SELECT - id, - document_id, - content, - metadata, - embedding, - chunk_index, - section, - page_number, - created_at, - updated_at - FROM document_chunks - WHERE document_id = $1 - ORDER BY chunk_index - `, [documentId]); - - return result.rows.map((row: any) => ({ - id: row.id, - documentId: row.document_id, - content: row.content, - metadata: row.metadata || {}, - embedding: row.embedding || [], - chunkIndex: row.chunk_index, - section: row.section, - pageNumber: row.page_number, - createdAt: row.created_at, - updatedAt: row.updated_at - })); - } catch (error) { - logger.error('Failed to get document chunks', error); - throw error; - } - } - - /** - * Find similar documents - */ - static async findSimilarDocuments( - documentId: string, - limit: number = 10, - similarityThreshold: number = 0.6 - ): Promise { - try { - // Get document chunks - const documentChunks = await this.getDocumentChunks(documentId); - if (documentChunks.length === 0) return []; - - // Use the first chunk as reference - const referenceChunk = documentChunks[0]; - if (!referenceChunk || !referenceChunk.embedding) return []; - - const result = await pool.query(` - SELECT - id, - source_document_id, - target_document_id, - similarity_score, - similarity_type, - metadata, - created_at - FROM document_similarities - WHERE source_document_id = $1 - AND similarity_score >= $2 - ORDER BY similarity_score DESC - LIMIT $3 - `, [documentId, similarityThreshold, limit]); - - return result.rows.map((row: any) => ({ - id: row.id, - sourceDocumentId: row.source_document_id, - targetDocumentId: row.target_document_id, - similarityScore: parseFloat(row.similarity_score), - similarityType: row.similarity_type, - metadata: row.metadata || {}, - createdAt: row.created_at - })); - } catch (error) { - logger.error('Failed to find similar documents', error); - throw error; - } - } - - /** - * Update document similarities - */ - static async updateDocumentSimilarities(): Promise { - try { - await pool.query(` - SELECT update_document_similarities(); - `); - logger.info('Document similarities updated'); - } catch (error) { - logger.error('Failed to update document similarities', error); - throw error; - } - } - - /** - * Store industry embedding - */ - static async storeIndustryEmbedding(industry: Omit): Promise { - try { - // Ensure embedding is properly formatted - const embeddingArray = Array.isArray(industry.embedding) ? industry.embedding : []; - - // Validate embedding dimensions - if (embeddingArray.length !== 1536) { - logger.warn(`Industry embedding dimension mismatch: expected 1536, got ${embeddingArray.length}`); - // Pad or truncate to 1536 dimensions if necessary - const paddedEmbedding = new Array(1536).fill(0); - embeddingArray.forEach((val, index) => { - if (index < 1536) paddedEmbedding[index] = val; - }); - } - - await pool.query(` - INSERT INTO industry_embeddings ( - id, industry_name, industry_description, embedding, - document_count, average_similarity - ) VALUES ($1, $2, $3, $4::vector, $5, $6) - ON CONFLICT (industry_name) DO UPDATE SET - industry_description = EXCLUDED.industry_description, - embedding = EXCLUDED.embedding, - document_count = EXCLUDED.document_count, - average_similarity = EXCLUDED.average_similarity, - updated_at = CURRENT_TIMESTAMP - `, [ - uuidv4(), - industry.industryName, - industry.industryDescription, - embeddingArray, - industry.documentCount, - industry.averageSimilarity - ]); - - logger.info(`Stored industry embedding for: ${industry.industryName}`); - } catch (error) { - logger.error('Failed to store industry embedding', error); - throw error; - } - } - - /** - * Search by industry - */ - static async searchByIndustry( - industryName: string, - queryEmbedding: number[], - limit: number = 20 - ): Promise { - try { - // Ensure embedding is properly formatted - const embeddingArray = Array.isArray(queryEmbedding) ? queryEmbedding : []; - - // Validate embedding dimensions - if (embeddingArray.length !== 1536) { - logger.warn(`Industry search embedding dimension mismatch: expected 1536, got ${embeddingArray.length}`); - // Pad or truncate to 1536 dimensions if necessary - const paddedEmbedding = new Array(1536).fill(0); - embeddingArray.forEach((val, index) => { - if (index < 1536) paddedEmbedding[index] = val; - }); - } - - const result = await pool.query(` - SELECT - dc.document_id, - 1 - (dc.embedding <=> $1::vector) as similarity_score, - dc.content as chunk_content, - dc.metadata - FROM document_chunks dc - WHERE dc.embedding IS NOT NULL - AND dc.metadata->>'industry' = $2 - ORDER BY dc.embedding <=> $1::vector - LIMIT $3 - `, [embeddingArray, industryName.toLowerCase(), limit]); - - return result.rows.map((row: any) => ({ - documentId: row.document_id, - similarityScore: parseFloat(row.similarity_score), - chunkContent: row.chunk_content, - metadata: row.metadata || {} - })); - } catch (error) { - logger.error('Failed to search by industry', error); - throw error; - } - } - - /** - * Track search query for analytics - */ - static async trackSearchQuery( - userId: string, - queryText: string, - queryEmbedding: number[], - searchResults: VectorSearchResult[], - options: { - filters?: Record; - limitCount?: number; - similarityThreshold?: number; - processingTimeMs?: number; - } = {} - ): Promise { - try { - // Ensure embedding is properly formatted - const embeddingArray = Array.isArray(queryEmbedding) ? queryEmbedding : []; - - // Validate embedding dimensions - if (embeddingArray.length !== 1536) { - logger.warn(`Search tracking embedding dimension mismatch: expected 1536, got ${embeddingArray.length}`); - // Pad or truncate to 1536 dimensions if necessary - const paddedEmbedding = new Array(1536).fill(0); - embeddingArray.forEach((val, index) => { - if (index < 1536) paddedEmbedding[index] = val; - }); - } - - await pool.query(` - INSERT INTO vector_similarity_searches ( - id, user_id, query_text, query_embedding, search_results, - filters, limit_count, similarity_threshold, processing_time_ms - ) VALUES ($1, $2, $3, $4::vector, $5, $6, $7, $8, $9) - `, [ - uuidv4(), - userId, - queryText, - embeddingArray, - JSON.stringify(searchResults), - JSON.stringify(options.filters || {}), - options.limitCount || 10, - options.similarityThreshold || 0.7, - options.processingTimeMs || 0 - ]); - - logger.debug('Search query tracked for analytics'); - } catch (error) { - logger.error('Failed to track search query', error); - // Don't throw - analytics failure shouldn't break search - } - } - - /** - * Get search analytics - */ - static async getSearchAnalytics(userId: string, days: number = 30): Promise { - try { - const result = await pool.query(` - SELECT - query_text, - COUNT(*) as search_count, - AVG(processing_time_ms) as avg_processing_time, - AVG(similarity_threshold) as avg_similarity_threshold, - MAX(created_at) as last_search - FROM vector_similarity_searches - WHERE user_id = $1 - AND created_at >= NOW() - INTERVAL '${days} days' - GROUP BY query_text - ORDER BY search_count DESC - LIMIT 20 - `, [userId]); - - return result.rows; - } catch (error) { - logger.error('Failed to get search analytics', error); - throw error; - } - } - - /** - * Delete document chunks - */ - static async deleteDocumentChunks(documentId: string): Promise { - try { - await pool.query(` - DELETE FROM document_chunks - WHERE document_id = $1 - `, [documentId]); - - logger.info(`Deleted chunks for document: ${documentId}`); - } catch (error) { - logger.error('Failed to delete document chunks', error); - throw error; - } - } - - /** - * Get vector database statistics - */ - static async getVectorDatabaseStats(): Promise<{ - totalChunks: number; - totalDocuments: number; - totalSearches: number; - averageSimilarity: number; - }> { - try { - const [chunksResult, documentsResult, searchesResult, similarityResult] = await Promise.all([ - pool.query('SELECT COUNT(*) as count FROM document_chunks'), - pool.query('SELECT COUNT(DISTINCT document_id) as count FROM document_chunks'), - pool.query('SELECT COUNT(*) as count FROM vector_similarity_searches'), - pool.query(` - SELECT AVG(similarity_score) as avg_similarity - FROM document_similarities - WHERE similarity_score > 0 - `) - ]); - - return { - totalChunks: parseInt(chunksResult.rows[0]?.count || '0'), - totalDocuments: parseInt(documentsResult.rows[0]?.count || '0'), - totalSearches: parseInt(searchesResult.rows[0]?.count || '0'), - averageSimilarity: parseFloat(similarityResult.rows[0]?.avg_similarity || '0') - }; - } catch (error) { - logger.error('Failed to get vector database stats', error); - throw error; - } - } - - /** - * Get all chunks (for testing/debugging) - */ - static async getAllChunks(): Promise { - try { - const result = await pool.query(` - SELECT - id, - document_id, - content, - metadata, - embedding, - chunk_index, - section, - page_number, - created_at, - updated_at - FROM document_chunks - ORDER BY document_id, chunk_index - LIMIT 1000 - `); - - return result.rows.map((row: any) => ({ - id: row.id, - documentId: row.document_id, - content: row.content, - metadata: row.metadata || {}, - embedding: row.embedding || [], - chunkIndex: row.chunk_index, - section: row.section, - pageNumber: row.page_number, - createdAt: row.created_at, - updatedAt: row.updated_at - })); - } catch (error) { - logger.error('Failed to get all chunks', error); - throw error; - } - } - - /** - * Get total chunk count - */ - static async getTotalChunkCount(): Promise { - try { - const result = await pool.query('SELECT COUNT(*) as count FROM document_chunks'); - return parseInt(result.rows[0]?.count || '0'); - } catch (error) { - logger.error('Failed to get total chunk count', error); - throw error; - } - } - - /** - * Get total document count - */ - static async getTotalDocumentCount(): Promise { - try { - const result = await pool.query('SELECT COUNT(DISTINCT document_id) as count FROM document_chunks'); - return parseInt(result.rows[0]?.count || '0'); - } catch (error) { - logger.error('Failed to get total document count', error); - throw error; - } - } - - /** - * Get average chunk size - */ - static async getAverageChunkSize(): Promise { - try { - const result = await pool.query('SELECT AVG(LENGTH(content)) as avg_size FROM document_chunks'); - return Math.round(parseFloat(result.rows[0]?.avg_size || '0')); - } catch (error) { - logger.error('Failed to get average chunk size', error); - throw error; - } - } -} \ No newline at end of file +} \ No newline at end of file diff --git a/backend/src/models/types.ts b/backend/src/models/types.ts index c39b570..891d135 100644 --- a/backend/src/models/types.ts +++ b/backend/src/models/types.ts @@ -63,6 +63,7 @@ export interface ProcessingJob { } export type ProcessingStatus = + | 'uploading' | 'uploaded' | 'extracting_text' | 'processing_llm' diff --git a/backend/src/routes/documents.ts b/backend/src/routes/documents.ts index e53e406..f660c68 100644 --- a/backend/src/routes/documents.ts +++ b/backend/src/routes/documents.ts @@ -23,9 +23,13 @@ const router = express.Router(); router.use(verifyFirebaseToken); router.use(addCorrelationId); -// Essential document management routes (keeping these) +// NEW Firebase Storage direct upload routes +router.post('/upload-url', documentController.getUploadUrl); +router.post('/:id/confirm-upload', validateUUID('id'), documentController.confirmUpload); + +// LEGACY multipart upload routes (keeping for backward compatibility) router.post('/upload', handleFileUpload, documentController.uploadDocument); -router.post('/', handleFileUpload, documentController.uploadDocument); // Add direct POST to /documents for frontend compatibility +router.post('/', handleFileUpload, documentController.uploadDocument); router.get('/', documentController.getDocuments); // Analytics endpoints (MUST come before /:id routes to avoid conflicts) diff --git a/backend/src/services/fileStorageService.ts b/backend/src/services/fileStorageService.ts index 6fec134..69a457f 100644 --- a/backend/src/services/fileStorageService.ts +++ b/backend/src/services/fileStorageService.ts @@ -483,6 +483,37 @@ class FileStorageService { } } + /** + * Generate signed upload URL for direct client uploads + */ + async generateSignedUploadUrl(filePath: string, contentType: string, expirationMinutes: number = 60): Promise { + try { + const bucket = this.storage.bucket(this.bucketName); + const file = bucket.file(filePath); + + // Generate signed upload URL with retry logic + const [signedUrl] = await this.retryOperation( + async () => file.getSignedUrl({ + version: 'v4', + action: 'write', + expires: Date.now() + (expirationMinutes * 60 * 1000), + contentType: contentType, + }), + 'generate signed upload URL from GCS' + ); + + logger.info(`Generated signed upload URL for file: ${filePath}`, { + contentType, + expirationMinutes, + }); + + return signedUrl; + } catch (error) { + logger.error(`Error generating signed upload URL for file: ${filePath}`, error); + throw new Error(`Failed to generate upload URL: ${error instanceof Error ? error.message : 'Unknown error'}`); + } + } + /** * Copy file within Google Cloud Storage */ diff --git a/cors.json b/cors.json new file mode 100644 index 0000000..ace0f26 --- /dev/null +++ b/cors.json @@ -0,0 +1,23 @@ +[ + { + "origin": [ + "https://cim-summarizer.web.app", + "https://cim-summarizer.firebaseapp.com", + "http://localhost:3000", + "http://localhost:5173" + ], + "method": [ + "GET", + "POST", + "PUT", + "DELETE", + "OPTIONS" + ], + "responseHeader": [ + "Content-Type", + "Authorization", + "X-Requested-With" + ], + "maxAgeSeconds": 3600 + } +] diff --git a/firebase.json b/firebase.json new file mode 100644 index 0000000..e0078db --- /dev/null +++ b/firebase.json @@ -0,0 +1,6 @@ +{ + "storage": { + "rules": "storage.rules", + "cors": "storage.cors.json" + } +} \ No newline at end of file diff --git a/frontend/firebase.json b/frontend/firebase.json index d7f1b9d..fb89463 100644 --- a/frontend/firebase.json +++ b/frontend/firebase.json @@ -63,6 +63,10 @@ } ], "rewrites": [ + { + "source": "/api/**", + "function": "api" + }, { "source": "**", "destination": "/index.html" diff --git a/frontend/src/App.tsx b/frontend/src/App.tsx index a61efcf..cfd907f 100644 --- a/frontend/src/App.tsx +++ b/frontend/src/App.tsx @@ -387,19 +387,6 @@ const Dashboard: React.FC = () => { Welcome, {user?.name || user?.email} - {/* Debug buttons - show in production for troubleshooting */} - - diff --git a/frontend/src/components/DocumentUpload.tsx b/frontend/src/components/DocumentUpload.tsx index 05e948a..897fbc7 100644 --- a/frontend/src/components/DocumentUpload.tsx +++ b/frontend/src/components/DocumentUpload.tsx @@ -14,10 +14,10 @@ interface UploadedFile { progress: number; error?: string; documentId?: string; // Real document ID from backend - // GCS-specific fields - gcsError?: boolean; - storageType?: 'gcs' | 'local'; - gcsUrl?: string; + // Firebase Storage specific fields + storageError?: boolean; + storageType?: 'firebase' | 'local'; + storageUrl?: string; } interface DocumentUploadProps { @@ -92,17 +92,15 @@ const DocumentUpload: React.FC = ({ try { // Upload the document with optimized agentic RAG processing (no strategy selection needed) - const document = await documentService.uploadDocument( - file, + const result = await documentService.uploadDocument( + file, (progress) => { setUploadedFiles(prev => prev.map(f => - f.id === uploadedFile.id - ? { ...f, progress } - : f + f.id === uploadedFile.id ? { ...f, progress } : f ) ); - }, + }, abortController.signal ); @@ -141,13 +139,13 @@ const DocumentUpload: React.FC = ({ } else { console.error('Upload failed:', error); - // Handle GCS-specific errors + // Handle storage-specific errors let errorMessage = 'Upload failed'; - let isGCSError = false; + let isStorageError = false; if (GCSErrorHandler.isGCSError(error)) { errorMessage = GCSErrorHandler.getErrorMessage(error as GCSError); - isGCSError = true; + isStorageError = true; } else if (error instanceof Error) { errorMessage = error.message; } @@ -159,8 +157,8 @@ const DocumentUpload: React.FC = ({ ...f, status: 'error', error: errorMessage, - // Add GCS error indicator - ...(isGCSError && { gcsError: true }) + // Add storage error indicator + ...(isStorageError && { storageError: true }) } : f ) @@ -297,19 +295,19 @@ const DocumentUpload: React.FC = ({ } }; - const getStatusText = (status: UploadedFile['status'], error?: string, gcsError?: boolean) => { + const getStatusText = (status: UploadedFile['status'], error?: string, storageError?: boolean) => { switch (status) { case 'uploading': - return 'Uploading to Google Cloud Storage...'; + return 'Uploading to Firebase Storage...'; case 'uploaded': - return 'Uploaded to GCS ✓'; + return 'Uploaded to Firebase Storage ✓'; case 'processing': - return 'Processing with Optimized Agentic RAG...'; + return 'Processing with Document AI + Optimized Agentic RAG...'; case 'completed': - return 'Completed ✓'; + return 'Completed ✓ (PDF automatically deleted)'; case 'error': if (error === 'Upload cancelled') return 'Cancelled'; - if (gcsError) return 'GCS Error'; + if (storageError) return 'Firebase Storage Error'; return 'Error'; default: return ''; @@ -323,10 +321,10 @@ const DocumentUpload: React.FC = ({
-

Optimized Agentic RAG Processing

+

Document AI + Optimized Agentic RAG Processing

- All documents are automatically processed using our advanced optimized agentic RAG system, - which includes intelligent chunking, vectorization, and multi-agent analysis for the best results. + All documents are automatically processed using Google Document AI for extraction and our advanced optimized agentic RAG system for analysis, + including intelligent chunking, vectorization, and multi-agent CIM review. PDFs are automatically deleted after processing.

@@ -351,7 +349,7 @@ const DocumentUpload: React.FC = ({ Drag and drop PDF files here, or click to browse

- Maximum file size: 50MB • Supported format: PDF • Stored securely in Google Cloud Storage • Automatic Optimized Agentic RAG Processing + Maximum file size: 50MB • Supported format: PDF • Stored securely in Firebase Storage • Automatic Document AI + Optimized Agentic RAG Processing • PDFs deleted after processing

@@ -379,8 +377,8 @@ const DocumentUpload: React.FC = ({

Upload Complete

- Files have been uploaded successfully to Google Cloud Storage! You can now navigate away from this page. - Processing will continue in the background using Optimized Agentic RAG and you can check the status in the Documents tab. + Files have been uploaded successfully to Firebase Storage! You can now navigate away from this page. + Processing will continue in the background using Document AI + Optimized Agentic RAG. PDFs will be automatically deleted after processing to save costs.

@@ -426,10 +424,10 @@ const DocumentUpload: React.FC = ({
{getStatusIcon(file.status)} - {getStatusText(file.status, file.error, file.gcsError)} + {getStatusText(file.status, file.error, file.storageError)} - {/* GCS indicator */} - {file.storageType === 'gcs' && ( + {/* Firebase Storage indicator */} + {file.storageType === 'firebase' && ( )}
@@ -452,4 +450,4 @@ const DocumentUpload: React.FC = ({ ); }; -export default DocumentUpload; \ No newline at end of file +export default DocumentUpload; \ No newline at end of file diff --git a/frontend/src/services/documentService.ts b/frontend/src/services/documentService.ts index dc2d521..c73a35b 100644 --- a/frontend/src/services/documentService.ts +++ b/frontend/src/services/documentService.ts @@ -60,7 +60,7 @@ export interface Document { file_path: string; file_size: number; uploaded_at: string; - status: 'uploaded' | 'extracting_text' | 'processing_llm' | 'generating_pdf' | 'completed' | 'failed'; + status: 'uploading' | 'uploaded' | 'extracting_text' | 'processing_llm' | 'generating_pdf' | 'completed' | 'failed'; extracted_text?: string; generated_summary?: string; summary_markdown_path?: string; @@ -219,7 +219,7 @@ export class GCSErrorHandler { class DocumentService { /** - * Upload a document for processing + * Upload a document using Firebase Storage direct upload (new method) */ async uploadDocument( file: File, @@ -233,7 +233,137 @@ class DocumentService { throw new Error('Authentication required. Please log in to upload documents.'); } - console.log('📤 Starting document upload...'); + console.log('📤 Starting Firebase Storage direct upload...'); + console.log('📤 File:', file.name, 'Size:', file.size, 'Type:', file.type); + console.log('📤 Token available:', !!token); + + // Step 1: Get signed upload URL + onProgress?.(5); // 5% - Getting upload URL + + console.log('🌐 Making request to upload-url endpoint'); + console.log('🌐 Base URL:', API_BASE_URL); + console.log('🌐 Full URL would be:', `${API_BASE_URL}/documents/upload-url`); + console.log('🌐 Request payload:', { fileName: file.name, fileSize: file.size, contentType: file.type }); + + const uploadUrlResponse = await apiClient.post('/documents/upload-url', { + fileName: file.name, + fileSize: file.size, + contentType: file.type + }, { signal }); + + const { documentId, uploadUrl } = uploadUrlResponse.data; + console.log('✅ Got signed upload URL for document:', documentId); + + // Step 2: Upload directly to Firebase Storage + onProgress?.(10); // 10% - Starting direct upload + + await this.uploadToFirebaseStorage(file, uploadUrl, onProgress, signal); + console.log('✅ File uploaded to Firebase Storage'); + + // Step 3: Confirm upload and trigger processing + onProgress?.(95); // 95% - Confirming upload + + const confirmResponse = await apiClient.post(`/documents/${documentId}/confirm-upload`, {}, { signal }); + + onProgress?.(100); // 100% - Complete + console.log('✅ Upload confirmed and processing started'); + + return { + id: documentId, + ...confirmResponse.data + }; + + } catch (error: any) { + console.error('❌ Firebase Storage upload failed:', error); + + // Handle specific error cases + if (error.name === 'AbortError') { + throw new Error('Upload was cancelled.'); + } + + if (error.response?.status === 401) { + throw new Error('Authentication required. Please log in again.'); + } + + if (error.response?.status === 400) { + throw new Error(error.response?.data?.error || 'Invalid request'); + } + + if (error.response?.status >= 500) { + throw new Error('Server error. Please try again later.'); + } + + // Generic error fallback + throw new Error(error.response?.data?.error || error.message || 'Upload failed'); + } + } + + /** + * Upload file directly to Firebase Storage using signed URL + */ + private async uploadToFirebaseStorage( + file: File, + uploadUrl: string, + onProgress?: (progress: number) => void, + signal?: AbortSignal + ): Promise { + return new Promise((resolve, reject) => { + const xhr = new XMLHttpRequest(); + + // Handle upload progress + xhr.upload.addEventListener('progress', (event) => { + if (event.lengthComputable && onProgress) { + // Map Firebase Storage upload to 10%-90% of overall progress + const uploadProgress = Math.round((event.loaded / event.total) * 80) + 10; + onProgress(uploadProgress); + } + }); + + // Handle completion + xhr.addEventListener('load', () => { + if (xhr.status >= 200 && xhr.status < 300) { + resolve(); + } else { + reject(new Error(`Firebase Storage upload failed: ${xhr.status} ${xhr.statusText}`)); + } + }); + + // Handle errors + xhr.addEventListener('error', () => { + reject(new Error('Firebase Storage upload failed: Network error')); + }); + + // Handle abort + if (signal) { + signal.addEventListener('abort', () => { + xhr.abort(); + reject(new Error('Upload was cancelled')); + }); + } + + // Start upload + xhr.open('PUT', uploadUrl); + xhr.setRequestHeader('Content-Type', file.type); + xhr.send(file); + }); + } + + /** + * Legacy multipart upload method (kept for compatibility) + */ + async uploadDocumentLegacy( + file: File, + onProgress?: (progress: number) => void, + signal?: AbortSignal + ): Promise { + try { + // Check authentication before upload + const token = await authService.getToken(); + if (!token) { + throw new Error('Authentication required. Please log in to upload documents.'); + } + + console.log('📤 Starting legacy multipart upload...'); console.log('📤 File:', file.name, 'Size:', file.size, 'Type:', file.type); console.log('📤 Token available:', !!token); @@ -243,7 +373,7 @@ class DocumentService { // Always use optimized agentic RAG processing - no strategy selection needed formData.append('processingStrategy', 'optimized_agentic_rag'); - const response = await apiClient.post('/documents', formData, { + const response = await apiClient.post('/documents/upload', formData, { headers: { 'Content-Type': 'multipart/form-data', }, @@ -256,10 +386,10 @@ class DocumentService { }, }); - console.log('✅ Document upload successful:', response.data); + console.log('✅ Legacy document upload successful:', response.data); return response.data; } catch (error: any) { - console.error('❌ Document upload failed:', error); + console.error('❌ Legacy document upload failed:', error); // Provide more specific error messages if (error.response?.status === 401) { diff --git a/storage.cors.json b/storage.cors.json new file mode 100644 index 0000000..3e9d728 --- /dev/null +++ b/storage.cors.json @@ -0,0 +1,23 @@ +[ + { + "origin": [ + "https://cim-summarizer.web.app", + "https://cim-summarizer.firebaseapp.com", + "http://localhost:3000", + "http://localhost:5173" + ], + "method": [ + "GET", + "POST", + "PUT", + "DELETE", + "OPTIONS" + ], + "responseHeader": [ + "Content-Type", + "Authorization", + "X-Requested-With" + ], + "maxAgeSeconds": 3600 + } +] \ No newline at end of file diff --git a/storage.rules b/storage.rules new file mode 100644 index 0000000..776621d --- /dev/null +++ b/storage.rules @@ -0,0 +1,8 @@ +rules_version = '2'; +service firebase.storage { + match /b/{bucket}/o { + match /{allPaths=**} { + allow read, write: if request.auth != null; + } + } +}