From 95c92946de17fa0a31abf799529851e35d42fca0 Mon Sep 17 00:00:00 2001
From: Jon
Date: Fri, 1 Aug 2025 11:13:03 -0400
Subject: [PATCH] fix(core): Overhaul and fix the end-to-end document
processing pipeline
---
backend/package.json | 2 +-
backend/src/controllers/documentController.ts | 514 ++++++++++------
backend/src/index.ts | 127 +---
backend/src/middleware/errorHandler.ts | 19 +
backend/src/middleware/upload.ts | 33 +-
backend/src/models/VectorDatabaseModel.ts | 582 +-----------------
backend/src/models/types.ts | 1 +
backend/src/routes/documents.ts | 8 +-
backend/src/services/fileStorageService.ts | 31 +
cors.json | 23 +
firebase.json | 6 +
frontend/firebase.json | 4 +
frontend/src/App.tsx | 13 -
frontend/src/components/DocumentUpload.tsx | 60 +-
frontend/src/services/documentService.ts | 142 ++++-
storage.cors.json | 23 +
storage.rules | 8 +
17 files changed, 695 insertions(+), 901 deletions(-)
create mode 100644 cors.json
create mode 100644 firebase.json
create mode 100644 storage.cors.json
create mode 100644 storage.rules
diff --git a/backend/package.json b/backend/package.json
index 5f56476..f05afd7 100644
--- a/backend/package.json
+++ b/backend/package.json
@@ -2,7 +2,7 @@
"name": "cim-processor-backend",
"version": "1.0.0",
"description": "Backend API for CIM Document Processor",
- "main": "index.js",
+ "main": "dist/index.js",
"scripts": {
"dev": "ts-node-dev --respawn --transpile-only --max-old-space-size=8192 --expose-gc src/index.ts",
"build": "tsc && node src/scripts/prepare-dist.js && cp .puppeteerrc.cjs dist/",
diff --git a/backend/src/controllers/documentController.ts b/backend/src/controllers/documentController.ts
index bf36b3e..093702d 100644
--- a/backend/src/controllers/documentController.ts
+++ b/backend/src/controllers/documentController.ts
@@ -7,10 +7,11 @@ import { uploadProgressService } from '../services/uploadProgressService';
import { uploadMonitoringService } from '../services/uploadMonitoringService';
export const documentController = {
- async uploadDocument(req: Request, res: Response): Promise<void> {
- const startTime = Date.now();
- const structuredLogger = new StructuredLogger(req.correlationId);
-
+ async getUploadUrl(req: Request, res: Response): Promise<void> {
+ console.log('🎯🎯🎯 GET UPLOAD URL ENDPOINT HIT!');
+ console.log('🎯 Method:', req.method);
+ console.log('🎯 URL:', req.url);
+ console.log('🎯 Headers:', JSON.stringify(req.headers, null, 2));
try {
const userId = req.user?.uid;
if (!userId) {
@@ -21,206 +22,369 @@ export const documentController = {
return;
}
- // Check if file was uploaded
- if (!req.file) {
- res.status(400).json({
- error: 'No file uploaded',
- correlationId: req.correlationId
+ const { fileName, fileSize, contentType } = req.body;
+
+ if (!fileName || !fileSize || !contentType) {
+ res.status(400).json({
+ error: 'Missing required fields: fileName, fileSize, contentType',
+ correlationId: req.correlationId
});
return;
}
- const file = req.file;
-
- // Track upload start
- const uploadEventData: any = {
- userId,
- fileInfo: {
- originalName: file.originalname,
- size: file.size,
- mimetype: file.mimetype,
- },
- status: 'started',
- stage: 'upload_initiated',
- };
-
- if (req.correlationId) {
- uploadEventData.correlationId = req.correlationId;
- }
-
- uploadMonitoringService.trackUploadEvent(uploadEventData);
-
- structuredLogger.uploadStart({
- originalName: file.originalname,
- size: file.size,
- mimetype: file.mimetype,
- }, userId);
-
- // Always use optimized agentic RAG processing - no strategy selection needed
- const processingStrategy = 'optimized_agentic_rag';
-
- // Store file and get file path
- const storageResult = await fileStorageService.storeFile(file, userId);
-
- if (!storageResult.success || !storageResult.fileInfo) {
- const processingTime = Date.now() - startTime;
-
- // Track upload failure
- const failureEventData: any = {
- userId,
- fileInfo: {
- originalName: file.originalname,
- size: file.size,
- mimetype: file.mimetype,
- },
- status: 'failed',
- stage: 'file_storage',
- error: {
- message: storageResult.error || 'Failed to store file',
- type: 'storage_error',
- code: 'STORAGE_ERROR',
- },
- processingTime,
- };
-
- if (req.correlationId) {
- failureEventData.correlationId = req.correlationId;
- }
-
- uploadMonitoringService.trackUploadEvent(failureEventData);
-
- structuredLogger.uploadError(
- new Error(storageResult.error || 'Failed to store file'),
- {
- originalName: file.originalname,
- size: file.size,
- mimetype: file.mimetype,
- },
- userId,
- 'file_storage'
- );
-
- res.status(500).json({
- error: 'Failed to store file',
- correlationId: req.correlationId
+ // Validate file type
+ if (contentType !== 'application/pdf') {
+ res.status(400).json({
+ error: 'Only PDF files are supported',
+ correlationId: req.correlationId
});
return;
}
-
- // Create document record
+
+ // Validate file size (max 50MB)
+ if (fileSize > 50 * 1024 * 1024) {
+ res.status(400).json({
+ error: 'File size exceeds 50MB limit',
+ correlationId: req.correlationId
+ });
+ return;
+ }
+
+ // Generate unique file path
+ const timestamp = Date.now();
+ const sanitizedFileName = fileName.replace(/[^a-zA-Z0-9.-]/g, '_');
+ const filePath = `uploads/${userId}/${timestamp}_${sanitizedFileName}`;
+
+ // Create document record first
const document = await DocumentModel.create({
user_id: userId,
- original_file_name: file.originalname,
- file_path: storageResult.fileInfo.path,
- file_size: file.size,
- status: 'uploaded'
+ original_file_name: fileName,
+ file_path: filePath,
+ file_size: fileSize,
+ status: 'uploading'
});
- // Always auto-process with optimized agentic RAG
- try {
- const jobId = await jobQueueService.addJob(
- 'document_processing',
- {
- documentId: document.id,
- userId: userId,
- options: { strategy: processingStrategy }
- },
- 0 // Normal priority
- );
- logger.info('Document processing job queued with optimized agentic RAG', {
- documentId: document.id,
- jobId,
- strategy: processingStrategy
- });
-
- // Update status to indicate it's queued for processing
- await DocumentModel.updateById(document.id, { status: 'extracting_text' });
- } catch (error) {
- logger.error('Failed to queue document processing job', { error, documentId: document.id });
- }
+ // Generate signed upload URL
+ const { fileStorageService } = await import('../services/fileStorageService');
+ const uploadUrl = await fileStorageService.generateSignedUploadUrl(filePath, contentType);
- // Track upload success
- const processingTime = Date.now() - startTime;
- const successEventData: any = {
- userId,
- fileInfo: {
- originalName: file.originalname,
- size: file.size,
- mimetype: file.mimetype,
- },
- status: 'success',
- stage: 'upload_completed',
- processingTime,
- };
+ console.log('✅ Generated upload URL for document:', document.id);
- if (req.correlationId) {
- successEventData.correlationId = req.correlationId;
- }
-
- uploadMonitoringService.trackUploadEvent(successEventData);
-
- structuredLogger.uploadSuccess({
- originalName: file.originalname,
- size: file.size,
- mimetype: file.mimetype,
- }, userId, processingTime);
-
- // Return document info
- res.status(201).json({
- id: document.id,
- name: document.original_file_name,
- originalName: document.original_file_name,
- status: 'extracting_text',
- uploadedAt: document.created_at,
- uploadedBy: userId,
- fileSize: document.file_size,
- processingStrategy: processingStrategy,
+ res.status(200).json({
+ documentId: document.id,
+ uploadUrl: uploadUrl,
+ filePath: filePath,
correlationId: req.correlationId || undefined
});
} catch (error) {
- const processingTime = Date.now() - startTime;
+ console.log('❌ Get upload URL error:', error);
+ logger.error('Get upload URL failed', {
+ error,
+ correlationId: req.correlationId
+ });
- // Track upload failure
- const errorEventData: any = {
- userId: req.user?.uid || 'unknown',
- fileInfo: {
- originalName: req.file?.originalname || 'unknown',
- size: req.file?.size || 0,
- mimetype: req.file?.mimetype || 'unknown',
- },
- status: 'failed',
- stage: 'upload_error',
- error: {
- message: error instanceof Error ? error.message : 'Unknown error',
- type: 'upload_error',
- },
- processingTime,
- };
+ res.status(500).json({
+ error: 'Failed to generate upload URL',
+ message: error instanceof Error ? error.message : 'Unknown error',
+ correlationId: req.correlationId || undefined
+ });
+ }
+ },
- if (req.correlationId) {
- errorEventData.correlationId = req.correlationId;
+ async confirmUpload(req: Request, res: Response): Promise<void> {
+ try {
+ const userId = req.user?.uid;
+ if (!userId) {
+ res.status(401).json({
+ error: 'User not authenticated',
+ correlationId: req.correlationId
+ });
+ return;
}
- uploadMonitoringService.trackUploadEvent(errorEventData);
+ const { id: documentId } = req.params;
+ if (!documentId) {
+ res.status(400).json({
+ error: 'Document ID is required',
+ correlationId: req.correlationId
+ });
+ return;
+ }
- structuredLogger.uploadError(
- error,
- {
- originalName: req.file?.originalname || 'unknown',
- size: req.file?.size || 0,
- mimetype: req.file?.mimetype || 'unknown',
- },
- req.user?.uid || 'unknown',
- 'upload_error'
+ // Get document record
+ const document = await DocumentModel.findById(documentId);
+ if (!document) {
+ res.status(404).json({
+ error: 'Document not found',
+ correlationId: req.correlationId
+ });
+ return;
+ }
+
+ // Verify user owns document
+ if (document.user_id !== userId) {
+ res.status(403).json({
+ error: 'Access denied',
+ correlationId: req.correlationId
+ });
+ return;
+ }
+
+ console.log('🔄 Starting Document AI processing for:', documentId);
+
+ // Update status to processing
+ await DocumentModel.updateById(documentId, {
+ status: 'processing_llm'
+ });
+
+ // Acknowledge the request immediately
+ res.status(202).json({
+ message: 'Upload confirmed, processing has started.',
+ documentId: documentId,
+ status: 'processing'
+ });
+
+ // Process in the background
+ (async () => {
+ try {
+ // Download file from Firebase Storage for Document AI processing
+ const { fileStorageService } = await import('../services/fileStorageService');
+
+ let fileBuffer: Buffer | null = null;
+ for (let i = 0; i < 3; i++) {
+ await new Promise(resolve => setTimeout(resolve, 2000)); // 2 second delay
+ fileBuffer = await fileStorageService.getFile(document.file_path);
+ if (fileBuffer) {
+ break;
+ }
+ }
+
+ if (!fileBuffer) {
+ await DocumentModel.updateById(documentId, {
+ status: 'failed',
+ error_message: 'Failed to download uploaded file'
+ });
+ return;
+ }
+
+ // Process with Unified Document Processor
+ const { unifiedDocumentProcessor } = await import('../services/unifiedDocumentProcessor');
+
+ const result = await unifiedDocumentProcessor.processDocument(
+ documentId,
+ userId,
+ '', // Text is not needed for this strategy
+ { strategy: 'optimized_agentic_rag' }
+ );
+
+ if (result.success) {
+ // Update document with results
+ await DocumentModel.updateById(documentId, {
+ status: 'completed',
+ generated_summary: result.summary,
+ processing_completed_at: new Date()
+ });
+
+ // 🗑️ DELETE PDF after successful processing
+ try {
+ await fileStorageService.deleteFile(document.file_path);
+ console.log('✅ PDF deleted after successful processing:', document.file_path);
+ } catch (deleteError) {
+ console.log('⚠️ Failed to delete PDF file:', deleteError);
+ logger.warn('Failed to delete PDF after processing', {
+ filePath: document.file_path,
+ documentId,
+ error: deleteError
+ });
+ }
+
+ console.log('✅ Document AI processing completed successfully');
+ } else {
+ await DocumentModel.updateById(documentId, {
+ status: 'failed',
+ error_message: result.error
+ });
+
+ // Also delete PDF on processing failure to avoid storage costs
+ try {
+ await fileStorageService.deleteFile(document.file_path);
+ console.log('🗑️ PDF deleted after processing failure');
+ } catch (deleteError) {
+ console.log('⚠️ Failed to delete PDF file after error:', deleteError);
+ }
+ }
+ } catch (error) {
+ console.log('❌ Background processing error:', error);
+ logger.error('Background processing failed', {
+ error,
+ documentId
+ });
+ await DocumentModel.updateById(documentId, {
+ status: 'failed',
+ error_message: 'Background processing failed'
+ });
+ }
+ })();
+
+ } catch (error) {
+ console.log('❌ Confirm upload error:', error);
+ logger.error('Confirm upload failed', {
+ error,
+ correlationId: req.correlationId
+ });
+
+ res.status(500).json({
+ error: 'Upload confirmation failed',
+ message: error instanceof Error ? error.message : 'Unknown error',
+ correlationId: req.correlationId || undefined
+ });
+ }
+ },
+
+ async uploadDocument(req: Request, res: Response): Promise<void> {
+ const startTime = Date.now();
+
+ // 🔍 COMPREHENSIVE DEBUG: Log everything about the request
+ console.log('🚀 =========================');
+ console.log('🚀 DOCUMENT AI UPLOAD STARTED');
+ console.log('🚀 Method:', req.method);
+ console.log('🚀 URL:', req.url);
+ console.log('🚀 Content-Type:', req.get('Content-Type'));
+ console.log('🚀 Content-Length:', req.get('Content-Length'));
+ console.log('🚀 Authorization header present:', !!req.get('Authorization'));
+ console.log('🚀 User from token:', req.user?.uid || 'NOT_FOUND');
+
+ // Debug body in detail
+ console.log('🚀 Has body:', !!req.body);
+ console.log('🚀 Body type:', typeof req.body);
+ console.log('🚀 Body constructor:', req.body?.constructor?.name);
+ console.log('🚀 Body length:', req.body?.length || 0);
+ console.log('🚀 Is Buffer?:', Buffer.isBuffer(req.body));
+
+ // Debug all headers
+ console.log('🚀 All headers:', JSON.stringify(req.headers, null, 2));
+
+ // Debug request properties
+ console.log('🚀 Request readable:', req.readable);
+ console.log('🚀 Request complete:', req.complete);
+
+ // If body exists, show first few bytes
+ if (req.body && req.body.length > 0) {
+ const preview = req.body.slice(0, 100).toString('hex');
+ console.log('🚀 Body preview (hex):', preview);
+
+ // Try to see if it contains multipart boundary
+ const bodyStr = req.body.toString('utf8', 0, Math.min(500, req.body.length));
+ console.log('🚀 Body preview (string):', bodyStr.substring(0, 200));
+ }
+
+ console.log('🚀 =========================');
+
+ try {
+ const userId = req.user?.uid;
+ if (!userId) {
+ console.log('❌ Authentication failed - no userId');
+ res.status(401).json({
+ error: 'User not authenticated',
+ correlationId: req.correlationId
+ });
+ return;
+ }
+
+ console.log('✅ Authentication successful for user:', userId);
+
+ // Get raw body buffer for Document AI processing
+ const rawBody = req.body;
+ if (!rawBody || rawBody.length === 0) {
+ res.status(400).json({
+ error: 'No file data received',
+ correlationId: req.correlationId,
+ debug: {
+ method: req.method,
+ contentType: req.get('Content-Type'),
+ contentLength: req.get('Content-Length'),
+ hasRawBody: !!rawBody,
+ rawBodySize: rawBody?.length || 0,
+ bodyType: typeof rawBody
+ }
+ });
+ return;
+ }
+
+ console.log('✅ Found raw body buffer:', rawBody.length, 'bytes');
+
+ // Create document record first
+ const document = await DocumentModel.create({
+ user_id: userId,
+ original_file_name: 'uploaded-document.pdf',
+ file_path: '',
+ file_size: rawBody.length,
+ status: 'processing_llm'
+ });
+
+ console.log('✅ Document record created:', document.id);
+
+ // Process with Document AI directly
+ const { DocumentAiGenkitProcessor } = await import('../services/documentAiGenkitProcessor');
+ const processor = new DocumentAiGenkitProcessor();
+
+ console.log('✅ Starting Document AI processing...');
+ const result = await processor.processDocument(
+ document.id,
+ userId,
+ rawBody,
+ 'uploaded-document.pdf',
+ 'application/pdf'
);
+ if (result.success) {
+ await DocumentModel.updateById(document.id, {
+ status: 'completed',
+ generated_summary: result.content,
+ processing_completed_at: new Date()
+ });
+
+ console.log('✅ Document AI processing completed successfully');
+
+ res.status(201).json({
+ id: document.id,
+ name: 'uploaded-document.pdf',
+ originalName: 'uploaded-document.pdf',
+ status: 'completed',
+ uploadedAt: document.created_at,
+ uploadedBy: userId,
+ fileSize: rawBody.length,
+ summary: result.content,
+ correlationId: req.correlationId || undefined
+ });
+ return;
+ } else {
+ console.log('❌ Document AI processing failed:', result.error);
+ await DocumentModel.updateById(document.id, {
+ status: 'failed',
+ error_message: result.error
+ });
+
+ res.status(500).json({
+ error: 'Document processing failed',
+ message: result.error,
+ correlationId: req.correlationId || undefined
+ });
+ return;
+ }
+
+ } catch (error) {
+ console.log('❌ Upload error:', error);
+
logger.error('Upload document failed', {
error,
correlationId: req.correlationId
});
+
res.status(500).json({
error: 'Upload failed',
+ message: error instanceof Error ? error.message : 'Unknown error',
correlationId: req.correlationId || undefined
});
}
@@ -552,4 +716,4 @@ export const documentController = {
throw new Error('Failed to get document text');
}
}
-};
\ No newline at end of file
+};
\ No newline at end of file
diff --git a/backend/src/index.ts b/backend/src/index.ts
index 410b752..0fdf3a3 100644
--- a/backend/src/index.ts
+++ b/backend/src/index.ts
@@ -18,20 +18,17 @@ import { notFoundHandler } from './middleware/notFoundHandler';
const app = express();
-// Enable trust proxy to ensure Express works correctly behind the proxy
+// Add this middleware to log all incoming requests
+app.use((req, res, next) => {
+ console.log(`Incoming request: ${req.method} ${req.path}`);
+ next();
+});
+
+// Enable trust proxy to ensure Express works correctly behind a proxy
app.set('trust proxy', 1);
// Security middleware
-app.use(helmet({
- contentSecurityPolicy: {
- directives: {
- defaultSrc: ["'self'"],
- styleSrc: ["'self'", "'unsafe-inline'"],
- scriptSrc: ["'self'"],
- imgSrc: ["'self'", "data:", "https:"],
- },
- },
-}));
+app.use(helmet());
// CORS configuration
const allowedOrigins = [
@@ -43,13 +40,10 @@ const allowedOrigins = [
app.use(cors({
origin: function (origin, callback) {
- console.log('🌐 CORS request from origin:', origin);
-
if (!origin || allowedOrigins.indexOf(origin) !== -1) {
- console.log('✅ CORS allowed for origin:', origin);
callback(null, true);
} else {
- console.log('❌ CORS blocked origin:', origin);
+ logger.warn(`CORS blocked for origin: ${origin}`);
callback(new Error('Not allowed by CORS'));
}
},
@@ -62,7 +56,7 @@ app.use(cors({
// Rate limiting
const limiter = rateLimit({
windowMs: 15 * 60 * 1000, // 15 minutes
- max: 1000, // limit each IP to 1000 requests per windowMs (increased for testing)
+ max: 1000,
message: {
error: 'Too many requests from this IP, please try again later.',
},
@@ -72,27 +66,6 @@ const limiter = rateLimit({
app.use(limiter);
-// Body parsing middleware - only for non-multipart requests
-app.use((req, res, next) => {
- if (req.headers['content-type'] && req.headers['content-type'].includes('multipart/form-data')) {
- // Skip body parsing for multipart requests - let multer handle it
- next();
- } else {
- // Parse JSON and URL-encoded bodies for other requests
- express.json({ limit: '10mb' })(req, res, next);
- }
-});
-
-app.use((req, res, next) => {
- if (req.headers['content-type'] && req.headers['content-type'].includes('multipart/form-data')) {
- // Skip body parsing for multipart requests - let multer handle it
- next();
- } else {
- // Parse URL-encoded bodies for other requests
- express.urlencoded({ extended: true, limit: '10mb' })(req, res, next);
- }
-});
-
// Logging middleware
app.use(morgan('combined', {
stream: {
@@ -100,17 +73,12 @@ app.use(morgan('combined', {
},
}));
-// Request debugging middleware
-app.use((req, res, next) => {
- console.log('📥 Incoming request:', req.method, req.url);
- console.log('📥 Request headers:', Object.keys(req.headers));
- console.log('📥 Content-Type:', req.get('Content-Type'));
- console.log('📥 Authorization:', req.get('Authorization') ? 'Present' : 'Missing');
- next();
-});
+// CRITICAL: Add body parsing BEFORE routes
+app.use(express.json({ limit: '10mb' }));
+app.use(express.urlencoded({ extended: true, limit: '10mb' }));
// Health check endpoint
-app.get('/health', (_req, res) => { // _req to fix TS6133
+app.get('/health', (_req, res) => {
res.status(200).json({
status: 'ok',
timestamp: new Date().toISOString(),
@@ -119,53 +87,23 @@ app.get('/health', (_req, res) => { // _req to fix TS6133
});
});
-// Agentic RAG health check endpoints
-app.get('/health/agentic-rag', async (_req, res) => {
- try {
- const { agenticRAGDatabaseService } = await import('./services/agenticRAGDatabaseService');
- const healthStatus = await agenticRAGDatabaseService.getHealthStatus();
- res.json(healthStatus);
- } catch (error) {
- logger.error('Agentic RAG health check failed', { error });
- res.status(500).json({
- error: 'Health check failed',
- status: 'unhealthy',
- timestamp: new Date().toISOString()
- });
- }
-});
-
-app.get('/health/agentic-rag/metrics', async (_req, res) => {
- try {
- const { agenticRAGDatabaseService } = await import('./services/agenticRAGDatabaseService');
- const startDate = new Date(Date.now() - 30 * 24 * 60 * 60 * 1000); // 30 days ago
- const metrics = await agenticRAGDatabaseService.generatePerformanceReport(startDate, new Date());
- res.json(metrics);
- } catch (error) {
- logger.error('Agentic RAG metrics retrieval failed', { error });
- res.status(500).json({ error: 'Metrics retrieval failed' });
- }
-});
-
-// API routes - remove the /api prefix as it's handled by Firebase
+// API Routes
app.use('/documents', documentRoutes);
app.use('/vector', vectorRoutes);
app.use('/monitoring', monitoringRoutes);
import * as functions from 'firebase-functions';
+import { onRequest } from 'firebase-functions/v2/https';
// API root endpoint
-app.get('/', (_req, res) => { // _req to fix TS6133
+app.get('/', (_req, res) => {
res.json({
message: 'CIM Document Processor API',
version: '1.0.0',
endpoints: {
- auth: '/auth',
documents: '/documents',
health: '/health',
- agenticRagHealth: '/health/agentic-rag',
- agenticRagMetrics: '/health/agentic-rag/metrics',
monitoring: '/monitoring',
},
});
@@ -177,26 +115,11 @@ app.use(notFoundHandler);
// Global error handler (must be last)
app.use(errorHandler);
-// Initialize job queue service for document processing
-import { jobQueueService } from './services/jobQueueService';
-
-// Start the job queue service asynchronously to avoid blocking function startup
-// Use a longer delay to ensure the function is fully initialized
-setTimeout(() => {
- try {
- jobQueueService.start();
- logger.info('Job queue service started successfully');
- } catch (error) {
- logger.error('Failed to start job queue service', { error });
- }
-}, 5000);
-
-// Listen on a port when not in a Firebase Function environment or when PORT is explicitly set
-if (!process.env['FUNCTION_TARGET'] || process.env['PORT']) {
- const port = process.env['PORT'] || 5001;
- app.listen(port, () => {
- logger.info(`API server listening on port ${port}`);
- });
-}
-
-export const api = functions.https.onRequest(app);
\ No newline at end of file
+// Configure Firebase Functions v2 for larger uploads
+export const api = onRequest({
+ timeoutSeconds: 540, // 9 minutes
+ memory: '2GiB',
+ cpu: 1,
+ maxInstances: 10,
+ cors: true
+}, app);
\ No newline at end of file
diff --git a/backend/src/middleware/errorHandler.ts b/backend/src/middleware/errorHandler.ts
index bee3298..7c90bf6 100644
--- a/backend/src/middleware/errorHandler.ts
+++ b/backend/src/middleware/errorHandler.ts
@@ -11,6 +11,18 @@ export const errorHandler = (
req: Request,
res: Response
): void => {
+ console.log('💥💥💥 MAXIMUM DEBUG ERROR HANDLER HIT 💥💥💥');
+ console.log('💥 Error name:', err.name);
+ console.log('💥 Error message:', err.message);
+ console.log('💥 Error code:', (err as any).code);
+ console.log('💥 Error type:', typeof err);
+ console.log('💥 Error constructor:', err.constructor.name);
+ console.log('💥 Error stack:', err.stack);
+ console.log('💥 Request URL:', req.url);
+ console.log('💥 Request method:', req.method);
+ console.log('💥 Full error object:', JSON.stringify(err, Object.getOwnPropertyNames(err), 2));
+ console.log('💥💥💥 END ERROR DEBUG 💥💥💥');
+
let error = { ...err };
error.message = err.message;
@@ -53,6 +65,13 @@ export const errorHandler = (
error = { message, statusCode: 401 } as AppError;
}
+ // Multer errors (check if multer is imported anywhere)
+ if (err.name === 'MulterError' || (err as any).code === 'UNEXPECTED_END_OF_FORM') {
+ console.log('🚨 MULTER ERROR CAUGHT:', err.message);
+ const message = `File upload failed: ${err.message}`;
+ error = { message, statusCode: 400 } as AppError;
+ }
+
// Default error
const statusCode = error.statusCode || 500;
const message = error.message || 'Server Error';
diff --git a/backend/src/middleware/upload.ts b/backend/src/middleware/upload.ts
index ff5144d..1c54a73 100644
--- a/backend/src/middleware/upload.ts
+++ b/backend/src/middleware/upload.ts
@@ -13,9 +13,15 @@ if (!fs.existsSync(uploadDir)) {
// File filter function
const fileFilter = (req: Request, file: any, cb: multer.FileFilterCallback) => {
- console.log('🔍 File filter called for:', file.originalname);
+ console.log('🔍 ===== FILE FILTER CALLED =====');
+ console.log('🔍 File originalname:', file.originalname);
console.log('🔍 File mimetype:', file.mimetype);
console.log('🔍 File size:', file.size);
+ console.log('🔍 File encoding:', file.encoding);
+ console.log('🔍 File fieldname:', file.fieldname);
+ console.log('🔍 Request Content-Type:', req.get('Content-Type'));
+ console.log('🔍 Request Content-Length:', req.get('Content-Length'));
+ console.log('🔍 ===========================');
// Check file type - allow PDF and text files for testing
const allowedTypes = ['application/pdf', 'text/plain', 'text/html'];
@@ -68,6 +74,14 @@ const upload = multer({
// Error handling middleware for multer
export const handleUploadError = (error: any, req: Request, res: Response, next: NextFunction): void => {
+ console.log('🚨 =============================');
+ console.log('🚨 UPLOAD ERROR HANDLER CALLED');
+ console.log('🚨 Error type:', error?.constructor?.name);
+ console.log('🚨 Error message:', error?.message);
+ console.log('🚨 Error code:', error?.code);
+ console.log('🚨 Is MulterError:', error instanceof multer.MulterError);
+ console.log('🚨 =============================');
+
if (error instanceof multer.MulterError) {
logger.error('Multer error during file upload:', {
error: error.message,
@@ -129,12 +143,14 @@ export const handleUploadError = (error: any, req: Request, res: Response, next:
// Main upload middleware with timeout handling
export const uploadMiddleware = (req: Request, res: Response, next: NextFunction) => {
- console.log('📤 Upload middleware called');
+ console.log('📤 =============================');
+ console.log('📤 UPLOAD MIDDLEWARE CALLED');
console.log('📤 Request method:', req.method);
console.log('📤 Request URL:', req.url);
console.log('📤 Content-Type:', req.get('Content-Type'));
console.log('📤 Content-Length:', req.get('Content-Length'));
console.log('📤 User-Agent:', req.get('User-Agent'));
+ console.log('📤 =============================');
// Set a timeout for the upload
const uploadTimeout = setTimeout(() => {
@@ -155,12 +171,25 @@ export const uploadMiddleware = (req: Request, res: Response, next: NextFunction
clearTimeout(uploadTimeout);
if (err) {
console.log('❌ Upload middleware error:', err);
+ console.log('❌ Error details:', {
+ name: err.name,
+ message: err.message,
+ code: err.code,
+ stack: err.stack?.split('\n')[0]
+ });
} else {
console.log('✅ Upload middleware completed successfully');
+ console.log('✅ File after multer processing:', {
+ hasFile: !!req.file,
+ filename: req.file?.originalname,
+ size: req.file?.size,
+ mimetype: req.file?.mimetype
+ });
}
originalNext(err);
};
+ console.log('🔄 Calling multer.single("document")...');
upload.single('document')(req, res, next);
};
diff --git a/backend/src/models/VectorDatabaseModel.ts b/backend/src/models/VectorDatabaseModel.ts
index 573e58b..946bd27 100644
--- a/backend/src/models/VectorDatabaseModel.ts
+++ b/backend/src/models/VectorDatabaseModel.ts
@@ -1,6 +1,6 @@
import { v4 as uuidv4 } from 'uuid';
import { logger } from '../utils/logger';
-import pool from '../config/database';
+import { getSupabaseServiceClient } from '../config/supabase';
export interface DocumentChunk {
id: string;
@@ -15,577 +15,21 @@ export interface DocumentChunk {
updatedAt: Date;
}
-export interface VectorSearchResult {
- documentId: string;
- similarityScore: number;
- chunkContent: string;
- metadata: Record<string, any>;
-}
-
-export interface DocumentSimilarity {
- id: string;
- sourceDocumentId: string;
- targetDocumentId: string;
- similarityScore: number;
- similarityType: string;
- metadata: Record<string, any>;
- createdAt: Date;
-}
-
-export interface IndustryEmbedding {
- id: string;
- industryName: string;
- industryDescription?: string;
- embedding: number[];
- documentCount: number;
- averageSimilarity?: number;
- createdAt: Date;
- updatedAt: Date;
-}
-
export class VectorDatabaseModel {
- /**
- * Store document chunks with embeddings
- */
static async storeDocumentChunks(chunks: Omit<DocumentChunk, 'id' | 'createdAt' | 'updatedAt'>[]): Promise<void> {
- const client = await pool.connect();
-
- try {
- await client.query('BEGIN');
-
- for (const chunk of chunks) {
- // Ensure embedding is properly formatted for pgvector
- const embeddingArray = Array.isArray(chunk.embedding) ? chunk.embedding : [];
-
- // Validate embedding dimensions (should be 1536 for text-embedding-3-small)
- if (embeddingArray.length !== 1536) {
- logger.warn(`Embedding dimension mismatch: expected 1536, got ${embeddingArray.length}`);
- // Pad or truncate to 1536 dimensions if necessary
- const paddedEmbedding = new Array(1536).fill(0);
- embeddingArray.forEach((val, index) => {
- if (index < 1536) paddedEmbedding[index] = val;
- });
- }
-
- // Format embedding properly for pgvector - must be a JSON array string
- const embeddingString = JSON.stringify(embeddingArray);
-
- await client.query(`
- INSERT INTO document_chunks (
- id, document_id, content, metadata, embedding,
- chunk_index, section, page_number
- ) VALUES ($1, $2, $3, $4, $5::vector, $6, $7, $8)
- ON CONFLICT (id) DO UPDATE SET
- content = EXCLUDED.content,
- metadata = EXCLUDED.metadata,
- embedding = EXCLUDED.embedding,
- section = EXCLUDED.section,
- page_number = EXCLUDED.page_number,
- updated_at = CURRENT_TIMESTAMP
- `, [
- uuidv4(),
- chunk.documentId,
- chunk.content,
- JSON.stringify(chunk.metadata),
- embeddingString, // Pass as JSON string for pgvector
- chunk.chunkIndex,
- chunk.section,
- chunk.pageNumber
- ]);
- }
-
- await client.query('COMMIT');
- logger.info(`Stored ${chunks.length} document chunks in vector database`);
- } catch (error) {
- await client.query('ROLLBACK');
+ const supabase = getSupabaseServiceClient();
+ const { data, error } = await supabase
+ .from('document_chunks')
+ .insert(chunks.map(chunk => ({
+ ...chunk,
+ embedding: `[${chunk.embedding.join(',')}]` // Format for pgvector
+ })));
+
+ if (error) {
logger.error('Failed to store document chunks', error);
throw error;
- } finally {
- client.release();
}
+
+ logger.info(`Stored ${chunks.length} document chunks in vector database`);
}
-
- /**
- * Search for similar content using vector similarity
- */
- static async searchSimilarContent(
- queryEmbedding: number[],
- options: {
- documentId?: string;
- limit?: number;
- similarityThreshold?: number;
- filters?: Record<string, any>;
- } = {}
- ): Promise<VectorSearchResult[]> {
- const {
- documentId,
- limit = 10,
- similarityThreshold = 0.7,
- filters = {}
- } = options;
-
- // Ensure embedding is properly formatted
- const embeddingArray = Array.isArray(queryEmbedding) ? queryEmbedding : [];
-
- // Validate embedding dimensions
- if (embeddingArray.length !== 1536) {
- logger.warn(`Query embedding dimension mismatch: expected 1536, got ${embeddingArray.length}`);
- // Pad or truncate to 1536 dimensions if necessary
- const paddedEmbedding = new Array(1536).fill(0);
- embeddingArray.forEach((val, index) => {
- if (index < 1536) paddedEmbedding[index] = val;
- });
- }
-
- let query = `
- SELECT
- dc.document_id,
- 1 - (dc.embedding <=> $1::vector) as similarity_score,
- dc.content as chunk_content,
- dc.metadata
- FROM document_chunks dc
- WHERE dc.embedding IS NOT NULL
- `;
-
- const params: any[] = [embeddingArray];
- let paramIndex = 2;
-
- if (documentId) {
- query += ` AND dc.document_id = $${paramIndex}`;
- params.push(documentId);
- paramIndex++;
- }
-
- // Add metadata filters
- Object.entries(filters).forEach(([key, value]) => {
- query += ` AND dc.metadata->>'${key}' = $${paramIndex}`;
- params.push(value);
- paramIndex++;
- });
-
- query += `
- AND 1 - (dc.embedding <=> $1::vector) >= $${paramIndex}
- ORDER BY dc.embedding <=> $1::vector
- LIMIT $${paramIndex + 1}
- `;
- params.push(similarityThreshold, limit);
-
- try {
- const result = await pool.query(query, params);
-
- return result.rows.map((row: any) => ({
- documentId: row.document_id,
- similarityScore: parseFloat(row.similarity_score),
- chunkContent: row.chunk_content,
- metadata: row.metadata
- }));
- } catch (error) {
- logger.error('Vector search failed', error);
- throw error;
- }
- }
-
- /**
- * Get document chunks by document ID
- */
- static async getDocumentChunks(documentId: string): Promise<DocumentChunk[]> {
- try {
- const result = await pool.query(`
- SELECT
- id,
- document_id,
- content,
- metadata,
- embedding,
- chunk_index,
- section,
- page_number,
- created_at,
- updated_at
- FROM document_chunks
- WHERE document_id = $1
- ORDER BY chunk_index
- `, [documentId]);
-
- return result.rows.map((row: any) => ({
- id: row.id,
- documentId: row.document_id,
- content: row.content,
- metadata: row.metadata || {},
- embedding: row.embedding || [],
- chunkIndex: row.chunk_index,
- section: row.section,
- pageNumber: row.page_number,
- createdAt: row.created_at,
- updatedAt: row.updated_at
- }));
- } catch (error) {
- logger.error('Failed to get document chunks', error);
- throw error;
- }
- }
-
- /**
- * Find similar documents
- */
- static async findSimilarDocuments(
- documentId: string,
- limit: number = 10,
- similarityThreshold: number = 0.6
- ): Promise<DocumentSimilarity[]> {
- try {
- // Get document chunks
- const documentChunks = await this.getDocumentChunks(documentId);
- if (documentChunks.length === 0) return [];
-
- // Use the first chunk as reference
- const referenceChunk = documentChunks[0];
- if (!referenceChunk || !referenceChunk.embedding) return [];
-
- const result = await pool.query(`
- SELECT
- id,
- source_document_id,
- target_document_id,
- similarity_score,
- similarity_type,
- metadata,
- created_at
- FROM document_similarities
- WHERE source_document_id = $1
- AND similarity_score >= $2
- ORDER BY similarity_score DESC
- LIMIT $3
- `, [documentId, similarityThreshold, limit]);
-
- return result.rows.map((row: any) => ({
- id: row.id,
- sourceDocumentId: row.source_document_id,
- targetDocumentId: row.target_document_id,
- similarityScore: parseFloat(row.similarity_score),
- similarityType: row.similarity_type,
- metadata: row.metadata || {},
- createdAt: row.created_at
- }));
- } catch (error) {
- logger.error('Failed to find similar documents', error);
- throw error;
- }
- }
-
- /**
- * Update document similarities
- */
- static async updateDocumentSimilarities(): Promise<void> {
- try {
- await pool.query(`
- SELECT update_document_similarities();
- `);
- logger.info('Document similarities updated');
- } catch (error) {
- logger.error('Failed to update document similarities', error);
- throw error;
- }
- }
-
- /**
- * Store industry embedding
- */
- static async storeIndustryEmbedding(industry: Omit<IndustryEmbedding, 'id' | 'createdAt' | 'updatedAt'>): Promise<void> {
- try {
- // Ensure embedding is properly formatted
- const embeddingArray = Array.isArray(industry.embedding) ? industry.embedding : [];
-
- // Validate embedding dimensions
- if (embeddingArray.length !== 1536) {
- logger.warn(`Industry embedding dimension mismatch: expected 1536, got ${embeddingArray.length}`);
- // Pad or truncate to 1536 dimensions if necessary
- const paddedEmbedding = new Array(1536).fill(0);
- embeddingArray.forEach((val, index) => {
- if (index < 1536) paddedEmbedding[index] = val;
- });
- }
-
- await pool.query(`
- INSERT INTO industry_embeddings (
- id, industry_name, industry_description, embedding,
- document_count, average_similarity
- ) VALUES ($1, $2, $3, $4::vector, $5, $6)
- ON CONFLICT (industry_name) DO UPDATE SET
- industry_description = EXCLUDED.industry_description,
- embedding = EXCLUDED.embedding,
- document_count = EXCLUDED.document_count,
- average_similarity = EXCLUDED.average_similarity,
- updated_at = CURRENT_TIMESTAMP
- `, [
- uuidv4(),
- industry.industryName,
- industry.industryDescription,
- embeddingArray,
- industry.documentCount,
- industry.averageSimilarity
- ]);
-
- logger.info(`Stored industry embedding for: ${industry.industryName}`);
- } catch (error) {
- logger.error('Failed to store industry embedding', error);
- throw error;
- }
- }
-
- /**
- * Search by industry
- */
- static async searchByIndustry(
- industryName: string,
- queryEmbedding: number[],
- limit: number = 20
- ): Promise<VectorSearchResult[]> {
- try {
- // Ensure embedding is properly formatted
- const embeddingArray = Array.isArray(queryEmbedding) ? queryEmbedding : [];
-
- // Validate embedding dimensions
- if (embeddingArray.length !== 1536) {
- logger.warn(`Industry search embedding dimension mismatch: expected 1536, got ${embeddingArray.length}`);
- // Pad or truncate to 1536 dimensions if necessary
- const paddedEmbedding = new Array(1536).fill(0);
- embeddingArray.forEach((val, index) => {
- if (index < 1536) paddedEmbedding[index] = val;
- });
- }
-
- const result = await pool.query(`
- SELECT
- dc.document_id,
- 1 - (dc.embedding <=> $1::vector) as similarity_score,
- dc.content as chunk_content,
- dc.metadata
- FROM document_chunks dc
- WHERE dc.embedding IS NOT NULL
- AND dc.metadata->>'industry' = $2
- ORDER BY dc.embedding <=> $1::vector
- LIMIT $3
- `, [embeddingArray, industryName.toLowerCase(), limit]);
-
- return result.rows.map((row: any) => ({
- documentId: row.document_id,
- similarityScore: parseFloat(row.similarity_score),
- chunkContent: row.chunk_content,
- metadata: row.metadata || {}
- }));
- } catch (error) {
- logger.error('Failed to search by industry', error);
- throw error;
- }
- }
-
- /**
- * Track search query for analytics
- */
- static async trackSearchQuery(
- userId: string,
- queryText: string,
- queryEmbedding: number[],
- searchResults: VectorSearchResult[],
- options: {
- filters?: Record<string, any>;
- limitCount?: number;
- similarityThreshold?: number;
- processingTimeMs?: number;
- } = {}
- ): Promise<void> {
- try {
- // Ensure embedding is properly formatted
- const embeddingArray = Array.isArray(queryEmbedding) ? queryEmbedding : [];
-
- // Validate embedding dimensions
- if (embeddingArray.length !== 1536) {
- logger.warn(`Search tracking embedding dimension mismatch: expected 1536, got ${embeddingArray.length}`);
- // Pad or truncate to 1536 dimensions if necessary
- const paddedEmbedding = new Array(1536).fill(0);
- embeddingArray.forEach((val, index) => {
- if (index < 1536) paddedEmbedding[index] = val;
- });
- }
-
- await pool.query(`
- INSERT INTO vector_similarity_searches (
- id, user_id, query_text, query_embedding, search_results,
- filters, limit_count, similarity_threshold, processing_time_ms
- ) VALUES ($1, $2, $3, $4::vector, $5, $6, $7, $8, $9)
- `, [
- uuidv4(),
- userId,
- queryText,
- embeddingArray,
- JSON.stringify(searchResults),
- JSON.stringify(options.filters || {}),
- options.limitCount || 10,
- options.similarityThreshold || 0.7,
- options.processingTimeMs || 0
- ]);
-
- logger.debug('Search query tracked for analytics');
- } catch (error) {
- logger.error('Failed to track search query', error);
- // Don't throw - analytics failure shouldn't break search
- }
- }
-
- /**
- * Get search analytics
- */
- static async getSearchAnalytics(userId: string, days: number = 30): Promise<any[]> {
- try {
- const result = await pool.query(`
- SELECT
- query_text,
- COUNT(*) as search_count,
- AVG(processing_time_ms) as avg_processing_time,
- AVG(similarity_threshold) as avg_similarity_threshold,
- MAX(created_at) as last_search
- FROM vector_similarity_searches
- WHERE user_id = $1
- AND created_at >= NOW() - INTERVAL '${days} days'
- GROUP BY query_text
- ORDER BY search_count DESC
- LIMIT 20
- `, [userId]);
-
- return result.rows;
- } catch (error) {
- logger.error('Failed to get search analytics', error);
- throw error;
- }
- }
-
- /**
- * Delete document chunks
- */
- static async deleteDocumentChunks(documentId: string): Promise<void> {
- try {
- await pool.query(`
- DELETE FROM document_chunks
- WHERE document_id = $1
- `, [documentId]);
-
- logger.info(`Deleted chunks for document: ${documentId}`);
- } catch (error) {
- logger.error('Failed to delete document chunks', error);
- throw error;
- }
- }
-
- /**
- * Get vector database statistics
- */
- static async getVectorDatabaseStats(): Promise<{
- totalChunks: number;
- totalDocuments: number;
- totalSearches: number;
- averageSimilarity: number;
- }> {
- try {
- const [chunksResult, documentsResult, searchesResult, similarityResult] = await Promise.all([
- pool.query('SELECT COUNT(*) as count FROM document_chunks'),
- pool.query('SELECT COUNT(DISTINCT document_id) as count FROM document_chunks'),
- pool.query('SELECT COUNT(*) as count FROM vector_similarity_searches'),
- pool.query(`
- SELECT AVG(similarity_score) as avg_similarity
- FROM document_similarities
- WHERE similarity_score > 0
- `)
- ]);
-
- return {
- totalChunks: parseInt(chunksResult.rows[0]?.count || '0'),
- totalDocuments: parseInt(documentsResult.rows[0]?.count || '0'),
- totalSearches: parseInt(searchesResult.rows[0]?.count || '0'),
- averageSimilarity: parseFloat(similarityResult.rows[0]?.avg_similarity || '0')
- };
- } catch (error) {
- logger.error('Failed to get vector database stats', error);
- throw error;
- }
- }
-
- /**
- * Get all chunks (for testing/debugging)
- */
- static async getAllChunks(): Promise<DocumentChunk[]> {
- try {
- const result = await pool.query(`
- SELECT
- id,
- document_id,
- content,
- metadata,
- embedding,
- chunk_index,
- section,
- page_number,
- created_at,
- updated_at
- FROM document_chunks
- ORDER BY document_id, chunk_index
- LIMIT 1000
- `);
-
- return result.rows.map((row: any) => ({
- id: row.id,
- documentId: row.document_id,
- content: row.content,
- metadata: row.metadata || {},
- embedding: row.embedding || [],
- chunkIndex: row.chunk_index,
- section: row.section,
- pageNumber: row.page_number,
- createdAt: row.created_at,
- updatedAt: row.updated_at
- }));
- } catch (error) {
- logger.error('Failed to get all chunks', error);
- throw error;
- }
- }
-
- /**
- * Get total chunk count
- */
- static async getTotalChunkCount(): Promise<number> {
- try {
- const result = await pool.query('SELECT COUNT(*) as count FROM document_chunks');
- return parseInt(result.rows[0]?.count || '0');
- } catch (error) {
- logger.error('Failed to get total chunk count', error);
- throw error;
- }
- }
-
- /**
- * Get total document count
- */
- static async getTotalDocumentCount(): Promise<number> {
- try {
- const result = await pool.query('SELECT COUNT(DISTINCT document_id) as count FROM document_chunks');
- return parseInt(result.rows[0]?.count || '0');
- } catch (error) {
- logger.error('Failed to get total document count', error);
- throw error;
- }
- }
-
- /**
- * Get average chunk size
- */
- static async getAverageChunkSize(): Promise<number> {
- try {
- const result = await pool.query('SELECT AVG(LENGTH(content)) as avg_size FROM document_chunks');
- return Math.round(parseFloat(result.rows[0]?.avg_size || '0'));
- } catch (error) {
- logger.error('Failed to get average chunk size', error);
- throw error;
- }
- }
-}
\ No newline at end of file
+}
\ No newline at end of file
diff --git a/backend/src/models/types.ts b/backend/src/models/types.ts
index c39b570..891d135 100644
--- a/backend/src/models/types.ts
+++ b/backend/src/models/types.ts
@@ -63,6 +63,7 @@ export interface ProcessingJob {
}
export type ProcessingStatus =
+ | 'uploading'
| 'uploaded'
| 'extracting_text'
| 'processing_llm'
diff --git a/backend/src/routes/documents.ts b/backend/src/routes/documents.ts
index e53e406..f660c68 100644
--- a/backend/src/routes/documents.ts
+++ b/backend/src/routes/documents.ts
@@ -23,9 +23,13 @@ const router = express.Router();
router.use(verifyFirebaseToken);
router.use(addCorrelationId);
-// Essential document management routes (keeping these)
+// NEW Firebase Storage direct upload routes
+router.post('/upload-url', documentController.getUploadUrl);
+router.post('/:id/confirm-upload', validateUUID('id'), documentController.confirmUpload);
+
+// LEGACY multipart upload routes (keeping for backward compatibility)
router.post('/upload', handleFileUpload, documentController.uploadDocument);
-router.post('/', handleFileUpload, documentController.uploadDocument); // Add direct POST to /documents for frontend compatibility
+router.post('/', handleFileUpload, documentController.uploadDocument);
router.get('/', documentController.getDocuments);
// Analytics endpoints (MUST come before /:id routes to avoid conflicts)
diff --git a/backend/src/services/fileStorageService.ts b/backend/src/services/fileStorageService.ts
index 6fec134..69a457f 100644
--- a/backend/src/services/fileStorageService.ts
+++ b/backend/src/services/fileStorageService.ts
@@ -483,6 +483,37 @@ class FileStorageService {
}
}
+ /**
+ * Generate signed upload URL for direct client uploads
+ */
+ async generateSignedUploadUrl(filePath: string, contentType: string, expirationMinutes: number = 60): Promise<string> {
+ try {
+ const bucket = this.storage.bucket(this.bucketName);
+ const file = bucket.file(filePath);
+
+ // Generate signed upload URL with retry logic
+ const [signedUrl] = await this.retryOperation(
+ async () => file.getSignedUrl({
+ version: 'v4',
+ action: 'write',
+ expires: Date.now() + (expirationMinutes * 60 * 1000),
+ contentType: contentType,
+ }),
+ 'generate signed upload URL from GCS'
+ );
+
+ logger.info(`Generated signed upload URL for file: ${filePath}`, {
+ contentType,
+ expirationMinutes,
+ });
+
+ return signedUrl;
+ } catch (error) {
+ logger.error(`Error generating signed upload URL for file: ${filePath}`, error);
+ throw new Error(`Failed to generate upload URL: ${error instanceof Error ? error.message : 'Unknown error'}`);
+ }
+ }
+
/**
* Copy file within Google Cloud Storage
*/
diff --git a/cors.json b/cors.json
new file mode 100644
index 0000000..ace0f26
--- /dev/null
+++ b/cors.json
@@ -0,0 +1,23 @@
+[
+ {
+ "origin": [
+ "https://cim-summarizer.web.app",
+ "https://cim-summarizer.firebaseapp.com",
+ "http://localhost:3000",
+ "http://localhost:5173"
+ ],
+ "method": [
+ "GET",
+ "POST",
+ "PUT",
+ "DELETE",
+ "OPTIONS"
+ ],
+ "responseHeader": [
+ "Content-Type",
+ "Authorization",
+ "X-Requested-With"
+ ],
+ "maxAgeSeconds": 3600
+ }
+]
diff --git a/firebase.json b/firebase.json
new file mode 100644
index 0000000..e0078db
--- /dev/null
+++ b/firebase.json
@@ -0,0 +1,6 @@
+{
+ "storage": {
+ "rules": "storage.rules",
+ "cors": "storage.cors.json"
+ }
+}
\ No newline at end of file
diff --git a/frontend/firebase.json b/frontend/firebase.json
index d7f1b9d..fb89463 100644
--- a/frontend/firebase.json
+++ b/frontend/firebase.json
@@ -63,6 +63,10 @@
}
],
"rewrites": [
+ {
+ "source": "/api/**",
+ "function": "api"
+ },
{
"source": "**",
"destination": "/index.html"
diff --git a/frontend/src/App.tsx b/frontend/src/App.tsx
index a61efcf..cfd907f 100644
--- a/frontend/src/App.tsx
+++ b/frontend/src/App.tsx
@@ -387,19 +387,6 @@ const Dashboard: React.FC = () => {
Welcome, {user?.name || user?.email}
- {/* Debug buttons - show in production for troubleshooting */}
-
- Debug Auth
-
-
- Test API
-
diff --git a/frontend/src/components/DocumentUpload.tsx b/frontend/src/components/DocumentUpload.tsx
index 05e948a..897fbc7 100644
--- a/frontend/src/components/DocumentUpload.tsx
+++ b/frontend/src/components/DocumentUpload.tsx
@@ -14,10 +14,10 @@ interface UploadedFile {
progress: number;
error?: string;
documentId?: string; // Real document ID from backend
- // GCS-specific fields
- gcsError?: boolean;
- storageType?: 'gcs' | 'local';
- gcsUrl?: string;
+ // Firebase Storage specific fields
+ storageError?: boolean;
+ storageType?: 'firebase' | 'local';
+ storageUrl?: string;
}
interface DocumentUploadProps {
@@ -92,17 +92,15 @@ const DocumentUpload: React.FC = ({
try {
// Upload the document with optimized agentic RAG processing (no strategy selection needed)
- const document = await documentService.uploadDocument(
- file,
+ const result = await documentService.uploadDocument(
+ file,
(progress) => {
setUploadedFiles(prev =>
prev.map(f =>
- f.id === uploadedFile.id
- ? { ...f, progress }
- : f
+ f.id === uploadedFile.id ? { ...f, progress } : f
)
);
- },
+ },
abortController.signal
);
@@ -141,13 +139,13 @@ const DocumentUpload: React.FC = ({
} else {
console.error('Upload failed:', error);
- // Handle GCS-specific errors
+ // Handle storage-specific errors
let errorMessage = 'Upload failed';
- let isGCSError = false;
+ let isStorageError = false;
if (GCSErrorHandler.isGCSError(error)) {
errorMessage = GCSErrorHandler.getErrorMessage(error as GCSError);
- isGCSError = true;
+ isStorageError = true;
} else if (error instanceof Error) {
errorMessage = error.message;
}
@@ -159,8 +157,8 @@ const DocumentUpload: React.FC = ({
...f,
status: 'error',
error: errorMessage,
- // Add GCS error indicator
- ...(isGCSError && { gcsError: true })
+ // Add storage error indicator
+ ...(isStorageError && { storageError: true })
}
: f
)
@@ -297,19 +295,19 @@ const DocumentUpload: React.FC = ({
}
};
- const getStatusText = (status: UploadedFile['status'], error?: string, gcsError?: boolean) => {
+ const getStatusText = (status: UploadedFile['status'], error?: string, storageError?: boolean) => {
switch (status) {
case 'uploading':
- return 'Uploading to Google Cloud Storage...';
+ return 'Uploading to Firebase Storage...';
case 'uploaded':
- return 'Uploaded to GCS ✓';
+ return 'Uploaded to Firebase Storage ✓';
case 'processing':
- return 'Processing with Optimized Agentic RAG...';
+ return 'Processing with Document AI + Optimized Agentic RAG...';
case 'completed':
- return 'Completed ✓';
+ return 'Completed ✓ (PDF automatically deleted)';
case 'error':
if (error === 'Upload cancelled') return 'Cancelled';
- if (gcsError) return 'GCS Error';
+ if (storageError) return 'Firebase Storage Error';
return 'Error';
default:
return '';
@@ -323,10 +321,10 @@ const DocumentUpload: React.FC = ({
-
Optimized Agentic RAG Processing
+
Document AI + Optimized Agentic RAG Processing
- All documents are automatically processed using our advanced optimized agentic RAG system,
- which includes intelligent chunking, vectorization, and multi-agent analysis for the best results.
+ All documents are automatically processed using Google Document AI for extraction and our advanced optimized agentic RAG system for analysis,
+ including intelligent chunking, vectorization, and multi-agent CIM review. PDFs are automatically deleted after processing.
@@ -351,7 +349,7 @@ const DocumentUpload: React.FC = ({
Drag and drop PDF files here, or click to browse
- Maximum file size: 50MB • Supported format: PDF • Stored securely in Google Cloud Storage • Automatic Optimized Agentic RAG Processing
+ Maximum file size: 50MB • Supported format: PDF • Stored securely in Firebase Storage • Automatic Document AI + Optimized Agentic RAG Processing • PDFs deleted after processing
@@ -379,8 +377,8 @@ const DocumentUpload: React.FC = ({
Upload Complete
- Files have been uploaded successfully to Google Cloud Storage! You can now navigate away from this page.
- Processing will continue in the background using Optimized Agentic RAG and you can check the status in the Documents tab.
+ Files have been uploaded successfully to Firebase Storage! You can now navigate away from this page.
+ Processing will continue in the background using Document AI + Optimized Agentic RAG. PDFs will be automatically deleted after processing to save costs.
@@ -426,10 +424,10 @@ const DocumentUpload: React.FC = ({
{getStatusIcon(file.status)}
- {getStatusText(file.status, file.error, file.gcsError)}
+ {getStatusText(file.status, file.error, file.storageError)}
- {/* GCS indicator */}
- {file.storageType === 'gcs' && (
+ {/* Firebase Storage indicator */}
+ {file.storageType === 'firebase' && (
)}
@@ -452,4 +450,4 @@ const DocumentUpload: React.FC = ({
);
};
-export default DocumentUpload;
\ No newline at end of file
+export default DocumentUpload;
\ No newline at end of file
diff --git a/frontend/src/services/documentService.ts b/frontend/src/services/documentService.ts
index dc2d521..c73a35b 100644
--- a/frontend/src/services/documentService.ts
+++ b/frontend/src/services/documentService.ts
@@ -60,7 +60,7 @@ export interface Document {
file_path: string;
file_size: number;
uploaded_at: string;
- status: 'uploaded' | 'extracting_text' | 'processing_llm' | 'generating_pdf' | 'completed' | 'failed';
+ status: 'uploading' | 'uploaded' | 'extracting_text' | 'processing_llm' | 'generating_pdf' | 'completed' | 'failed';
extracted_text?: string;
generated_summary?: string;
summary_markdown_path?: string;
@@ -219,7 +219,7 @@ export class GCSErrorHandler {
class DocumentService {
/**
- * Upload a document for processing
+ * Upload a document using Firebase Storage direct upload (new method)
*/
async uploadDocument(
file: File,
@@ -233,7 +233,137 @@ class DocumentService {
throw new Error('Authentication required. Please log in to upload documents.');
}
- console.log('📤 Starting document upload...');
+ console.log('📤 Starting Firebase Storage direct upload...');
+ console.log('📤 File:', file.name, 'Size:', file.size, 'Type:', file.type);
+ console.log('📤 Token available:', !!token);
+
+ // Step 1: Get signed upload URL
+ onProgress?.(5); // 5% - Getting upload URL
+
+ console.log('🌐 Making request to upload-url endpoint');
+ console.log('🌐 Base URL:', API_BASE_URL);
+ console.log('🌐 Full URL would be:', `${API_BASE_URL}/documents/upload-url`);
+ console.log('🌐 Request payload:', { fileName: file.name, fileSize: file.size, contentType: file.type });
+
+ const uploadUrlResponse = await apiClient.post('/documents/upload-url', {
+ fileName: file.name,
+ fileSize: file.size,
+ contentType: file.type
+ }, { signal });
+
+ const { documentId, uploadUrl } = uploadUrlResponse.data;
+ console.log('✅ Got signed upload URL for document:', documentId);
+
+ // Step 2: Upload directly to Firebase Storage
+ onProgress?.(10); // 10% - Starting direct upload
+
+ await this.uploadToFirebaseStorage(file, uploadUrl, onProgress, signal);
+ console.log('✅ File uploaded to Firebase Storage');
+
+ // Step 3: Confirm upload and trigger processing
+ onProgress?.(95); // 95% - Confirming upload
+
+ const confirmResponse = await apiClient.post(`/documents/${documentId}/confirm-upload`, {}, { signal });
+
+ onProgress?.(100); // 100% - Complete
+ console.log('✅ Upload confirmed and processing started');
+
+ return {
+ id: documentId,
+ ...confirmResponse.data
+ };
+
+ } catch (error: any) {
+ console.error('❌ Firebase Storage upload failed:', error);
+
+ // Handle specific error cases
+ if (error.name === 'AbortError') {
+ throw new Error('Upload was cancelled.');
+ }
+
+ if (error.response?.status === 401) {
+ throw new Error('Authentication required. Please log in again.');
+ }
+
+ if (error.response?.status === 400) {
+ throw new Error(error.response?.data?.error || 'Invalid request');
+ }
+
+ if (error.response?.status >= 500) {
+ throw new Error('Server error. Please try again later.');
+ }
+
+ // Generic error fallback
+ throw new Error(error.response?.data?.error || error.message || 'Upload failed');
+ }
+ }
+
+ /**
+ * Upload file directly to Firebase Storage using signed URL
+ */
+ private async uploadToFirebaseStorage(
+ file: File,
+ uploadUrl: string,
+ onProgress?: (progress: number) => void,
+ signal?: AbortSignal
+ ): Promise<void> {
+ return new Promise((resolve, reject) => {
+ const xhr = new XMLHttpRequest();
+
+ // Handle upload progress
+ xhr.upload.addEventListener('progress', (event) => {
+ if (event.lengthComputable && onProgress) {
+ // Map Firebase Storage upload to 10%-90% of overall progress
+ const uploadProgress = Math.round((event.loaded / event.total) * 80) + 10;
+ onProgress(uploadProgress);
+ }
+ });
+
+ // Handle completion
+ xhr.addEventListener('load', () => {
+ if (xhr.status >= 200 && xhr.status < 300) {
+ resolve();
+ } else {
+ reject(new Error(`Firebase Storage upload failed: ${xhr.status} ${xhr.statusText}`));
+ }
+ });
+
+ // Handle errors
+ xhr.addEventListener('error', () => {
+ reject(new Error('Firebase Storage upload failed: Network error'));
+ });
+
+ // Handle abort
+ if (signal) {
+ signal.addEventListener('abort', () => {
+ xhr.abort();
+ reject(new Error('Upload was cancelled'));
+ });
+ }
+
+ // Start upload
+ xhr.open('PUT', uploadUrl);
+ xhr.setRequestHeader('Content-Type', file.type);
+ xhr.send(file);
+ });
+ }
+
+ /**
+ * Legacy multipart upload method (kept for compatibility)
+ */
+ async uploadDocumentLegacy(
+ file: File,
+ onProgress?: (progress: number) => void,
+ signal?: AbortSignal
+ ): Promise<Document> {
+ try {
+ // Check authentication before upload
+ const token = await authService.getToken();
+ if (!token) {
+ throw new Error('Authentication required. Please log in to upload documents.');
+ }
+
+ console.log('📤 Starting legacy multipart upload...');
console.log('📤 File:', file.name, 'Size:', file.size, 'Type:', file.type);
console.log('📤 Token available:', !!token);
@@ -243,7 +373,7 @@ class DocumentService {
// Always use optimized agentic RAG processing - no strategy selection needed
formData.append('processingStrategy', 'optimized_agentic_rag');
- const response = await apiClient.post('/documents', formData, {
+ const response = await apiClient.post('/documents/upload', formData, {
headers: {
'Content-Type': 'multipart/form-data',
},
@@ -256,10 +386,10 @@ class DocumentService {
},
});
- console.log('✅ Document upload successful:', response.data);
+ console.log('✅ Legacy document upload successful:', response.data);
return response.data;
} catch (error: any) {
- console.error('❌ Document upload failed:', error);
+ console.error('❌ Legacy document upload failed:', error);
// Provide more specific error messages
if (error.response?.status === 401) {
diff --git a/storage.cors.json b/storage.cors.json
new file mode 100644
index 0000000..3e9d728
--- /dev/null
+++ b/storage.cors.json
@@ -0,0 +1,23 @@
+[
+ {
+ "origin": [
+ "https://cim-summarizer.web.app",
+ "https://cim-summarizer.firebaseapp.com",
+ "http://localhost:3000",
+ "http://localhost:5173"
+ ],
+ "method": [
+ "GET",
+ "POST",
+ "PUT",
+ "DELETE",
+ "OPTIONS"
+ ],
+ "responseHeader": [
+ "Content-Type",
+ "Authorization",
+ "X-Requested-With"
+ ],
+ "maxAgeSeconds": 3600
+ }
+]
\ No newline at end of file
diff --git a/storage.rules b/storage.rules
new file mode 100644
index 0000000..776621d
--- /dev/null
+++ b/storage.rules
@@ -0,0 +1,8 @@
+rules_version = '2';
+service firebase.storage {
+ match /b/{bucket}/o {
+ match /{allPaths=**} {
+ allow read, write: if request.auth != null;
+ }
+ }
+}