fix(core): Overhaul and fix the end-to-end document processing pipeline

This commit is contained in:
Jon
2025-08-01 11:13:03 -04:00
parent 6057d1d7fd
commit 95c92946de
17 changed files with 695 additions and 901 deletions

View File

@@ -2,7 +2,7 @@
"name": "cim-processor-backend",
"version": "1.0.0",
"description": "Backend API for CIM Document Processor",
"main": "index.js",
"main": "dist/index.js",
"scripts": {
"dev": "ts-node-dev --respawn --transpile-only --max-old-space-size=8192 --expose-gc src/index.ts",
"build": "tsc && node src/scripts/prepare-dist.js && cp .puppeteerrc.cjs dist/",

View File

@@ -7,10 +7,11 @@ import { uploadProgressService } from '../services/uploadProgressService';
import { uploadMonitoringService } from '../services/uploadMonitoringService';
export const documentController = {
async uploadDocument(req: Request, res: Response): Promise<void> {
const startTime = Date.now();
const structuredLogger = new StructuredLogger(req.correlationId);
async getUploadUrl(req: Request, res: Response): Promise<void> {
console.log('🎯🎯🎯 GET UPLOAD URL ENDPOINT HIT!');
console.log('🎯 Method:', req.method);
console.log('🎯 URL:', req.url);
console.log('🎯 Headers:', JSON.stringify(req.headers, null, 2));
try {
const userId = req.user?.uid;
if (!userId) {
@@ -21,206 +22,369 @@ export const documentController = {
return;
}
// Check if file was uploaded
if (!req.file) {
res.status(400).json({
error: 'No file uploaded',
correlationId: req.correlationId
const { fileName, fileSize, contentType } = req.body;
if (!fileName || !fileSize || !contentType) {
res.status(400).json({
error: 'Missing required fields: fileName, fileSize, contentType',
correlationId: req.correlationId
});
return;
}
const file = req.file;
// Track upload start
const uploadEventData: any = {
userId,
fileInfo: {
originalName: file.originalname,
size: file.size,
mimetype: file.mimetype,
},
status: 'started',
stage: 'upload_initiated',
};
if (req.correlationId) {
uploadEventData.correlationId = req.correlationId;
}
uploadMonitoringService.trackUploadEvent(uploadEventData);
structuredLogger.uploadStart({
originalName: file.originalname,
size: file.size,
mimetype: file.mimetype,
}, userId);
// Always use optimized agentic RAG processing - no strategy selection needed
const processingStrategy = 'optimized_agentic_rag';
// Store file and get file path
const storageResult = await fileStorageService.storeFile(file, userId);
if (!storageResult.success || !storageResult.fileInfo) {
const processingTime = Date.now() - startTime;
// Track upload failure
const failureEventData: any = {
userId,
fileInfo: {
originalName: file.originalname,
size: file.size,
mimetype: file.mimetype,
},
status: 'failed',
stage: 'file_storage',
error: {
message: storageResult.error || 'Failed to store file',
type: 'storage_error',
code: 'STORAGE_ERROR',
},
processingTime,
};
if (req.correlationId) {
failureEventData.correlationId = req.correlationId;
}
uploadMonitoringService.trackUploadEvent(failureEventData);
structuredLogger.uploadError(
new Error(storageResult.error || 'Failed to store file'),
{
originalName: file.originalname,
size: file.size,
mimetype: file.mimetype,
},
userId,
'file_storage'
);
res.status(500).json({
error: 'Failed to store file',
correlationId: req.correlationId
// Validate file type
if (contentType !== 'application/pdf') {
res.status(400).json({
error: 'Only PDF files are supported',
correlationId: req.correlationId
});
return;
}
// Create document record
// Validate file size (max 50MB)
if (fileSize > 50 * 1024 * 1024) {
res.status(400).json({
error: 'File size exceeds 50MB limit',
correlationId: req.correlationId
});
return;
}
// Generate unique file path
const timestamp = Date.now();
const sanitizedFileName = fileName.replace(/[^a-zA-Z0-9.-]/g, '_');
const filePath = `uploads/${userId}/${timestamp}_${sanitizedFileName}`;
// Create document record first
const document = await DocumentModel.create({
user_id: userId,
original_file_name: file.originalname,
file_path: storageResult.fileInfo.path,
file_size: file.size,
status: 'uploaded'
original_file_name: fileName,
file_path: filePath,
file_size: fileSize,
status: 'uploading'
});
// Always auto-process with optimized agentic RAG
try {
const jobId = await jobQueueService.addJob(
'document_processing',
{
documentId: document.id,
userId: userId,
options: { strategy: processingStrategy }
},
0 // Normal priority
);
logger.info('Document processing job queued with optimized agentic RAG', {
documentId: document.id,
jobId,
strategy: processingStrategy
});
// Update status to indicate it's queued for processing
await DocumentModel.updateById(document.id, { status: 'extracting_text' });
} catch (error) {
logger.error('Failed to queue document processing job', { error, documentId: document.id });
}
// Generate signed upload URL
const { fileStorageService } = await import('../services/fileStorageService');
const uploadUrl = await fileStorageService.generateSignedUploadUrl(filePath, contentType);
// Track upload success
const processingTime = Date.now() - startTime;
const successEventData: any = {
userId,
fileInfo: {
originalName: file.originalname,
size: file.size,
mimetype: file.mimetype,
},
status: 'success',
stage: 'upload_completed',
processingTime,
};
console.log('✅ Generated upload URL for document:', document.id);
if (req.correlationId) {
successEventData.correlationId = req.correlationId;
}
uploadMonitoringService.trackUploadEvent(successEventData);
structuredLogger.uploadSuccess({
originalName: file.originalname,
size: file.size,
mimetype: file.mimetype,
}, userId, processingTime);
// Return document info
res.status(201).json({
id: document.id,
name: document.original_file_name,
originalName: document.original_file_name,
status: 'extracting_text',
uploadedAt: document.created_at,
uploadedBy: userId,
fileSize: document.file_size,
processingStrategy: processingStrategy,
res.status(200).json({
documentId: document.id,
uploadUrl: uploadUrl,
filePath: filePath,
correlationId: req.correlationId || undefined
});
} catch (error) {
const processingTime = Date.now() - startTime;
console.log('❌ Get upload URL error:', error);
logger.error('Get upload URL failed', {
error,
correlationId: req.correlationId
});
// Track upload failure
const errorEventData: any = {
userId: req.user?.uid || 'unknown',
fileInfo: {
originalName: req.file?.originalname || 'unknown',
size: req.file?.size || 0,
mimetype: req.file?.mimetype || 'unknown',
},
status: 'failed',
stage: 'upload_error',
error: {
message: error instanceof Error ? error.message : 'Unknown error',
type: 'upload_error',
},
processingTime,
};
res.status(500).json({
error: 'Failed to generate upload URL',
message: error instanceof Error ? error.message : 'Unknown error',
correlationId: req.correlationId || undefined
});
}
},
if (req.correlationId) {
errorEventData.correlationId = req.correlationId;
/**
 * Confirm that a direct-to-storage upload finished and kick off processing.
 *
 * Verifies the caller owns the document, flips its status to
 * 'processing_llm', responds 202 immediately, and then processes the file
 * in a detached background task: download from storage (with retries),
 * run the unified document processor, persist the result, and delete the
 * PDF from storage in both the success and failure paths.
 */
async confirmUpload(req: Request, res: Response): Promise<void> {
  try {
    const userId = req.user?.uid;
    if (!userId) {
      res.status(401).json({
        error: 'User not authenticated',
        correlationId: req.correlationId
      });
      return;
    }
    const { id: documentId } = req.params;
    if (!documentId) {
      res.status(400).json({
        error: 'Document ID is required',
        correlationId: req.correlationId
      });
      return;
    }
    // Get document record
    const document = await DocumentModel.findById(documentId);
    if (!document) {
      res.status(404).json({
        error: 'Document not found',
        correlationId: req.correlationId
      });
      return;
    }
    // Verify user owns document
    if (document.user_id !== userId) {
      res.status(403).json({
        error: 'Access denied',
        correlationId: req.correlationId
      });
      return;
    }
    console.log('🔄 Starting Document AI processing for:', documentId);
    // Update status to processing
    await DocumentModel.updateById(documentId, {
      status: 'processing_llm'
    });
    // Acknowledge the request immediately; processing continues below.
    res.status(202).json({
      message: 'Upload confirmed, processing has started.',
      documentId: documentId,
      status: 'processing'
    });
    // Process in the background (fire-and-forget; the response is already sent,
    // so all failures from here on are recorded on the document row instead).
    (async () => {
      try {
        // Download file from Firebase Storage for Document AI processing.
        const { fileStorageService } = await import('../services/fileStorageService');
        let fileBuffer: Buffer | null = null;
        // Retry up to 3 times with a 2s delay — the client's direct upload
        // may still be propagating in storage when confirmation arrives.
        for (let i = 0; i < 3; i++) {
          await new Promise(resolve => setTimeout(resolve, 2000)); // 2 second delay
          fileBuffer = await fileStorageService.getFile(document.file_path);
          if (fileBuffer) {
            break;
          }
        }
        if (!fileBuffer) {
          await DocumentModel.updateById(documentId, {
            status: 'failed',
            error_message: 'Failed to download uploaded file'
          });
          return;
        }
        // Process with Unified Document Processor
        const { unifiedDocumentProcessor } = await import('../services/unifiedDocumentProcessor');
        const result = await unifiedDocumentProcessor.processDocument(
          documentId,
          userId,
          '', // Text is not needed for this strategy
          { strategy: 'optimized_agentic_rag' }
        );
        if (result.success) {
          // Update document with results
          await DocumentModel.updateById(documentId, {
            status: 'completed',
            generated_summary: result.summary,
            processing_completed_at: new Date()
          });
          // 🗑️ DELETE PDF after successful processing
          try {
            await fileStorageService.deleteFile(document.file_path);
            console.log('✅ PDF deleted after successful processing:', document.file_path);
          } catch (deleteError) {
            console.log('⚠️ Failed to delete PDF file:', deleteError);
            logger.warn('Failed to delete PDF after processing', {
              filePath: document.file_path,
              documentId,
              error: deleteError
            });
          }
          console.log('✅ Document AI processing completed successfully');
        } else {
          await DocumentModel.updateById(documentId, {
            status: 'failed',
            error_message: result.error
          });
          // Also delete PDF on processing failure to avoid storage costs
          try {
            await fileStorageService.deleteFile(document.file_path);
            console.log('🗑️ PDF deleted after processing failure');
          } catch (deleteError) {
            console.log('⚠️ Failed to delete PDF file after error:', deleteError);
          }
        }
      } catch (error) {
        console.log('❌ Background processing error:', error);
        logger.error('Background processing failed', {
          error,
          documentId
        });
        await DocumentModel.updateById(documentId, {
          status: 'failed',
          error_message: 'Background processing failed'
        });
      }
    })();
  } catch (error) {
    console.log('❌ Confirm upload error:', error);
    logger.error('Confirm upload failed', {
      error,
      correlationId: req.correlationId
    });
    res.status(500).json({
      error: 'Upload confirmation failed',
      message: error instanceof Error ? error.message : 'Unknown error',
      correlationId: req.correlationId || undefined
    });
  }
},
/**
 * Synchronous raw-body upload path: accepts the PDF bytes directly in
 * req.body, creates a document record, runs Document AI processing inline,
 * and responds 201 with the generated summary (or 500 on failure).
 *
 * NOTE(review): assumes an upstream raw-body parser produced a Buffer in
 * req.body for this route — confirm the route's middleware. The file name
 * is hard-coded to 'uploaded-document.pdf' and file_path is left empty:
 * the raw bytes are never persisted to storage by this handler.
 */
async uploadDocument(req: Request, res: Response): Promise<void> {
const startTime = Date.now();
// 🔍 COMPREHENSIVE DEBUG: Log everything about the request
console.log('🚀 =========================');
console.log('🚀 DOCUMENT AI UPLOAD STARTED');
console.log('🚀 Method:', req.method);
console.log('🚀 URL:', req.url);
console.log('🚀 Content-Type:', req.get('Content-Type'));
console.log('🚀 Content-Length:', req.get('Content-Length'));
console.log('🚀 Authorization header present:', !!req.get('Authorization'));
console.log('🚀 User from token:', req.user?.uid || 'NOT_FOUND');
// Debug body in detail
console.log('🚀 Has body:', !!req.body);
console.log('🚀 Body type:', typeof req.body);
console.log('🚀 Body constructor:', req.body?.constructor?.name);
console.log('🚀 Body length:', req.body?.length || 0);
console.log('🚀 Is Buffer?:', Buffer.isBuffer(req.body));
// Debug all headers
console.log('🚀 All headers:', JSON.stringify(req.headers, null, 2));
// Debug request properties
console.log('🚀 Request readable:', req.readable);
console.log('🚀 Request complete:', req.complete);
// If body exists, show first few bytes
if (req.body && req.body.length > 0) {
const preview = req.body.slice(0, 100).toString('hex');
console.log('🚀 Body preview (hex):', preview);
// Try to see if it contains multipart boundary
const bodyStr = req.body.toString('utf8', 0, Math.min(500, req.body.length));
console.log('🚀 Body preview (string):', bodyStr.substring(0, 200));
}
console.log('🚀 =========================');
try {
const userId = req.user?.uid;
if (!userId) {
console.log('❌ Authentication failed - no userId');
res.status(401).json({
error: 'User not authenticated',
correlationId: req.correlationId
});
return;
}
console.log('✅ Authentication successful for user:', userId);
// Get raw body buffer for Document AI processing
const rawBody = req.body;
if (!rawBody || rawBody.length === 0) {
// 400 includes a debug payload so clients can report parser/middleware issues.
res.status(400).json({
error: 'No file data received',
correlationId: req.correlationId,
debug: {
method: req.method,
contentType: req.get('Content-Type'),
contentLength: req.get('Content-Length'),
hasRawBody: !!rawBody,
rawBodySize: rawBody?.length || 0,
bodyType: typeof rawBody
}
});
return;
}
console.log('✅ Found raw body buffer:', rawBody.length, 'bytes');
// Create document record first
const document = await DocumentModel.create({
user_id: userId,
original_file_name: 'uploaded-document.pdf',
file_path: '',
file_size: rawBody.length,
status: 'processing_llm'
});
console.log('✅ Document record created:', document.id);
// Process with Document AI directly (lazy import of the processor module).
const { DocumentAiGenkitProcessor } = await import('../services/documentAiGenkitProcessor');
const processor = new DocumentAiGenkitProcessor();
console.log('✅ Starting Document AI processing...');
// Inline (blocking) processing — the HTTP response waits for this to finish.
const result = await processor.processDocument(
document.id,
userId,
rawBody,
'uploaded-document.pdf',
'application/pdf'
);
if (result.success) {
await DocumentModel.updateById(document.id, {
status: 'completed',
generated_summary: result.content,
processing_completed_at: new Date()
});
console.log('✅ Document AI processing completed successfully');
res.status(201).json({
id: document.id,
name: 'uploaded-document.pdf',
originalName: 'uploaded-document.pdf',
status: 'completed',
uploadedAt: document.created_at,
uploadedBy: userId,
fileSize: rawBody.length,
summary: result.content,
correlationId: req.correlationId || undefined
});
return;
} else {
console.log('❌ Document AI processing failed:', result.error);
// Record the failure on the document row before reporting it to the client.
await DocumentModel.updateById(document.id, {
status: 'failed',
error_message: result.error
});
res.status(500).json({
error: 'Document processing failed',
message: result.error,
correlationId: req.correlationId || undefined
});
return;
}
} catch (error) {
console.log('❌ Upload error:', error);
logger.error('Upload document failed', {
error,
correlationId: req.correlationId
});
res.status(500).json({
error: 'Upload failed',
message: error instanceof Error ? error.message : 'Unknown error',
correlationId: req.correlationId || undefined
});
}
@@ -552,4 +716,4 @@ export const documentController = {
throw new Error('Failed to get document text');
}
}
};
};

View File

@@ -18,20 +18,17 @@ import { notFoundHandler } from './middleware/notFoundHandler';
const app = express();
// Enable trust proxy to ensure Express works correctly behind the proxy
// Add this middleware to log all incoming requests
app.use((req, res, next) => {
console.log(`Incoming request: ${req.method} ${req.path}`);
next();
});
// Enable trust proxy to ensure Express works correctly behind a proxy
app.set('trust proxy', 1);
// Security middleware
app.use(helmet({
contentSecurityPolicy: {
directives: {
defaultSrc: ["'self'"],
styleSrc: ["'self'", "'unsafe-inline'"],
scriptSrc: ["'self'"],
imgSrc: ["'self'", "data:", "https:"],
},
},
}));
app.use(helmet());
// CORS configuration
const allowedOrigins = [
@@ -43,13 +40,10 @@ const allowedOrigins = [
app.use(cors({
origin: function (origin, callback) {
console.log('🌐 CORS request from origin:', origin);
if (!origin || allowedOrigins.indexOf(origin) !== -1) {
console.log('✅ CORS allowed for origin:', origin);
callback(null, true);
} else {
console.log('❌ CORS blocked origin:', origin);
logger.warn(`CORS blocked for origin: ${origin}`);
callback(new Error('Not allowed by CORS'));
}
},
@@ -62,7 +56,7 @@ app.use(cors({
// Rate limiting
const limiter = rateLimit({
windowMs: 15 * 60 * 1000, // 15 minutes
max: 1000, // limit each IP to 1000 requests per windowMs (increased for testing)
max: 1000,
message: {
error: 'Too many requests from this IP, please try again later.',
},
@@ -72,27 +66,6 @@ const limiter = rateLimit({
app.use(limiter);
// Body parsing middleware - only for non-multipart requests
app.use((req, res, next) => {
if (req.headers['content-type'] && req.headers['content-type'].includes('multipart/form-data')) {
// Skip body parsing for multipart requests - let multer handle it
next();
} else {
// Parse JSON and URL-encoded bodies for other requests
express.json({ limit: '10mb' })(req, res, next);
}
});
app.use((req, res, next) => {
if (req.headers['content-type'] && req.headers['content-type'].includes('multipart/form-data')) {
// Skip body parsing for multipart requests - let multer handle it
next();
} else {
// Parse URL-encoded bodies for other requests
express.urlencoded({ extended: true, limit: '10mb' })(req, res, next);
}
});
// Logging middleware
app.use(morgan('combined', {
stream: {
@@ -100,17 +73,12 @@ app.use(morgan('combined', {
},
}));
// Request debugging middleware
app.use((req, res, next) => {
console.log('📥 Incoming request:', req.method, req.url);
console.log('📥 Request headers:', Object.keys(req.headers));
console.log('📥 Content-Type:', req.get('Content-Type'));
console.log('📥 Authorization:', req.get('Authorization') ? 'Present' : 'Missing');
next();
});
// CRITICAL: Add body parsing BEFORE routes
app.use(express.json({ limit: '10mb' }));
app.use(express.urlencoded({ extended: true, limit: '10mb' }));
// Health check endpoint
app.get('/health', (_req, res) => { // _req to fix TS6133
app.get('/health', (_req, res) => {
res.status(200).json({
status: 'ok',
timestamp: new Date().toISOString(),
@@ -119,53 +87,23 @@ app.get('/health', (_req, res) => { // _req to fix TS6133
});
});
// Agentic RAG health check endpoints
app.get('/health/agentic-rag', async (_req, res) => {
try {
const { agenticRAGDatabaseService } = await import('./services/agenticRAGDatabaseService');
const healthStatus = await agenticRAGDatabaseService.getHealthStatus();
res.json(healthStatus);
} catch (error) {
logger.error('Agentic RAG health check failed', { error });
res.status(500).json({
error: 'Health check failed',
status: 'unhealthy',
timestamp: new Date().toISOString()
});
}
});
app.get('/health/agentic-rag/metrics', async (_req, res) => {
try {
const { agenticRAGDatabaseService } = await import('./services/agenticRAGDatabaseService');
const startDate = new Date(Date.now() - 30 * 24 * 60 * 60 * 1000); // 30 days ago
const metrics = await agenticRAGDatabaseService.generatePerformanceReport(startDate, new Date());
res.json(metrics);
} catch (error) {
logger.error('Agentic RAG metrics retrieval failed', { error });
res.status(500).json({ error: 'Metrics retrieval failed' });
}
});
// API routes - remove the /api prefix as it's handled by Firebase
// API Routes
app.use('/documents', documentRoutes);
app.use('/vector', vectorRoutes);
app.use('/monitoring', monitoringRoutes);
import * as functions from 'firebase-functions';
import { onRequest } from 'firebase-functions/v2/https';
// API root endpoint
app.get('/', (_req, res) => { // _req to fix TS6133
app.get('/', (_req, res) => {
res.json({
message: 'CIM Document Processor API',
version: '1.0.0',
endpoints: {
auth: '/auth',
documents: '/documents',
health: '/health',
agenticRagHealth: '/health/agentic-rag',
agenticRagMetrics: '/health/agentic-rag/metrics',
monitoring: '/monitoring',
},
});
@@ -177,26 +115,11 @@ app.use(notFoundHandler);
// Global error handler (must be last)
app.use(errorHandler);
// Initialize job queue service for document processing
import { jobQueueService } from './services/jobQueueService';
// Start the job queue service asynchronously to avoid blocking function startup
// Use a longer delay to ensure the function is fully initialized
setTimeout(() => {
try {
jobQueueService.start();
logger.info('Job queue service started successfully');
} catch (error) {
logger.error('Failed to start job queue service', { error });
}
}, 5000);
// Listen on a port when not in a Firebase Function environment or when PORT is explicitly set
if (!process.env['FUNCTION_TARGET'] || process.env['PORT']) {
const port = process.env['PORT'] || 5001;
app.listen(port, () => {
logger.info(`API server listening on port ${port}`);
});
}
export const api = functions.https.onRequest(app);
// Configure Firebase Functions v2 for larger uploads
export const api = onRequest({
timeoutSeconds: 540, // 9 minutes
memory: '2GiB',
cpu: 1,
maxInstances: 10,
cors: true
}, app);

View File

@@ -11,6 +11,18 @@ export const errorHandler = (
req: Request,
res: Response
): void => {
console.log('💥💥💥 MAXIMUM DEBUG ERROR HANDLER HIT 💥💥💥');
console.log('💥 Error name:', err.name);
console.log('💥 Error message:', err.message);
console.log('💥 Error code:', (err as any).code);
console.log('💥 Error type:', typeof err);
console.log('💥 Error constructor:', err.constructor.name);
console.log('💥 Error stack:', err.stack);
console.log('💥 Request URL:', req.url);
console.log('💥 Request method:', req.method);
console.log('💥 Full error object:', JSON.stringify(err, Object.getOwnPropertyNames(err), 2));
console.log('💥💥💥 END ERROR DEBUG 💥💥💥');
let error = { ...err };
error.message = err.message;
@@ -53,6 +65,13 @@ export const errorHandler = (
error = { message, statusCode: 401 } as AppError;
}
// Multer errors (check if multer is imported anywhere)
if (err.name === 'MulterError' || (err as any).code === 'UNEXPECTED_END_OF_FORM') {
console.log('🚨 MULTER ERROR CAUGHT:', err.message);
const message = `File upload failed: ${err.message}`;
error = { message, statusCode: 400 } as AppError;
}
// Default error
const statusCode = error.statusCode || 500;
const message = error.message || 'Server Error';

View File

@@ -13,9 +13,15 @@ if (!fs.existsSync(uploadDir)) {
// File filter function
const fileFilter = (req: Request, file: any, cb: multer.FileFilterCallback) => {
console.log('🔍 File filter called for:', file.originalname);
console.log('🔍 ===== FILE FILTER CALLED =====');
console.log('🔍 File originalname:', file.originalname);
console.log('🔍 File mimetype:', file.mimetype);
console.log('🔍 File size:', file.size);
console.log('🔍 File encoding:', file.encoding);
console.log('🔍 File fieldname:', file.fieldname);
console.log('🔍 Request Content-Type:', req.get('Content-Type'));
console.log('🔍 Request Content-Length:', req.get('Content-Length'));
console.log('🔍 ===========================');
// Check file type - allow PDF and text files for testing
const allowedTypes = ['application/pdf', 'text/plain', 'text/html'];
@@ -68,6 +74,14 @@ const upload = multer({
// Error handling middleware for multer
export const handleUploadError = (error: any, req: Request, res: Response, next: NextFunction): void => {
console.log('🚨 =============================');
console.log('🚨 UPLOAD ERROR HANDLER CALLED');
console.log('🚨 Error type:', error?.constructor?.name);
console.log('🚨 Error message:', error?.message);
console.log('🚨 Error code:', error?.code);
console.log('🚨 Is MulterError:', error instanceof multer.MulterError);
console.log('🚨 =============================');
if (error instanceof multer.MulterError) {
logger.error('Multer error during file upload:', {
error: error.message,
@@ -129,12 +143,14 @@ export const handleUploadError = (error: any, req: Request, res: Response, next:
// Main upload middleware with timeout handling
export const uploadMiddleware = (req: Request, res: Response, next: NextFunction) => {
console.log('📤 Upload middleware called');
console.log('📤 =============================');
console.log('📤 UPLOAD MIDDLEWARE CALLED');
console.log('📤 Request method:', req.method);
console.log('📤 Request URL:', req.url);
console.log('📤 Content-Type:', req.get('Content-Type'));
console.log('📤 Content-Length:', req.get('Content-Length'));
console.log('📤 User-Agent:', req.get('User-Agent'));
console.log('📤 =============================');
// Set a timeout for the upload
const uploadTimeout = setTimeout(() => {
@@ -155,12 +171,25 @@ export const uploadMiddleware = (req: Request, res: Response, next: NextFunction
clearTimeout(uploadTimeout);
if (err) {
console.log('❌ Upload middleware error:', err);
console.log('❌ Error details:', {
name: err.name,
message: err.message,
code: err.code,
stack: err.stack?.split('\n')[0]
});
} else {
console.log('✅ Upload middleware completed successfully');
console.log('✅ File after multer processing:', {
hasFile: !!req.file,
filename: req.file?.originalname,
size: req.file?.size,
mimetype: req.file?.mimetype
});
}
originalNext(err);
};
console.log('🔄 Calling multer.single("document")...');
upload.single('document')(req, res, next);
};

View File

@@ -1,6 +1,6 @@
import { v4 as uuidv4 } from 'uuid';
import { logger } from '../utils/logger';
import pool from '../config/database';
import { getSupabaseServiceClient } from '../config/supabase';
export interface DocumentChunk {
id: string;
@@ -15,577 +15,21 @@ export interface DocumentChunk {
updatedAt: Date;
}
// A single chunk-level hit returned by a vector-similarity search.
export interface VectorSearchResult {
// Document the matching chunk belongs to.
documentId: string;
// Similarity score (computed as 1 - vector distance in search queries);
// higher means more similar.
similarityScore: number;
// Raw text content of the matching chunk.
chunkContent: string;
// Chunk metadata as stored alongside the embedding.
metadata: Record<string, any>;
}
// A precomputed similarity relationship between two documents
// (a row of the document_similarities table).
export interface DocumentSimilarity {
id: string;
// Document the similarity was computed from.
sourceDocumentId: string;
// Document it was compared against.
targetDocumentId: string;
similarityScore: number;
// Kind of similarity — presumably a discriminator set by the writer of the
// row; semantics not visible here, confirm against the producer.
similarityType: string;
metadata: Record<string, any>;
createdAt: Date;
}
// Aggregate embedding for an industry category
// (a row of the industry_embeddings table, upserted by storeIndustryEmbedding).
export interface IndustryEmbedding {
id: string;
// Unique industry name (upsert conflict key).
industryName: string;
industryDescription?: string;
// Expected to be 1536-dimensional (the model pads/truncates otherwise).
embedding: number[];
// Presumably the number of documents aggregated into this embedding —
// confirm against the caller that maintains it.
documentCount: number;
averageSimilarity?: number;
createdAt: Date;
updatedAt: Date;
}
export class VectorDatabaseModel {
/**
 * Store document chunks with embeddings.
 *
 * Batch-inserts all chunks into the `document_chunks` table via the
 * Supabase service client. Embeddings are serialized to the `[x,y,...]`
 * string form pgvector expects for vector columns.
 *
 * @param chunks chunk records without id/createdAt/updatedAt (assigned by the DB).
 * @throws the Supabase error object when the insert fails.
 */
static async storeDocumentChunks(chunks: Omit<DocumentChunk, 'id' | 'createdAt' | 'updatedAt'>[]): Promise<void> {
  const supabase = getSupabaseServiceClient();
  const { error } = await supabase
    .from('document_chunks')
    .insert(chunks.map(chunk => ({
      ...chunk,
      embedding: `[${chunk.embedding.join(',')}]` // Format for pgvector
    })));
  if (error) {
    logger.error('Failed to store document chunks', error);
    throw error;
  }
  logger.info(`Stored ${chunks.length} document chunks in vector database`);
}
/**
 * Search for similar content using vector similarity.
 *
 * Computes similarity as 1 - (embedding <=> query) in pgvector and returns
 * the closest chunks, optionally restricted to a single document and/or
 * exact-match metadata filters.
 *
 * @param queryEmbedding expected 1536-dim embedding; other lengths are
 *        zero-padded or truncated to 1536 (with a warning).
 * @param options documentId to restrict to, limit (default 10),
 *        similarityThreshold (default 0.7), filters (metadata key -> value).
 * @returns matches ordered most-similar first.
 * @throws on query failure.
 */
static async searchSimilarContent(
  queryEmbedding: number[],
  options: {
    documentId?: string;
    limit?: number;
    similarityThreshold?: number;
    filters?: Record<string, any>;
  } = {}
): Promise<VectorSearchResult[]> {
  const {
    documentId,
    limit = 10,
    similarityThreshold = 0.7,
    filters = {}
  } = options;
  // Normalize the embedding to exactly 1536 dimensions (zero-pad or
  // truncate). Fix: the padded copy was previously computed but never used —
  // the original-length array was sent to the database regardless.
  let embeddingArray = Array.isArray(queryEmbedding) ? queryEmbedding : [];
  if (embeddingArray.length !== 1536) {
    logger.warn(`Query embedding dimension mismatch: expected 1536, got ${embeddingArray.length}`);
    const padded = new Array(1536).fill(0);
    embeddingArray.forEach((val, index) => {
      if (index < 1536) padded[index] = val;
    });
    embeddingArray = padded;
  }
  let query = `
    SELECT
      dc.document_id,
      1 - (dc.embedding <=> $1::vector) as similarity_score,
      dc.content as chunk_content,
      dc.metadata
    FROM document_chunks dc
    WHERE dc.embedding IS NOT NULL
  `;
  // Fix: serialize as '[x,y,...]' — pgvector cannot parse node-pg's default
  // '{x,y,...}' array serialization (the sibling store method already did this).
  const params: any[] = [JSON.stringify(embeddingArray)];
  let paramIndex = 2;
  if (documentId) {
    query += ` AND dc.document_id = $${paramIndex}`;
    params.push(documentId);
    paramIndex++;
  }
  // Metadata filters. Fix: the key is now bound as a parameter instead of
  // being interpolated into the SQL string, so caller-supplied keys cannot
  // inject SQL.
  Object.entries(filters).forEach(([key, value]) => {
    query += ` AND dc.metadata->>$${paramIndex} = $${paramIndex + 1}`;
    params.push(key, value);
    paramIndex += 2;
  });
  query += `
    AND 1 - (dc.embedding <=> $1::vector) >= $${paramIndex}
    ORDER BY dc.embedding <=> $1::vector
    LIMIT $${paramIndex + 1}
  `;
  params.push(similarityThreshold, limit);
  try {
    const result = await pool.query(query, params);
    return result.rows.map((row: any) => ({
      documentId: row.document_id,
      similarityScore: parseFloat(row.similarity_score),
      chunkContent: row.chunk_content,
      metadata: row.metadata
    }));
  } catch (error) {
    logger.error('Vector search failed', error);
    throw error;
  }
}
/**
* Get document chunks by document ID.
*
* Fetches every chunk row for the document ordered by chunk_index and maps
* the snake_case columns to the camelCase DocumentChunk shape. Missing
* metadata defaults to {} and a missing embedding to [].
*
* @throws on query failure (after logging).
*/
static async getDocumentChunks(documentId: string): Promise<DocumentChunk[]> {
try {
const result = await pool.query(`
SELECT
id,
document_id,
content,
metadata,
embedding,
chunk_index,
section,
page_number,
created_at,
updated_at
FROM document_chunks
WHERE document_id = $1
ORDER BY chunk_index
`, [documentId]);
return result.rows.map((row: any) => ({
id: row.id,
documentId: row.document_id,
content: row.content,
metadata: row.metadata || {},
embedding: row.embedding || [],
chunkIndex: row.chunk_index,
section: row.section,
pageNumber: row.page_number,
createdAt: row.created_at,
updatedAt: row.updated_at
}));
} catch (error) {
logger.error('Failed to get document chunks', error);
throw error;
}
}
/**
* Find similar documents.
*
* Returns precomputed rows from document_similarities for the given source
* document, filtered by a minimum score and ordered most-similar first.
* Returns [] early when the document has no chunks or its first chunk has
* no embedding.
*
* NOTE(review): the reference chunk is fetched but never used by the query
* below — it only gates the early returns. Looks like leftover from an
* on-the-fly similarity computation; confirm before removing.
*
* @throws on query failure (after logging).
*/
static async findSimilarDocuments(
documentId: string,
limit: number = 10,
similarityThreshold: number = 0.6
): Promise<DocumentSimilarity[]> {
try {
// Get document chunks
const documentChunks = await this.getDocumentChunks(documentId);
if (documentChunks.length === 0) return [];
// Use the first chunk as reference
const referenceChunk = documentChunks[0];
if (!referenceChunk || !referenceChunk.embedding) return [];
const result = await pool.query(`
SELECT
id,
source_document_id,
target_document_id,
similarity_score,
similarity_type,
metadata,
created_at
FROM document_similarities
WHERE source_document_id = $1
AND similarity_score >= $2
ORDER BY similarity_score DESC
LIMIT $3
`, [documentId, similarityThreshold, limit]);
return result.rows.map((row: any) => ({
id: row.id,
sourceDocumentId: row.source_document_id,
targetDocumentId: row.target_document_id,
similarityScore: parseFloat(row.similarity_score),
similarityType: row.similarity_type,
metadata: row.metadata || {},
createdAt: row.created_at
}));
} catch (error) {
logger.error('Failed to find similar documents', error);
throw error;
}
}
/**
* Update document similarities.
*
* Thin wrapper that delegates to the database-side
* update_document_similarities() SQL function — the recompute logic lives
* in the database, not in application code.
*
* @throws on query failure (after logging).
*/
static async updateDocumentSimilarities(): Promise<void> {
try {
await pool.query(`
SELECT update_document_similarities();
`);
logger.info('Document similarities updated');
} catch (error) {
logger.error('Failed to update document similarities', error);
throw error;
}
}
/**
 * Insert or update the embedding row for an industry (upsert keyed on
 * industry_name).
 *
 * The embedding is normalized to 1536 dimensions and serialized to the
 * `[x,y,...]` string form pgvector expects.
 *
 * @throws on query failure (after logging).
 */
static async storeIndustryEmbedding(industry: Omit<IndustryEmbedding, 'id' | 'createdAt' | 'updatedAt'>): Promise<void> {
  try {
    // Normalize to exactly 1536 dimensions (zero-pad or truncate).
    // Fix: the padded copy was previously computed but never used — the
    // original-length array was passed to the query regardless.
    let embeddingArray = Array.isArray(industry.embedding) ? industry.embedding : [];
    if (embeddingArray.length !== 1536) {
      logger.warn(`Industry embedding dimension mismatch: expected 1536, got ${embeddingArray.length}`);
      const padded = new Array(1536).fill(0);
      embeddingArray.forEach((val, index) => {
        if (index < 1536) padded[index] = val;
      });
      embeddingArray = padded;
    }
    await pool.query(`
      INSERT INTO industry_embeddings (
        id, industry_name, industry_description, embedding,
        document_count, average_similarity
      ) VALUES ($1, $2, $3, $4::vector, $5, $6)
      ON CONFLICT (industry_name) DO UPDATE SET
        industry_description = EXCLUDED.industry_description,
        embedding = EXCLUDED.embedding,
        document_count = EXCLUDED.document_count,
        average_similarity = EXCLUDED.average_similarity,
        updated_at = CURRENT_TIMESTAMP
    `, [
      uuidv4(),
      industry.industryName,
      industry.industryDescription,
      // Fix: serialize as '[x,y,...]' — pgvector cannot parse node-pg's
      // default '{x,y,...}' array serialization, and the chunk-store path
      // already stringifies for exactly this reason.
      JSON.stringify(embeddingArray),
      industry.documentCount,
      industry.averageSimilarity
    ]);
    logger.info(`Stored industry embedding for: ${industry.industryName}`);
  } catch (error) {
    logger.error('Failed to store industry embedding', error);
    throw error;
  }
}
/**
 * Search by industry
 *
 * Cosine-distance search (`<=>`) over document chunks restricted to a
 * given industry tag (metadata->>'industry', matched lowercase). The query
 * embedding is normalized to exactly 1536 dimensions (zero-padded or
 * truncated) before being sent to pgvector.
 *
 * @param industryName - industry tag to filter on (lowercased for matching)
 * @param queryEmbedding - query vector, ideally 1536 dims
 * @param limit - maximum number of results (default 20)
 * @throws rethrows any database error after logging
 */
static async searchByIndustry(
  industryName: string,
  queryEmbedding: number[],
  limit: number = 20
): Promise<VectorSearchResult[]> {
  try {
    // Ensure embedding is properly formatted
    const embeddingArray = Array.isArray(queryEmbedding) ? queryEmbedding : [];
    // BUG FIX: the normalized vector was previously computed but discarded;
    // the raw array was passed to the query and fails pgvector's
    // vector(1536) dimension check when sizes differ. Use the normalized one.
    let vector = embeddingArray;
    if (embeddingArray.length !== 1536) {
      logger.warn(`Industry search embedding dimension mismatch: expected 1536, got ${embeddingArray.length}`);
      vector = new Array(1536).fill(0);
      embeddingArray.forEach((val, index) => {
        if (index < 1536) vector[index] = val;
      });
    }
    const result = await pool.query(`
      SELECT
        dc.document_id,
        1 - (dc.embedding <=> $1::vector) as similarity_score,
        dc.content as chunk_content,
        dc.metadata
      FROM document_chunks dc
      WHERE dc.embedding IS NOT NULL
        AND dc.metadata->>'industry' = $2
      ORDER BY dc.embedding <=> $1::vector
      LIMIT $3
    `, [vector, industryName.toLowerCase(), limit]);
    return result.rows.map((row: any) => ({
      documentId: row.document_id,
      similarityScore: parseFloat(row.similarity_score),
      chunkContent: row.chunk_content,
      metadata: row.metadata || {}
    }));
  } catch (error) {
    logger.error('Failed to search by industry', error);
    throw error;
  }
}
/**
 * Track search query for analytics
 *
 * Best-effort insert into vector_similarity_searches. The query embedding
 * is normalized to exactly 1536 dimensions (zero-padded or truncated)
 * before the `$4::vector` cast. Analytics failures are logged and
 * swallowed deliberately so search itself never breaks.
 *
 * @param userId - user who ran the search
 * @param queryText - raw query text
 * @param queryEmbedding - query vector, ideally 1536 dims
 * @param searchResults - results returned to the user (stored as JSON)
 * @param options - optional filters/limit/threshold/timing metadata
 */
static async trackSearchQuery(
  userId: string,
  queryText: string,
  queryEmbedding: number[],
  searchResults: VectorSearchResult[],
  options: {
    filters?: Record<string, any>;
    limitCount?: number;
    similarityThreshold?: number;
    processingTimeMs?: number;
  } = {}
): Promise<void> {
  try {
    // Ensure embedding is properly formatted
    const embeddingArray = Array.isArray(queryEmbedding) ? queryEmbedding : [];
    // BUG FIX: the padded vector was previously computed but never used —
    // the raw, wrong-sized array was sent to Postgres, which fails the
    // vector(1536) dimension check. Use the normalized vector instead.
    let vector = embeddingArray;
    if (embeddingArray.length !== 1536) {
      logger.warn(`Search tracking embedding dimension mismatch: expected 1536, got ${embeddingArray.length}`);
      vector = new Array(1536).fill(0);
      embeddingArray.forEach((val, index) => {
        if (index < 1536) vector[index] = val;
      });
    }
    await pool.query(`
      INSERT INTO vector_similarity_searches (
        id, user_id, query_text, query_embedding, search_results,
        filters, limit_count, similarity_threshold, processing_time_ms
      ) VALUES ($1, $2, $3, $4::vector, $5, $6, $7, $8, $9)
    `, [
      uuidv4(),
      userId,
      queryText,
      vector,
      JSON.stringify(searchResults),
      JSON.stringify(options.filters || {}),
      options.limitCount || 10,
      options.similarityThreshold || 0.7,
      options.processingTimeMs || 0
    ]);
    logger.debug('Search query tracked for analytics');
  } catch (error) {
    logger.error('Failed to track search query', error);
    // Don't throw - analytics failure shouldn't break search
  }
}
/**
 * Get search analytics
 *
 * Aggregates a user's recent searches (count, mean processing time, mean
 * threshold, last-run timestamp) grouped by query text, top 20 by volume.
 *
 * @param userId - user whose searches are aggregated
 * @param days - lookback window in days (default 30)
 * @throws rethrows any database error after logging
 */
static async getSearchAnalytics(userId: string, days: number = 30): Promise<any[]> {
  try {
    // SECURITY FIX: `days` was interpolated directly into the SQL string
    // (INTERVAL '${days} days'). Parameterize it via make_interval and
    // coerce to a non-negative integer so no caller input reaches the SQL.
    const windowDays = Math.max(0, Math.floor(days));
    const result = await pool.query(`
      SELECT
        query_text,
        COUNT(*) as search_count,
        AVG(processing_time_ms) as avg_processing_time,
        AVG(similarity_threshold) as avg_similarity_threshold,
        MAX(created_at) as last_search
      FROM vector_similarity_searches
      WHERE user_id = $1
        AND created_at >= NOW() - make_interval(days => $2)
      GROUP BY query_text
      ORDER BY search_count DESC
      LIMIT 20
    `, [userId, windowDays]);
    return result.rows;
  } catch (error) {
    logger.error('Failed to get search analytics', error);
    throw error;
  }
}
/**
 * Delete document chunks
 *
 * Removes every chunk row belonging to the given document.
 *
 * @param documentId - document whose chunks are deleted
 * @throws rethrows any database error after logging
 */
static async deleteDocumentChunks(documentId: string): Promise<void> {
  try {
    const sql = 'DELETE FROM document_chunks WHERE document_id = $1';
    await pool.query(sql, [documentId]);
    logger.info(`Deleted chunks for document: ${documentId}`);
  } catch (error) {
    logger.error('Failed to delete document chunks', error);
    throw error;
  }
}
/**
 * Get vector database statistics
 *
 * Runs four independent aggregate queries in parallel and folds them into
 * a single stats object. averageSimilarity only considers positive scores.
 *
 * @throws rethrows any database error after logging
 */
static async getVectorDatabaseStats(): Promise<{
  totalChunks: number;
  totalDocuments: number;
  totalSearches: number;
  averageSimilarity: number;
}> {
  try {
    const statQueries = [
      'SELECT COUNT(*) as count FROM document_chunks',
      'SELECT COUNT(DISTINCT document_id) as count FROM document_chunks',
      'SELECT COUNT(*) as count FROM vector_similarity_searches',
      `
        SELECT AVG(similarity_score) as avg_similarity
        FROM document_similarities
        WHERE similarity_score > 0
      `
    ];
    // Queries are independent, so issue them concurrently.
    const [chunks, documents, searches, similarity] =
      await Promise.all(statQueries.map((sql) => pool.query(sql)));
    return {
      totalChunks: parseInt(chunks.rows[0]?.count || '0'),
      totalDocuments: parseInt(documents.rows[0]?.count || '0'),
      totalSearches: parseInt(searches.rows[0]?.count || '0'),
      averageSimilarity: parseFloat(similarity.rows[0]?.avg_similarity || '0')
    };
  } catch (error) {
    logger.error('Failed to get vector database stats', error);
    throw error;
  }
}
/**
 * Get all chunks (for testing/debugging)
 *
 * Returns up to 1000 chunk rows across all documents, ordered by document
 * then chunk position, mapped to the DocumentChunk shape.
 *
 * @throws rethrows any database error after logging
 */
static async getAllChunks(): Promise<DocumentChunk[]> {
  try {
    const { rows } = await pool.query(`
      SELECT
        id,
        document_id,
        content,
        metadata,
        embedding,
        chunk_index,
        section,
        page_number,
        created_at,
        updated_at
      FROM document_chunks
      ORDER BY document_id, chunk_index
      LIMIT 1000
    `);
    const toChunk = (row: any): DocumentChunk => ({
      id: row.id,
      documentId: row.document_id,
      content: row.content,
      metadata: row.metadata || {},
      embedding: row.embedding || [],
      chunkIndex: row.chunk_index,
      section: row.section,
      pageNumber: row.page_number,
      createdAt: row.created_at,
      updatedAt: row.updated_at
    });
    return rows.map(toChunk);
  } catch (error) {
    logger.error('Failed to get all chunks', error);
    throw error;
  }
}
/**
 * Get total chunk count
 *
 * Counts every row in document_chunks.
 *
 * @throws rethrows any database error after logging
 */
static async getTotalChunkCount(): Promise<number> {
  try {
    const { rows } = await pool.query('SELECT COUNT(*) as count FROM document_chunks');
    return parseInt(rows[0]?.count ?? '0');
  } catch (error) {
    logger.error('Failed to get total chunk count', error);
    throw error;
  }
}
/**
 * Get total document count
 *
 * Counts distinct document_id values present in document_chunks — i.e.
 * documents that have at least one stored chunk.
 *
 * @throws rethrows any database error after logging
 */
static async getTotalDocumentCount(): Promise<number> {
  try {
    const { rows } = await pool.query('SELECT COUNT(DISTINCT document_id) as count FROM document_chunks');
    return parseInt(rows[0]?.count ?? '0');
  } catch (error) {
    logger.error('Failed to get total document count', error);
    throw error;
  }
}
/**
 * Get average chunk size
 *
 * Mean content length (in characters) across all chunks, rounded to the
 * nearest integer; 0 when the table is empty (AVG returns NULL).
 *
 * @throws rethrows any database error after logging
 */
static async getAverageChunkSize(): Promise<number> {
  try {
    const { rows } = await pool.query('SELECT AVG(LENGTH(content)) as avg_size FROM document_chunks');
    const average = parseFloat(rows[0]?.avg_size ?? '0');
    return Math.round(average);
  } catch (error) {
    logger.error('Failed to get average chunk size', error);
    throw error;
  }
}
}
}

View File

@@ -63,6 +63,7 @@ export interface ProcessingJob {
}
export type ProcessingStatus =
| 'uploading'
| 'uploaded'
| 'extracting_text'
| 'processing_llm'

View File

@@ -23,9 +23,13 @@ const router = express.Router();
router.use(verifyFirebaseToken);
router.use(addCorrelationId);
// Essential document management routes (keeping these)
// NEW Firebase Storage direct upload routes
router.post('/upload-url', documentController.getUploadUrl);
router.post('/:id/confirm-upload', validateUUID('id'), documentController.confirmUpload);
// LEGACY multipart upload routes (keeping for backward compatibility)
router.post('/upload', handleFileUpload, documentController.uploadDocument);
router.post('/', handleFileUpload, documentController.uploadDocument); // Add direct POST to /documents for frontend compatibility
router.post('/', handleFileUpload, documentController.uploadDocument);
router.get('/', documentController.getDocuments);
// Analytics endpoints (MUST come before /:id routes to avoid conflicts)

View File

@@ -483,6 +483,37 @@ class FileStorageService {
}
}
/**
 * Generate signed upload URL for direct client uploads
 *
 * Produces a V4 signed PUT URL for the given object path, valid for
 * `expirationMinutes`, using the service's retry wrapper around GCS.
 *
 * @param filePath - object path within the configured bucket
 * @param contentType - MIME type the client must send on upload
 * @param expirationMinutes - URL lifetime in minutes (default 60)
 * @throws Error with a friendly message wrapping the underlying failure
 */
async generateSignedUploadUrl(filePath: string, contentType: string, expirationMinutes: number = 60): Promise<string> {
  try {
    const fileRef = this.storage.bucket(this.bucketName).file(filePath);
    const expiresAt = Date.now() + expirationMinutes * 60 * 1000;
    // Generate signed upload URL with retry logic
    const [signedUrl] = await this.retryOperation(
      () => fileRef.getSignedUrl({
        version: 'v4',
        action: 'write',
        expires: expiresAt,
        contentType,
      }),
      'generate signed upload URL from GCS'
    );
    logger.info(`Generated signed upload URL for file: ${filePath}`, {
      contentType,
      expirationMinutes,
    });
    return signedUrl;
  } catch (error) {
    logger.error(`Error generating signed upload URL for file: ${filePath}`, error);
    throw new Error(`Failed to generate upload URL: ${error instanceof Error ? error.message : 'Unknown error'}`);
  }
}
/**
* Copy file within Google Cloud Storage
*/

23
cors.json Normal file
View File

@@ -0,0 +1,23 @@
[
{
"origin": [
"https://cim-summarizer.web.app",
"https://cim-summarizer.firebaseapp.com",
"http://localhost:3000",
"http://localhost:5173"
],
"method": [
"GET",
"POST",
"PUT",
"DELETE",
"OPTIONS"
],
"responseHeader": [
"Content-Type",
"Authorization",
"X-Requested-With"
],
"maxAgeSeconds": 3600
}
]

6
firebase.json Normal file
View File

@@ -0,0 +1,6 @@
{
"storage": {
"rules": "storage.rules",
"cors": "storage.cors.json"
}
}

View File

@@ -63,6 +63,10 @@
}
],
"rewrites": [
{
"source": "/api/**",
"function": "api"
},
{
"source": "**",
"destination": "/index.html"

View File

@@ -387,19 +387,6 @@ const Dashboard: React.FC = () => {
<span className="text-sm text-white">
Welcome, {user?.name || user?.email}
</span>
{/* Debug buttons - show in production for troubleshooting */}
<button
onClick={handleDebugAuth}
className="bg-yellow-500 hover:bg-yellow-600 text-white px-3 py-1 rounded text-sm"
>
Debug Auth
</button>
<button
onClick={handleTestAPIAuth}
className="bg-blue-500 hover:bg-blue-600 text-white px-3 py-1 rounded text-sm"
>
Test API
</button>
<LogoutButton variant="button" className="bg-error-500 hover:bg-error-600 text-white" />
</div>
</div>

View File

@@ -14,10 +14,10 @@ interface UploadedFile {
progress: number;
error?: string;
documentId?: string; // Real document ID from backend
// GCS-specific fields
gcsError?: boolean;
storageType?: 'gcs' | 'local';
gcsUrl?: string;
// Firebase Storage specific fields
storageError?: boolean;
storageType?: 'firebase' | 'local';
storageUrl?: string;
}
interface DocumentUploadProps {
@@ -92,17 +92,15 @@ const DocumentUpload: React.FC<DocumentUploadProps> = ({
try {
// Upload the document with optimized agentic RAG processing (no strategy selection needed)
const document = await documentService.uploadDocument(
file,
const result = await documentService.uploadDocument(
file,
(progress) => {
setUploadedFiles(prev =>
prev.map(f =>
f.id === uploadedFile.id
? { ...f, progress }
: f
f.id === uploadedFile.id ? { ...f, progress } : f
)
);
},
},
abortController.signal
);
@@ -141,13 +139,13 @@ const DocumentUpload: React.FC<DocumentUploadProps> = ({
} else {
console.error('Upload failed:', error);
// Handle GCS-specific errors
// Handle storage-specific errors
let errorMessage = 'Upload failed';
let isGCSError = false;
let isStorageError = false;
if (GCSErrorHandler.isGCSError(error)) {
errorMessage = GCSErrorHandler.getErrorMessage(error as GCSError);
isGCSError = true;
isStorageError = true;
} else if (error instanceof Error) {
errorMessage = error.message;
}
@@ -159,8 +157,8 @@ const DocumentUpload: React.FC<DocumentUploadProps> = ({
...f,
status: 'error',
error: errorMessage,
// Add GCS error indicator
...(isGCSError && { gcsError: true })
// Add storage error indicator
...(isStorageError && { storageError: true })
}
: f
)
@@ -297,19 +295,19 @@ const DocumentUpload: React.FC<DocumentUploadProps> = ({
}
};
const getStatusText = (status: UploadedFile['status'], error?: string, gcsError?: boolean) => {
const getStatusText = (status: UploadedFile['status'], error?: string, storageError?: boolean) => {
switch (status) {
case 'uploading':
return 'Uploading to Google Cloud Storage...';
return 'Uploading to Firebase Storage...';
case 'uploaded':
return 'Uploaded to GCS ✓';
return 'Uploaded to Firebase Storage ✓';
case 'processing':
return 'Processing with Optimized Agentic RAG...';
return 'Processing with Document AI + Optimized Agentic RAG...';
case 'completed':
return 'Completed ✓';
return 'Completed ✓ (PDF automatically deleted)';
case 'error':
if (error === 'Upload cancelled') return 'Cancelled';
if (gcsError) return 'GCS Error';
if (storageError) return 'Firebase Storage Error';
return 'Error';
default:
return '';
@@ -323,10 +321,10 @@ const DocumentUpload: React.FC<DocumentUploadProps> = ({
<div className="flex items-center">
<CheckCircle className="h-5 w-5 text-blue-600 mr-2" />
<div>
<h3 className="text-sm font-medium text-blue-800">Optimized Agentic RAG Processing</h3>
<h3 className="text-sm font-medium text-blue-800">Document AI + Optimized Agentic RAG Processing</h3>
<p className="text-sm text-blue-700 mt-1">
All documents are automatically processed using our advanced optimized agentic RAG system,
which includes intelligent chunking, vectorization, and multi-agent analysis for the best results.
All documents are automatically processed using Google Document AI for extraction and our advanced optimized agentic RAG system for analysis,
including intelligent chunking, vectorization, and multi-agent CIM review. PDFs are automatically deleted after processing.
</p>
</div>
</div>
@@ -351,7 +349,7 @@ const DocumentUpload: React.FC<DocumentUploadProps> = ({
Drag and drop PDF files here, or click to browse
</p>
<p className="text-xs text-gray-500">
Maximum file size: 50MB Supported format: PDF Stored securely in Google Cloud Storage Automatic Optimized Agentic RAG Processing
Maximum file size: 50MB • Supported format: PDF • Stored securely in Firebase Storage • Automatic Document AI + Optimized Agentic RAG Processing • PDFs deleted after processing
</p>
</div>
@@ -379,8 +377,8 @@ const DocumentUpload: React.FC<DocumentUploadProps> = ({
<div>
<h4 className="text-sm font-medium text-success-800">Upload Complete</h4>
<p className="text-sm text-success-700 mt-1">
Files have been uploaded successfully to Google Cloud Storage! You can now navigate away from this page.
Processing will continue in the background using Optimized Agentic RAG and you can check the status in the Documents tab.
Files have been uploaded successfully to Firebase Storage! You can now navigate away from this page.
Processing will continue in the background using Document AI + Optimized Agentic RAG. PDFs will be automatically deleted after processing to save costs.
</p>
</div>
</div>
@@ -426,10 +424,10 @@ const DocumentUpload: React.FC<DocumentUploadProps> = ({
<div className="flex items-center space-x-1">
{getStatusIcon(file.status)}
<span className="text-xs text-gray-600">
{getStatusText(file.status, file.error, file.gcsError)}
{getStatusText(file.status, file.error, file.storageError)}
</span>
{/* GCS indicator */}
{file.storageType === 'gcs' && (
{/* Firebase Storage indicator */}
{file.storageType === 'firebase' && (
<Cloud className="h-3 w-3 text-blue-500" />
)}
</div>
@@ -452,4 +450,4 @@ const DocumentUpload: React.FC<DocumentUploadProps> = ({
);
};
export default DocumentUpload;
export default DocumentUpload;

View File

@@ -60,7 +60,7 @@ export interface Document {
file_path: string;
file_size: number;
uploaded_at: string;
status: 'uploaded' | 'extracting_text' | 'processing_llm' | 'generating_pdf' | 'completed' | 'failed';
status: 'uploading' | 'uploaded' | 'extracting_text' | 'processing_llm' | 'generating_pdf' | 'completed' | 'failed';
extracted_text?: string;
generated_summary?: string;
summary_markdown_path?: string;
@@ -219,7 +219,7 @@ export class GCSErrorHandler {
class DocumentService {
/**
* Upload a document for processing
* Upload a document using Firebase Storage direct upload (new method)
*/
async uploadDocument(
file: File,
@@ -233,7 +233,137 @@ class DocumentService {
throw new Error('Authentication required. Please log in to upload documents.');
}
console.log('📤 Starting document upload...');
console.log('📤 Starting Firebase Storage direct upload...');
console.log('📤 File:', file.name, 'Size:', file.size, 'Type:', file.type);
console.log('📤 Token available:', !!token);
// Step 1: Get signed upload URL
onProgress?.(5); // 5% - Getting upload URL
console.log('🌐 Making request to upload-url endpoint');
console.log('🌐 Base URL:', API_BASE_URL);
console.log('🌐 Full URL would be:', `${API_BASE_URL}/documents/upload-url`);
console.log('🌐 Request payload:', { fileName: file.name, fileSize: file.size, contentType: file.type });
const uploadUrlResponse = await apiClient.post('/documents/upload-url', {
fileName: file.name,
fileSize: file.size,
contentType: file.type
}, { signal });
const { documentId, uploadUrl } = uploadUrlResponse.data;
console.log('✅ Got signed upload URL for document:', documentId);
// Step 2: Upload directly to Firebase Storage
onProgress?.(10); // 10% - Starting direct upload
await this.uploadToFirebaseStorage(file, uploadUrl, onProgress, signal);
console.log('✅ File uploaded to Firebase Storage');
// Step 3: Confirm upload and trigger processing
onProgress?.(95); // 95% - Confirming upload
const confirmResponse = await apiClient.post(`/documents/${documentId}/confirm-upload`, {}, { signal });
onProgress?.(100); // 100% - Complete
console.log('✅ Upload confirmed and processing started');
return {
id: documentId,
...confirmResponse.data
};
} catch (error: any) {
console.error('❌ Firebase Storage upload failed:', error);
// Handle specific error cases
if (error.name === 'AbortError') {
throw new Error('Upload was cancelled.');
}
if (error.response?.status === 401) {
throw new Error('Authentication required. Please log in again.');
}
if (error.response?.status === 400) {
throw new Error(error.response?.data?.error || 'Invalid request');
}
if (error.response?.status >= 500) {
throw new Error('Server error. Please try again later.');
}
// Generic error fallback
throw new Error(error.response?.data?.error || error.message || 'Upload failed');
}
}
/**
 * Upload file directly to Firebase Storage using signed URL
 *
 * Wraps an XMLHttpRequest PUT in a Promise. Upload progress is mapped to
 * 10%-90% of the overall pipeline progress. Honors an optional
 * AbortSignal: aborting cancels the in-flight request.
 *
 * @param file - file to upload
 * @param uploadUrl - V4 signed PUT URL from the backend
 * @param onProgress - optional overall-progress callback (10-90 range)
 * @param signal - optional AbortSignal to cancel the upload
 */
private async uploadToFirebaseStorage(
  file: File,
  uploadUrl: string,
  onProgress?: (progress: number) => void,
  signal?: AbortSignal
): Promise<void> {
  return new Promise((resolve, reject) => {
    // ROBUSTNESS FIX: fail fast if the caller already cancelled — the
    // original started the request anyway, because the 'abort' listener
    // never fires for a signal that was aborted before registration.
    if (signal?.aborted) {
      reject(new Error('Upload was cancelled'));
      return;
    }
    const xhr = new XMLHttpRequest();
    // Handle upload progress
    xhr.upload.addEventListener('progress', (event) => {
      if (event.lengthComputable && onProgress) {
        // Map Firebase Storage upload to 10%-90% of overall progress
        const uploadProgress = Math.round((event.loaded / event.total) * 80) + 10;
        onProgress(uploadProgress);
      }
    });
    // Handle completion
    xhr.addEventListener('load', () => {
      if (xhr.status >= 200 && xhr.status < 300) {
        resolve();
      } else {
        reject(new Error(`Firebase Storage upload failed: ${xhr.status} ${xhr.statusText}`));
      }
    });
    // Handle errors
    xhr.addEventListener('error', () => {
      reject(new Error('Firebase Storage upload failed: Network error'));
    });
    // Handle abort; { once: true } avoids leaking the listener on a
    // long-lived signal reused across multiple uploads.
    if (signal) {
      signal.addEventListener('abort', () => {
        xhr.abort();
        reject(new Error('Upload was cancelled'));
      }, { once: true });
    }
    // Start upload
    xhr.open('PUT', uploadUrl);
    xhr.setRequestHeader('Content-Type', file.type);
    xhr.send(file);
  });
}
/**
* Legacy multipart upload method (kept for compatibility)
*/
async uploadDocumentLegacy(
file: File,
onProgress?: (progress: number) => void,
signal?: AbortSignal
): Promise<Document> {
try {
// Check authentication before upload
const token = await authService.getToken();
if (!token) {
throw new Error('Authentication required. Please log in to upload documents.');
}
console.log('📤 Starting legacy multipart upload...');
console.log('📤 File:', file.name, 'Size:', file.size, 'Type:', file.type);
console.log('📤 Token available:', !!token);
@@ -243,7 +373,7 @@ class DocumentService {
// Always use optimized agentic RAG processing - no strategy selection needed
formData.append('processingStrategy', 'optimized_agentic_rag');
const response = await apiClient.post('/documents', formData, {
const response = await apiClient.post('/documents/upload', formData, {
headers: {
'Content-Type': 'multipart/form-data',
},
@@ -256,10 +386,10 @@ class DocumentService {
},
});
console.log('✅ Document upload successful:', response.data);
console.log('✅ Legacy document upload successful:', response.data);
return response.data;
} catch (error: any) {
console.error('❌ Document upload failed:', error);
console.error('❌ Legacy document upload failed:', error);
// Provide more specific error messages
if (error.response?.status === 401) {

23
storage.cors.json Normal file
View File

@@ -0,0 +1,23 @@
[
{
"origin": [
"https://cim-summarizer.web.app",
"https://cim-summarizer.firebaseapp.com",
"http://localhost:3000",
"http://localhost:5173"
],
"method": [
"GET",
"POST",
"PUT",
"DELETE",
"OPTIONS"
],
"responseHeader": [
"Content-Type",
"Authorization",
"X-Requested-With"
],
"maxAgeSeconds": 3600
}
]

8
storage.rules Normal file
View File

@@ -0,0 +1,8 @@
rules_version = '2';
service firebase.storage {
match /b/{bucket}/o {
match /{allPaths=**} {
allow read, write: if request.auth != null;
}
}
}