fix(core): Overhaul and fix the end-to-end document processing pipeline
This commit is contained in:
@@ -2,7 +2,7 @@
|
|||||||
"name": "cim-processor-backend",
|
"name": "cim-processor-backend",
|
||||||
"version": "1.0.0",
|
"version": "1.0.0",
|
||||||
"description": "Backend API for CIM Document Processor",
|
"description": "Backend API for CIM Document Processor",
|
||||||
"main": "index.js",
|
"main": "dist/index.js",
|
||||||
"scripts": {
|
"scripts": {
|
||||||
"dev": "ts-node-dev --respawn --transpile-only --max-old-space-size=8192 --expose-gc src/index.ts",
|
"dev": "ts-node-dev --respawn --transpile-only --max-old-space-size=8192 --expose-gc src/index.ts",
|
||||||
"build": "tsc && node src/scripts/prepare-dist.js && cp .puppeteerrc.cjs dist/",
|
"build": "tsc && node src/scripts/prepare-dist.js && cp .puppeteerrc.cjs dist/",
|
||||||
|
|||||||
@@ -7,10 +7,11 @@ import { uploadProgressService } from '../services/uploadProgressService';
|
|||||||
import { uploadMonitoringService } from '../services/uploadMonitoringService';
|
import { uploadMonitoringService } from '../services/uploadMonitoringService';
|
||||||
|
|
||||||
export const documentController = {
|
export const documentController = {
|
||||||
async uploadDocument(req: Request, res: Response): Promise<void> {
|
async getUploadUrl(req: Request, res: Response): Promise<void> {
|
||||||
const startTime = Date.now();
|
console.log('🎯🎯🎯 GET UPLOAD URL ENDPOINT HIT!');
|
||||||
const structuredLogger = new StructuredLogger(req.correlationId);
|
console.log('🎯 Method:', req.method);
|
||||||
|
console.log('🎯 URL:', req.url);
|
||||||
|
console.log('🎯 Headers:', JSON.stringify(req.headers, null, 2));
|
||||||
try {
|
try {
|
||||||
const userId = req.user?.uid;
|
const userId = req.user?.uid;
|
||||||
if (!userId) {
|
if (!userId) {
|
||||||
@@ -21,206 +22,369 @@ export const documentController = {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Check if file was uploaded
|
const { fileName, fileSize, contentType } = req.body;
|
||||||
if (!req.file) {
|
|
||||||
res.status(400).json({
|
if (!fileName || !fileSize || !contentType) {
|
||||||
error: 'No file uploaded',
|
res.status(400).json({
|
||||||
correlationId: req.correlationId
|
error: 'Missing required fields: fileName, fileSize, contentType',
|
||||||
|
correlationId: req.correlationId
|
||||||
});
|
});
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
const file = req.file;
|
// Validate file type
|
||||||
|
if (contentType !== 'application/pdf') {
|
||||||
// Track upload start
|
res.status(400).json({
|
||||||
const uploadEventData: any = {
|
error: 'Only PDF files are supported',
|
||||||
userId,
|
correlationId: req.correlationId
|
||||||
fileInfo: {
|
|
||||||
originalName: file.originalname,
|
|
||||||
size: file.size,
|
|
||||||
mimetype: file.mimetype,
|
|
||||||
},
|
|
||||||
status: 'started',
|
|
||||||
stage: 'upload_initiated',
|
|
||||||
};
|
|
||||||
|
|
||||||
if (req.correlationId) {
|
|
||||||
uploadEventData.correlationId = req.correlationId;
|
|
||||||
}
|
|
||||||
|
|
||||||
uploadMonitoringService.trackUploadEvent(uploadEventData);
|
|
||||||
|
|
||||||
structuredLogger.uploadStart({
|
|
||||||
originalName: file.originalname,
|
|
||||||
size: file.size,
|
|
||||||
mimetype: file.mimetype,
|
|
||||||
}, userId);
|
|
||||||
|
|
||||||
// Always use optimized agentic RAG processing - no strategy selection needed
|
|
||||||
const processingStrategy = 'optimized_agentic_rag';
|
|
||||||
|
|
||||||
// Store file and get file path
|
|
||||||
const storageResult = await fileStorageService.storeFile(file, userId);
|
|
||||||
|
|
||||||
if (!storageResult.success || !storageResult.fileInfo) {
|
|
||||||
const processingTime = Date.now() - startTime;
|
|
||||||
|
|
||||||
// Track upload failure
|
|
||||||
const failureEventData: any = {
|
|
||||||
userId,
|
|
||||||
fileInfo: {
|
|
||||||
originalName: file.originalname,
|
|
||||||
size: file.size,
|
|
||||||
mimetype: file.mimetype,
|
|
||||||
},
|
|
||||||
status: 'failed',
|
|
||||||
stage: 'file_storage',
|
|
||||||
error: {
|
|
||||||
message: storageResult.error || 'Failed to store file',
|
|
||||||
type: 'storage_error',
|
|
||||||
code: 'STORAGE_ERROR',
|
|
||||||
},
|
|
||||||
processingTime,
|
|
||||||
};
|
|
||||||
|
|
||||||
if (req.correlationId) {
|
|
||||||
failureEventData.correlationId = req.correlationId;
|
|
||||||
}
|
|
||||||
|
|
||||||
uploadMonitoringService.trackUploadEvent(failureEventData);
|
|
||||||
|
|
||||||
structuredLogger.uploadError(
|
|
||||||
new Error(storageResult.error || 'Failed to store file'),
|
|
||||||
{
|
|
||||||
originalName: file.originalname,
|
|
||||||
size: file.size,
|
|
||||||
mimetype: file.mimetype,
|
|
||||||
},
|
|
||||||
userId,
|
|
||||||
'file_storage'
|
|
||||||
);
|
|
||||||
|
|
||||||
res.status(500).json({
|
|
||||||
error: 'Failed to store file',
|
|
||||||
correlationId: req.correlationId
|
|
||||||
});
|
});
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Create document record
|
// Validate file size (max 50MB)
|
||||||
|
if (fileSize > 50 * 1024 * 1024) {
|
||||||
|
res.status(400).json({
|
||||||
|
error: 'File size exceeds 50MB limit',
|
||||||
|
correlationId: req.correlationId
|
||||||
|
});
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Generate unique file path
|
||||||
|
const timestamp = Date.now();
|
||||||
|
const sanitizedFileName = fileName.replace(/[^a-zA-Z0-9.-]/g, '_');
|
||||||
|
const filePath = `uploads/${userId}/${timestamp}_${sanitizedFileName}`;
|
||||||
|
|
||||||
|
// Create document record first
|
||||||
const document = await DocumentModel.create({
|
const document = await DocumentModel.create({
|
||||||
user_id: userId,
|
user_id: userId,
|
||||||
original_file_name: file.originalname,
|
original_file_name: fileName,
|
||||||
file_path: storageResult.fileInfo.path,
|
file_path: filePath,
|
||||||
file_size: file.size,
|
file_size: fileSize,
|
||||||
status: 'uploaded'
|
status: 'uploading'
|
||||||
});
|
});
|
||||||
|
|
||||||
// Always auto-process with optimized agentic RAG
|
// Generate signed upload URL
|
||||||
try {
|
const { fileStorageService } = await import('../services/fileStorageService');
|
||||||
const jobId = await jobQueueService.addJob(
|
const uploadUrl = await fileStorageService.generateSignedUploadUrl(filePath, contentType);
|
||||||
'document_processing',
|
|
||||||
{
|
|
||||||
documentId: document.id,
|
|
||||||
userId: userId,
|
|
||||||
options: { strategy: processingStrategy }
|
|
||||||
},
|
|
||||||
0 // Normal priority
|
|
||||||
);
|
|
||||||
logger.info('Document processing job queued with optimized agentic RAG', {
|
|
||||||
documentId: document.id,
|
|
||||||
jobId,
|
|
||||||
strategy: processingStrategy
|
|
||||||
});
|
|
||||||
|
|
||||||
// Update status to indicate it's queued for processing
|
|
||||||
await DocumentModel.updateById(document.id, { status: 'extracting_text' });
|
|
||||||
} catch (error) {
|
|
||||||
logger.error('Failed to queue document processing job', { error, documentId: document.id });
|
|
||||||
}
|
|
||||||
|
|
||||||
// Track upload success
|
console.log('✅ Generated upload URL for document:', document.id);
|
||||||
const processingTime = Date.now() - startTime;
|
|
||||||
const successEventData: any = {
|
|
||||||
userId,
|
|
||||||
fileInfo: {
|
|
||||||
originalName: file.originalname,
|
|
||||||
size: file.size,
|
|
||||||
mimetype: file.mimetype,
|
|
||||||
},
|
|
||||||
status: 'success',
|
|
||||||
stage: 'upload_completed',
|
|
||||||
processingTime,
|
|
||||||
};
|
|
||||||
|
|
||||||
if (req.correlationId) {
|
res.status(200).json({
|
||||||
successEventData.correlationId = req.correlationId;
|
documentId: document.id,
|
||||||
}
|
uploadUrl: uploadUrl,
|
||||||
|
filePath: filePath,
|
||||||
uploadMonitoringService.trackUploadEvent(successEventData);
|
|
||||||
|
|
||||||
structuredLogger.uploadSuccess({
|
|
||||||
originalName: file.originalname,
|
|
||||||
size: file.size,
|
|
||||||
mimetype: file.mimetype,
|
|
||||||
}, userId, processingTime);
|
|
||||||
|
|
||||||
// Return document info
|
|
||||||
res.status(201).json({
|
|
||||||
id: document.id,
|
|
||||||
name: document.original_file_name,
|
|
||||||
originalName: document.original_file_name,
|
|
||||||
status: 'extracting_text',
|
|
||||||
uploadedAt: document.created_at,
|
|
||||||
uploadedBy: userId,
|
|
||||||
fileSize: document.file_size,
|
|
||||||
processingStrategy: processingStrategy,
|
|
||||||
correlationId: req.correlationId || undefined
|
correlationId: req.correlationId || undefined
|
||||||
});
|
});
|
||||||
|
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
const processingTime = Date.now() - startTime;
|
console.log('❌ Get upload URL error:', error);
|
||||||
|
logger.error('Get upload URL failed', {
|
||||||
|
error,
|
||||||
|
correlationId: req.correlationId
|
||||||
|
});
|
||||||
|
|
||||||
// Track upload failure
|
res.status(500).json({
|
||||||
const errorEventData: any = {
|
error: 'Failed to generate upload URL',
|
||||||
userId: req.user?.uid || 'unknown',
|
message: error instanceof Error ? error.message : 'Unknown error',
|
||||||
fileInfo: {
|
correlationId: req.correlationId || undefined
|
||||||
originalName: req.file?.originalname || 'unknown',
|
});
|
||||||
size: req.file?.size || 0,
|
}
|
||||||
mimetype: req.file?.mimetype || 'unknown',
|
},
|
||||||
},
|
|
||||||
status: 'failed',
|
|
||||||
stage: 'upload_error',
|
|
||||||
error: {
|
|
||||||
message: error instanceof Error ? error.message : 'Unknown error',
|
|
||||||
type: 'upload_error',
|
|
||||||
},
|
|
||||||
processingTime,
|
|
||||||
};
|
|
||||||
|
|
||||||
if (req.correlationId) {
|
async confirmUpload(req: Request, res: Response): Promise<void> {
|
||||||
errorEventData.correlationId = req.correlationId;
|
try {
|
||||||
|
const userId = req.user?.uid;
|
||||||
|
if (!userId) {
|
||||||
|
res.status(401).json({
|
||||||
|
error: 'User not authenticated',
|
||||||
|
correlationId: req.correlationId
|
||||||
|
});
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
uploadMonitoringService.trackUploadEvent(errorEventData);
|
const { id: documentId } = req.params;
|
||||||
|
if (!documentId) {
|
||||||
|
res.status(400).json({
|
||||||
|
error: 'Document ID is required',
|
||||||
|
correlationId: req.correlationId
|
||||||
|
});
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
structuredLogger.uploadError(
|
// Get document record
|
||||||
error,
|
const document = await DocumentModel.findById(documentId);
|
||||||
{
|
if (!document) {
|
||||||
originalName: req.file?.originalname || 'unknown',
|
res.status(404).json({
|
||||||
size: req.file?.size || 0,
|
error: 'Document not found',
|
||||||
mimetype: req.file?.mimetype || 'unknown',
|
correlationId: req.correlationId
|
||||||
},
|
});
|
||||||
req.user?.uid || 'unknown',
|
return;
|
||||||
'upload_error'
|
}
|
||||||
|
|
||||||
|
// Verify user owns document
|
||||||
|
if (document.user_id !== userId) {
|
||||||
|
res.status(403).json({
|
||||||
|
error: 'Access denied',
|
||||||
|
correlationId: req.correlationId
|
||||||
|
});
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
console.log('🔄 Starting Document AI processing for:', documentId);
|
||||||
|
|
||||||
|
// Update status to processing
|
||||||
|
await DocumentModel.updateById(documentId, {
|
||||||
|
status: 'processing_llm'
|
||||||
|
});
|
||||||
|
|
||||||
|
// Acknowledge the request immediately
|
||||||
|
res.status(202).json({
|
||||||
|
message: 'Upload confirmed, processing has started.',
|
||||||
|
documentId: documentId,
|
||||||
|
status: 'processing'
|
||||||
|
});
|
||||||
|
|
||||||
|
// Process in the background
|
||||||
|
(async () => {
|
||||||
|
try {
|
||||||
|
// Download file from Firebase Storage for Document AI processing
|
||||||
|
const { fileStorageService } = await import('../services/fileStorageService');
|
||||||
|
|
||||||
|
let fileBuffer: Buffer | null = null;
|
||||||
|
for (let i = 0; i < 3; i++) {
|
||||||
|
await new Promise(resolve => setTimeout(resolve, 2000)); // 2 second delay
|
||||||
|
fileBuffer = await fileStorageService.getFile(document.file_path);
|
||||||
|
if (fileBuffer) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!fileBuffer) {
|
||||||
|
await DocumentModel.updateById(documentId, {
|
||||||
|
status: 'failed',
|
||||||
|
error_message: 'Failed to download uploaded file'
|
||||||
|
});
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Process with Unified Document Processor
|
||||||
|
const { unifiedDocumentProcessor } = await import('../services/unifiedDocumentProcessor');
|
||||||
|
|
||||||
|
const result = await unifiedDocumentProcessor.processDocument(
|
||||||
|
documentId,
|
||||||
|
userId,
|
||||||
|
'', // Text is not needed for this strategy
|
||||||
|
{ strategy: 'optimized_agentic_rag' }
|
||||||
|
);
|
||||||
|
|
||||||
|
if (result.success) {
|
||||||
|
// Update document with results
|
||||||
|
await DocumentModel.updateById(documentId, {
|
||||||
|
status: 'completed',
|
||||||
|
generated_summary: result.summary,
|
||||||
|
processing_completed_at: new Date()
|
||||||
|
});
|
||||||
|
|
||||||
|
// 🗑️ DELETE PDF after successful processing
|
||||||
|
try {
|
||||||
|
await fileStorageService.deleteFile(document.file_path);
|
||||||
|
console.log('✅ PDF deleted after successful processing:', document.file_path);
|
||||||
|
} catch (deleteError) {
|
||||||
|
console.log('⚠️ Failed to delete PDF file:', deleteError);
|
||||||
|
logger.warn('Failed to delete PDF after processing', {
|
||||||
|
filePath: document.file_path,
|
||||||
|
documentId,
|
||||||
|
error: deleteError
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
console.log('✅ Document AI processing completed successfully');
|
||||||
|
} else {
|
||||||
|
await DocumentModel.updateById(documentId, {
|
||||||
|
status: 'failed',
|
||||||
|
error_message: result.error
|
||||||
|
});
|
||||||
|
|
||||||
|
// Also delete PDF on processing failure to avoid storage costs
|
||||||
|
try {
|
||||||
|
await fileStorageService.deleteFile(document.file_path);
|
||||||
|
console.log('🗑️ PDF deleted after processing failure');
|
||||||
|
} catch (deleteError) {
|
||||||
|
console.log('⚠️ Failed to delete PDF file after error:', deleteError);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (error) {
|
||||||
|
console.log('❌ Background processing error:', error);
|
||||||
|
logger.error('Background processing failed', {
|
||||||
|
error,
|
||||||
|
documentId
|
||||||
|
});
|
||||||
|
await DocumentModel.updateById(documentId, {
|
||||||
|
status: 'failed',
|
||||||
|
error_message: 'Background processing failed'
|
||||||
|
});
|
||||||
|
}
|
||||||
|
})();
|
||||||
|
|
||||||
|
} catch (error) {
|
||||||
|
console.log('❌ Confirm upload error:', error);
|
||||||
|
logger.error('Confirm upload failed', {
|
||||||
|
error,
|
||||||
|
correlationId: req.correlationId
|
||||||
|
});
|
||||||
|
|
||||||
|
res.status(500).json({
|
||||||
|
error: 'Upload confirmation failed',
|
||||||
|
message: error instanceof Error ? error.message : 'Unknown error',
|
||||||
|
correlationId: req.correlationId || undefined
|
||||||
|
});
|
||||||
|
}
|
||||||
|
},
|
||||||
|
|
||||||
|
async uploadDocument(req: Request, res: Response): Promise<void> {
|
||||||
|
const startTime = Date.now();
|
||||||
|
|
||||||
|
// 🔍 COMPREHENSIVE DEBUG: Log everything about the request
|
||||||
|
console.log('🚀 =========================');
|
||||||
|
console.log('🚀 DOCUMENT AI UPLOAD STARTED');
|
||||||
|
console.log('🚀 Method:', req.method);
|
||||||
|
console.log('🚀 URL:', req.url);
|
||||||
|
console.log('🚀 Content-Type:', req.get('Content-Type'));
|
||||||
|
console.log('🚀 Content-Length:', req.get('Content-Length'));
|
||||||
|
console.log('🚀 Authorization header present:', !!req.get('Authorization'));
|
||||||
|
console.log('🚀 User from token:', req.user?.uid || 'NOT_FOUND');
|
||||||
|
|
||||||
|
// Debug body in detail
|
||||||
|
console.log('🚀 Has body:', !!req.body);
|
||||||
|
console.log('🚀 Body type:', typeof req.body);
|
||||||
|
console.log('🚀 Body constructor:', req.body?.constructor?.name);
|
||||||
|
console.log('🚀 Body length:', req.body?.length || 0);
|
||||||
|
console.log('🚀 Is Buffer?:', Buffer.isBuffer(req.body));
|
||||||
|
|
||||||
|
// Debug all headers
|
||||||
|
console.log('🚀 All headers:', JSON.stringify(req.headers, null, 2));
|
||||||
|
|
||||||
|
// Debug request properties
|
||||||
|
console.log('🚀 Request readable:', req.readable);
|
||||||
|
console.log('🚀 Request complete:', req.complete);
|
||||||
|
|
||||||
|
// If body exists, show first few bytes
|
||||||
|
if (req.body && req.body.length > 0) {
|
||||||
|
const preview = req.body.slice(0, 100).toString('hex');
|
||||||
|
console.log('🚀 Body preview (hex):', preview);
|
||||||
|
|
||||||
|
// Try to see if it contains multipart boundary
|
||||||
|
const bodyStr = req.body.toString('utf8', 0, Math.min(500, req.body.length));
|
||||||
|
console.log('🚀 Body preview (string):', bodyStr.substring(0, 200));
|
||||||
|
}
|
||||||
|
|
||||||
|
console.log('🚀 =========================');
|
||||||
|
|
||||||
|
try {
|
||||||
|
const userId = req.user?.uid;
|
||||||
|
if (!userId) {
|
||||||
|
console.log('❌ Authentication failed - no userId');
|
||||||
|
res.status(401).json({
|
||||||
|
error: 'User not authenticated',
|
||||||
|
correlationId: req.correlationId
|
||||||
|
});
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
console.log('✅ Authentication successful for user:', userId);
|
||||||
|
|
||||||
|
// Get raw body buffer for Document AI processing
|
||||||
|
const rawBody = req.body;
|
||||||
|
if (!rawBody || rawBody.length === 0) {
|
||||||
|
res.status(400).json({
|
||||||
|
error: 'No file data received',
|
||||||
|
correlationId: req.correlationId,
|
||||||
|
debug: {
|
||||||
|
method: req.method,
|
||||||
|
contentType: req.get('Content-Type'),
|
||||||
|
contentLength: req.get('Content-Length'),
|
||||||
|
hasRawBody: !!rawBody,
|
||||||
|
rawBodySize: rawBody?.length || 0,
|
||||||
|
bodyType: typeof rawBody
|
||||||
|
}
|
||||||
|
});
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
console.log('✅ Found raw body buffer:', rawBody.length, 'bytes');
|
||||||
|
|
||||||
|
// Create document record first
|
||||||
|
const document = await DocumentModel.create({
|
||||||
|
user_id: userId,
|
||||||
|
original_file_name: 'uploaded-document.pdf',
|
||||||
|
file_path: '',
|
||||||
|
file_size: rawBody.length,
|
||||||
|
status: 'processing_llm'
|
||||||
|
});
|
||||||
|
|
||||||
|
console.log('✅ Document record created:', document.id);
|
||||||
|
|
||||||
|
// Process with Document AI directly
|
||||||
|
const { DocumentAiGenkitProcessor } = await import('../services/documentAiGenkitProcessor');
|
||||||
|
const processor = new DocumentAiGenkitProcessor();
|
||||||
|
|
||||||
|
console.log('✅ Starting Document AI processing...');
|
||||||
|
const result = await processor.processDocument(
|
||||||
|
document.id,
|
||||||
|
userId,
|
||||||
|
rawBody,
|
||||||
|
'uploaded-document.pdf',
|
||||||
|
'application/pdf'
|
||||||
);
|
);
|
||||||
|
|
||||||
|
if (result.success) {
|
||||||
|
await DocumentModel.updateById(document.id, {
|
||||||
|
status: 'completed',
|
||||||
|
generated_summary: result.content,
|
||||||
|
processing_completed_at: new Date()
|
||||||
|
});
|
||||||
|
|
||||||
|
console.log('✅ Document AI processing completed successfully');
|
||||||
|
|
||||||
|
res.status(201).json({
|
||||||
|
id: document.id,
|
||||||
|
name: 'uploaded-document.pdf',
|
||||||
|
originalName: 'uploaded-document.pdf',
|
||||||
|
status: 'completed',
|
||||||
|
uploadedAt: document.created_at,
|
||||||
|
uploadedBy: userId,
|
||||||
|
fileSize: rawBody.length,
|
||||||
|
summary: result.content,
|
||||||
|
correlationId: req.correlationId || undefined
|
||||||
|
});
|
||||||
|
return;
|
||||||
|
} else {
|
||||||
|
console.log('❌ Document AI processing failed:', result.error);
|
||||||
|
await DocumentModel.updateById(document.id, {
|
||||||
|
status: 'failed',
|
||||||
|
error_message: result.error
|
||||||
|
});
|
||||||
|
|
||||||
|
res.status(500).json({
|
||||||
|
error: 'Document processing failed',
|
||||||
|
message: result.error,
|
||||||
|
correlationId: req.correlationId || undefined
|
||||||
|
});
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
} catch (error) {
|
||||||
|
console.log('❌ Upload error:', error);
|
||||||
|
|
||||||
logger.error('Upload document failed', {
|
logger.error('Upload document failed', {
|
||||||
error,
|
error,
|
||||||
correlationId: req.correlationId
|
correlationId: req.correlationId
|
||||||
});
|
});
|
||||||
|
|
||||||
res.status(500).json({
|
res.status(500).json({
|
||||||
error: 'Upload failed',
|
error: 'Upload failed',
|
||||||
|
message: error instanceof Error ? error.message : 'Unknown error',
|
||||||
correlationId: req.correlationId || undefined
|
correlationId: req.correlationId || undefined
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
@@ -552,4 +716,4 @@ export const documentController = {
|
|||||||
throw new Error('Failed to get document text');
|
throw new Error('Failed to get document text');
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
@@ -18,20 +18,17 @@ import { notFoundHandler } from './middleware/notFoundHandler';
|
|||||||
|
|
||||||
const app = express();
|
const app = express();
|
||||||
|
|
||||||
// Enable trust proxy to ensure Express works correctly behind the proxy
|
// Add this middleware to log all incoming requests
|
||||||
|
app.use((req, res, next) => {
|
||||||
|
console.log(`Incoming request: ${req.method} ${req.path}`);
|
||||||
|
next();
|
||||||
|
});
|
||||||
|
|
||||||
|
// Enable trust proxy to ensure Express works correctly behind a proxy
|
||||||
app.set('trust proxy', 1);
|
app.set('trust proxy', 1);
|
||||||
|
|
||||||
// Security middleware
|
// Security middleware
|
||||||
app.use(helmet({
|
app.use(helmet());
|
||||||
contentSecurityPolicy: {
|
|
||||||
directives: {
|
|
||||||
defaultSrc: ["'self'"],
|
|
||||||
styleSrc: ["'self'", "'unsafe-inline'"],
|
|
||||||
scriptSrc: ["'self'"],
|
|
||||||
imgSrc: ["'self'", "data:", "https:"],
|
|
||||||
},
|
|
||||||
},
|
|
||||||
}));
|
|
||||||
|
|
||||||
// CORS configuration
|
// CORS configuration
|
||||||
const allowedOrigins = [
|
const allowedOrigins = [
|
||||||
@@ -43,13 +40,10 @@ const allowedOrigins = [
|
|||||||
|
|
||||||
app.use(cors({
|
app.use(cors({
|
||||||
origin: function (origin, callback) {
|
origin: function (origin, callback) {
|
||||||
console.log('🌐 CORS request from origin:', origin);
|
|
||||||
|
|
||||||
if (!origin || allowedOrigins.indexOf(origin) !== -1) {
|
if (!origin || allowedOrigins.indexOf(origin) !== -1) {
|
||||||
console.log('✅ CORS allowed for origin:', origin);
|
|
||||||
callback(null, true);
|
callback(null, true);
|
||||||
} else {
|
} else {
|
||||||
console.log('❌ CORS blocked origin:', origin);
|
logger.warn(`CORS blocked for origin: ${origin}`);
|
||||||
callback(new Error('Not allowed by CORS'));
|
callback(new Error('Not allowed by CORS'));
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
@@ -62,7 +56,7 @@ app.use(cors({
|
|||||||
// Rate limiting
|
// Rate limiting
|
||||||
const limiter = rateLimit({
|
const limiter = rateLimit({
|
||||||
windowMs: 15 * 60 * 1000, // 15 minutes
|
windowMs: 15 * 60 * 1000, // 15 minutes
|
||||||
max: 1000, // limit each IP to 1000 requests per windowMs (increased for testing)
|
max: 1000,
|
||||||
message: {
|
message: {
|
||||||
error: 'Too many requests from this IP, please try again later.',
|
error: 'Too many requests from this IP, please try again later.',
|
||||||
},
|
},
|
||||||
@@ -72,27 +66,6 @@ const limiter = rateLimit({
|
|||||||
|
|
||||||
app.use(limiter);
|
app.use(limiter);
|
||||||
|
|
||||||
// Body parsing middleware - only for non-multipart requests
|
|
||||||
app.use((req, res, next) => {
|
|
||||||
if (req.headers['content-type'] && req.headers['content-type'].includes('multipart/form-data')) {
|
|
||||||
// Skip body parsing for multipart requests - let multer handle it
|
|
||||||
next();
|
|
||||||
} else {
|
|
||||||
// Parse JSON and URL-encoded bodies for other requests
|
|
||||||
express.json({ limit: '10mb' })(req, res, next);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
app.use((req, res, next) => {
|
|
||||||
if (req.headers['content-type'] && req.headers['content-type'].includes('multipart/form-data')) {
|
|
||||||
// Skip body parsing for multipart requests - let multer handle it
|
|
||||||
next();
|
|
||||||
} else {
|
|
||||||
// Parse URL-encoded bodies for other requests
|
|
||||||
express.urlencoded({ extended: true, limit: '10mb' })(req, res, next);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
// Logging middleware
|
// Logging middleware
|
||||||
app.use(morgan('combined', {
|
app.use(morgan('combined', {
|
||||||
stream: {
|
stream: {
|
||||||
@@ -100,17 +73,12 @@ app.use(morgan('combined', {
|
|||||||
},
|
},
|
||||||
}));
|
}));
|
||||||
|
|
||||||
// Request debugging middleware
|
// CRITICAL: Add body parsing BEFORE routes
|
||||||
app.use((req, res, next) => {
|
app.use(express.json({ limit: '10mb' }));
|
||||||
console.log('📥 Incoming request:', req.method, req.url);
|
app.use(express.urlencoded({ extended: true, limit: '10mb' }));
|
||||||
console.log('📥 Request headers:', Object.keys(req.headers));
|
|
||||||
console.log('📥 Content-Type:', req.get('Content-Type'));
|
|
||||||
console.log('📥 Authorization:', req.get('Authorization') ? 'Present' : 'Missing');
|
|
||||||
next();
|
|
||||||
});
|
|
||||||
|
|
||||||
// Health check endpoint
|
// Health check endpoint
|
||||||
app.get('/health', (_req, res) => { // _req to fix TS6133
|
app.get('/health', (_req, res) => {
|
||||||
res.status(200).json({
|
res.status(200).json({
|
||||||
status: 'ok',
|
status: 'ok',
|
||||||
timestamp: new Date().toISOString(),
|
timestamp: new Date().toISOString(),
|
||||||
@@ -119,53 +87,23 @@ app.get('/health', (_req, res) => { // _req to fix TS6133
|
|||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
// Agentic RAG health check endpoints
|
// API Routes
|
||||||
app.get('/health/agentic-rag', async (_req, res) => {
|
|
||||||
try {
|
|
||||||
const { agenticRAGDatabaseService } = await import('./services/agenticRAGDatabaseService');
|
|
||||||
const healthStatus = await agenticRAGDatabaseService.getHealthStatus();
|
|
||||||
res.json(healthStatus);
|
|
||||||
} catch (error) {
|
|
||||||
logger.error('Agentic RAG health check failed', { error });
|
|
||||||
res.status(500).json({
|
|
||||||
error: 'Health check failed',
|
|
||||||
status: 'unhealthy',
|
|
||||||
timestamp: new Date().toISOString()
|
|
||||||
});
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
app.get('/health/agentic-rag/metrics', async (_req, res) => {
|
|
||||||
try {
|
|
||||||
const { agenticRAGDatabaseService } = await import('./services/agenticRAGDatabaseService');
|
|
||||||
const startDate = new Date(Date.now() - 30 * 24 * 60 * 60 * 1000); // 30 days ago
|
|
||||||
const metrics = await agenticRAGDatabaseService.generatePerformanceReport(startDate, new Date());
|
|
||||||
res.json(metrics);
|
|
||||||
} catch (error) {
|
|
||||||
logger.error('Agentic RAG metrics retrieval failed', { error });
|
|
||||||
res.status(500).json({ error: 'Metrics retrieval failed' });
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
// API routes - remove the /api prefix as it's handled by Firebase
|
|
||||||
app.use('/documents', documentRoutes);
|
app.use('/documents', documentRoutes);
|
||||||
app.use('/vector', vectorRoutes);
|
app.use('/vector', vectorRoutes);
|
||||||
app.use('/monitoring', monitoringRoutes);
|
app.use('/monitoring', monitoringRoutes);
|
||||||
|
|
||||||
|
|
||||||
import * as functions from 'firebase-functions';
|
import * as functions from 'firebase-functions';
|
||||||
|
import { onRequest } from 'firebase-functions/v2/https';
|
||||||
|
|
||||||
// API root endpoint
|
// API root endpoint
|
||||||
app.get('/', (_req, res) => { // _req to fix TS6133
|
app.get('/', (_req, res) => {
|
||||||
res.json({
|
res.json({
|
||||||
message: 'CIM Document Processor API',
|
message: 'CIM Document Processor API',
|
||||||
version: '1.0.0',
|
version: '1.0.0',
|
||||||
endpoints: {
|
endpoints: {
|
||||||
auth: '/auth',
|
|
||||||
documents: '/documents',
|
documents: '/documents',
|
||||||
health: '/health',
|
health: '/health',
|
||||||
agenticRagHealth: '/health/agentic-rag',
|
|
||||||
agenticRagMetrics: '/health/agentic-rag/metrics',
|
|
||||||
monitoring: '/monitoring',
|
monitoring: '/monitoring',
|
||||||
},
|
},
|
||||||
});
|
});
|
||||||
@@ -177,26 +115,11 @@ app.use(notFoundHandler);
|
|||||||
// Global error handler (must be last)
|
// Global error handler (must be last)
|
||||||
app.use(errorHandler);
|
app.use(errorHandler);
|
||||||
|
|
||||||
// Initialize job queue service for document processing
|
// Configure Firebase Functions v2 for larger uploads
|
||||||
import { jobQueueService } from './services/jobQueueService';
|
export const api = onRequest({
|
||||||
|
timeoutSeconds: 540, // 9 minutes
|
||||||
// Start the job queue service asynchronously to avoid blocking function startup
|
memory: '2GiB',
|
||||||
// Use a longer delay to ensure the function is fully initialized
|
cpu: 1,
|
||||||
setTimeout(() => {
|
maxInstances: 10,
|
||||||
try {
|
cors: true
|
||||||
jobQueueService.start();
|
}, app);
|
||||||
logger.info('Job queue service started successfully');
|
|
||||||
} catch (error) {
|
|
||||||
logger.error('Failed to start job queue service', { error });
|
|
||||||
}
|
|
||||||
}, 5000);
|
|
||||||
|
|
||||||
// Listen on a port when not in a Firebase Function environment or when PORT is explicitly set
|
|
||||||
if (!process.env['FUNCTION_TARGET'] || process.env['PORT']) {
|
|
||||||
const port = process.env['PORT'] || 5001;
|
|
||||||
app.listen(port, () => {
|
|
||||||
logger.info(`API server listening on port ${port}`);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
export const api = functions.https.onRequest(app);
|
|
||||||
@@ -11,6 +11,18 @@ export const errorHandler = (
|
|||||||
req: Request,
|
req: Request,
|
||||||
res: Response
|
res: Response
|
||||||
): void => {
|
): void => {
|
||||||
|
console.log('💥💥💥 MAXIMUM DEBUG ERROR HANDLER HIT 💥💥💥');
|
||||||
|
console.log('💥 Error name:', err.name);
|
||||||
|
console.log('💥 Error message:', err.message);
|
||||||
|
console.log('💥 Error code:', (err as any).code);
|
||||||
|
console.log('💥 Error type:', typeof err);
|
||||||
|
console.log('💥 Error constructor:', err.constructor.name);
|
||||||
|
console.log('💥 Error stack:', err.stack);
|
||||||
|
console.log('💥 Request URL:', req.url);
|
||||||
|
console.log('💥 Request method:', req.method);
|
||||||
|
console.log('💥 Full error object:', JSON.stringify(err, Object.getOwnPropertyNames(err), 2));
|
||||||
|
console.log('💥💥💥 END ERROR DEBUG 💥💥💥');
|
||||||
|
|
||||||
let error = { ...err };
|
let error = { ...err };
|
||||||
error.message = err.message;
|
error.message = err.message;
|
||||||
|
|
||||||
@@ -53,6 +65,13 @@ export const errorHandler = (
|
|||||||
error = { message, statusCode: 401 } as AppError;
|
error = { message, statusCode: 401 } as AppError;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Multer errors (check if multer is imported anywhere)
|
||||||
|
if (err.name === 'MulterError' || (err as any).code === 'UNEXPECTED_END_OF_FORM') {
|
||||||
|
console.log('🚨 MULTER ERROR CAUGHT:', err.message);
|
||||||
|
const message = `File upload failed: ${err.message}`;
|
||||||
|
error = { message, statusCode: 400 } as AppError;
|
||||||
|
}
|
||||||
|
|
||||||
// Default error
|
// Default error
|
||||||
const statusCode = error.statusCode || 500;
|
const statusCode = error.statusCode || 500;
|
||||||
const message = error.message || 'Server Error';
|
const message = error.message || 'Server Error';
|
||||||
|
|||||||
@@ -13,9 +13,15 @@ if (!fs.existsSync(uploadDir)) {
|
|||||||
|
|
||||||
// File filter function
|
// File filter function
|
||||||
const fileFilter = (req: Request, file: any, cb: multer.FileFilterCallback) => {
|
const fileFilter = (req: Request, file: any, cb: multer.FileFilterCallback) => {
|
||||||
console.log('🔍 File filter called for:', file.originalname);
|
console.log('🔍 ===== FILE FILTER CALLED =====');
|
||||||
|
console.log('🔍 File originalname:', file.originalname);
|
||||||
console.log('🔍 File mimetype:', file.mimetype);
|
console.log('🔍 File mimetype:', file.mimetype);
|
||||||
console.log('🔍 File size:', file.size);
|
console.log('🔍 File size:', file.size);
|
||||||
|
console.log('🔍 File encoding:', file.encoding);
|
||||||
|
console.log('🔍 File fieldname:', file.fieldname);
|
||||||
|
console.log('🔍 Request Content-Type:', req.get('Content-Type'));
|
||||||
|
console.log('🔍 Request Content-Length:', req.get('Content-Length'));
|
||||||
|
console.log('🔍 ===========================');
|
||||||
|
|
||||||
// Check file type - allow PDF and text files for testing
|
// Check file type - allow PDF and text files for testing
|
||||||
const allowedTypes = ['application/pdf', 'text/plain', 'text/html'];
|
const allowedTypes = ['application/pdf', 'text/plain', 'text/html'];
|
||||||
@@ -68,6 +74,14 @@ const upload = multer({
|
|||||||
|
|
||||||
// Error handling middleware for multer
|
// Error handling middleware for multer
|
||||||
export const handleUploadError = (error: any, req: Request, res: Response, next: NextFunction): void => {
|
export const handleUploadError = (error: any, req: Request, res: Response, next: NextFunction): void => {
|
||||||
|
console.log('🚨 =============================');
|
||||||
|
console.log('🚨 UPLOAD ERROR HANDLER CALLED');
|
||||||
|
console.log('🚨 Error type:', error?.constructor?.name);
|
||||||
|
console.log('🚨 Error message:', error?.message);
|
||||||
|
console.log('🚨 Error code:', error?.code);
|
||||||
|
console.log('🚨 Is MulterError:', error instanceof multer.MulterError);
|
||||||
|
console.log('🚨 =============================');
|
||||||
|
|
||||||
if (error instanceof multer.MulterError) {
|
if (error instanceof multer.MulterError) {
|
||||||
logger.error('Multer error during file upload:', {
|
logger.error('Multer error during file upload:', {
|
||||||
error: error.message,
|
error: error.message,
|
||||||
@@ -129,12 +143,14 @@ export const handleUploadError = (error: any, req: Request, res: Response, next:
|
|||||||
|
|
||||||
// Main upload middleware with timeout handling
|
// Main upload middleware with timeout handling
|
||||||
export const uploadMiddleware = (req: Request, res: Response, next: NextFunction) => {
|
export const uploadMiddleware = (req: Request, res: Response, next: NextFunction) => {
|
||||||
console.log('📤 Upload middleware called');
|
console.log('📤 =============================');
|
||||||
|
console.log('📤 UPLOAD MIDDLEWARE CALLED');
|
||||||
console.log('📤 Request method:', req.method);
|
console.log('📤 Request method:', req.method);
|
||||||
console.log('📤 Request URL:', req.url);
|
console.log('📤 Request URL:', req.url);
|
||||||
console.log('📤 Content-Type:', req.get('Content-Type'));
|
console.log('📤 Content-Type:', req.get('Content-Type'));
|
||||||
console.log('📤 Content-Length:', req.get('Content-Length'));
|
console.log('📤 Content-Length:', req.get('Content-Length'));
|
||||||
console.log('📤 User-Agent:', req.get('User-Agent'));
|
console.log('📤 User-Agent:', req.get('User-Agent'));
|
||||||
|
console.log('📤 =============================');
|
||||||
|
|
||||||
// Set a timeout for the upload
|
// Set a timeout for the upload
|
||||||
const uploadTimeout = setTimeout(() => {
|
const uploadTimeout = setTimeout(() => {
|
||||||
@@ -155,12 +171,25 @@ export const uploadMiddleware = (req: Request, res: Response, next: NextFunction
|
|||||||
clearTimeout(uploadTimeout);
|
clearTimeout(uploadTimeout);
|
||||||
if (err) {
|
if (err) {
|
||||||
console.log('❌ Upload middleware error:', err);
|
console.log('❌ Upload middleware error:', err);
|
||||||
|
console.log('❌ Error details:', {
|
||||||
|
name: err.name,
|
||||||
|
message: err.message,
|
||||||
|
code: err.code,
|
||||||
|
stack: err.stack?.split('\n')[0]
|
||||||
|
});
|
||||||
} else {
|
} else {
|
||||||
console.log('✅ Upload middleware completed successfully');
|
console.log('✅ Upload middleware completed successfully');
|
||||||
|
console.log('✅ File after multer processing:', {
|
||||||
|
hasFile: !!req.file,
|
||||||
|
filename: req.file?.originalname,
|
||||||
|
size: req.file?.size,
|
||||||
|
mimetype: req.file?.mimetype
|
||||||
|
});
|
||||||
}
|
}
|
||||||
originalNext(err);
|
originalNext(err);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
console.log('🔄 Calling multer.single("document")...');
|
||||||
upload.single('document')(req, res, next);
|
upload.single('document')(req, res, next);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
import { v4 as uuidv4 } from 'uuid';
|
import { v4 as uuidv4 } from 'uuid';
|
||||||
import { logger } from '../utils/logger';
|
import { logger } from '../utils/logger';
|
||||||
import pool from '../config/database';
|
import { getSupabaseServiceClient } from '../config/supabase';
|
||||||
|
|
||||||
export interface DocumentChunk {
|
export interface DocumentChunk {
|
||||||
id: string;
|
id: string;
|
||||||
@@ -15,577 +15,21 @@ export interface DocumentChunk {
|
|||||||
updatedAt: Date;
|
updatedAt: Date;
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface VectorSearchResult {
|
|
||||||
documentId: string;
|
|
||||||
similarityScore: number;
|
|
||||||
chunkContent: string;
|
|
||||||
metadata: Record<string, any>;
|
|
||||||
}
|
|
||||||
|
|
||||||
export interface DocumentSimilarity {
|
|
||||||
id: string;
|
|
||||||
sourceDocumentId: string;
|
|
||||||
targetDocumentId: string;
|
|
||||||
similarityScore: number;
|
|
||||||
similarityType: string;
|
|
||||||
metadata: Record<string, any>;
|
|
||||||
createdAt: Date;
|
|
||||||
}
|
|
||||||
|
|
||||||
export interface IndustryEmbedding {
|
|
||||||
id: string;
|
|
||||||
industryName: string;
|
|
||||||
industryDescription?: string;
|
|
||||||
embedding: number[];
|
|
||||||
documentCount: number;
|
|
||||||
averageSimilarity?: number;
|
|
||||||
createdAt: Date;
|
|
||||||
updatedAt: Date;
|
|
||||||
}
|
|
||||||
|
|
||||||
export class VectorDatabaseModel {
|
export class VectorDatabaseModel {
|
||||||
/**
|
|
||||||
* Store document chunks with embeddings
|
|
||||||
*/
|
|
||||||
static async storeDocumentChunks(chunks: Omit<DocumentChunk, 'id' | 'createdAt' | 'updatedAt'>[]): Promise<void> {
|
static async storeDocumentChunks(chunks: Omit<DocumentChunk, 'id' | 'createdAt' | 'updatedAt'>[]): Promise<void> {
|
||||||
const client = await pool.connect();
|
const supabase = getSupabaseServiceClient();
|
||||||
|
const { data, error } = await supabase
|
||||||
try {
|
.from('document_chunks')
|
||||||
await client.query('BEGIN');
|
.insert(chunks.map(chunk => ({
|
||||||
|
...chunk,
|
||||||
for (const chunk of chunks) {
|
embedding: `[${chunk.embedding.join(',')}]` // Format for pgvector
|
||||||
// Ensure embedding is properly formatted for pgvector
|
})));
|
||||||
const embeddingArray = Array.isArray(chunk.embedding) ? chunk.embedding : [];
|
|
||||||
|
if (error) {
|
||||||
// Validate embedding dimensions (should be 1536 for text-embedding-3-small)
|
|
||||||
if (embeddingArray.length !== 1536) {
|
|
||||||
logger.warn(`Embedding dimension mismatch: expected 1536, got ${embeddingArray.length}`);
|
|
||||||
// Pad or truncate to 1536 dimensions if necessary
|
|
||||||
const paddedEmbedding = new Array(1536).fill(0);
|
|
||||||
embeddingArray.forEach((val, index) => {
|
|
||||||
if (index < 1536) paddedEmbedding[index] = val;
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
// Format embedding properly for pgvector - must be a JSON array string
|
|
||||||
const embeddingString = JSON.stringify(embeddingArray);
|
|
||||||
|
|
||||||
await client.query(`
|
|
||||||
INSERT INTO document_chunks (
|
|
||||||
id, document_id, content, metadata, embedding,
|
|
||||||
chunk_index, section, page_number
|
|
||||||
) VALUES ($1, $2, $3, $4, $5::vector, $6, $7, $8)
|
|
||||||
ON CONFLICT (id) DO UPDATE SET
|
|
||||||
content = EXCLUDED.content,
|
|
||||||
metadata = EXCLUDED.metadata,
|
|
||||||
embedding = EXCLUDED.embedding,
|
|
||||||
section = EXCLUDED.section,
|
|
||||||
page_number = EXCLUDED.page_number,
|
|
||||||
updated_at = CURRENT_TIMESTAMP
|
|
||||||
`, [
|
|
||||||
uuidv4(),
|
|
||||||
chunk.documentId,
|
|
||||||
chunk.content,
|
|
||||||
JSON.stringify(chunk.metadata),
|
|
||||||
embeddingString, // Pass as JSON string for pgvector
|
|
||||||
chunk.chunkIndex,
|
|
||||||
chunk.section,
|
|
||||||
chunk.pageNumber
|
|
||||||
]);
|
|
||||||
}
|
|
||||||
|
|
||||||
await client.query('COMMIT');
|
|
||||||
logger.info(`Stored ${chunks.length} document chunks in vector database`);
|
|
||||||
} catch (error) {
|
|
||||||
await client.query('ROLLBACK');
|
|
||||||
logger.error('Failed to store document chunks', error);
|
logger.error('Failed to store document chunks', error);
|
||||||
throw error;
|
throw error;
|
||||||
} finally {
|
|
||||||
client.release();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
logger.info(`Stored ${chunks.length} document chunks in vector database`);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
/**
|
|
||||||
* Search for similar content using vector similarity
|
|
||||||
*/
|
|
||||||
static async searchSimilarContent(
|
|
||||||
queryEmbedding: number[],
|
|
||||||
options: {
|
|
||||||
documentId?: string;
|
|
||||||
limit?: number;
|
|
||||||
similarityThreshold?: number;
|
|
||||||
filters?: Record<string, any>;
|
|
||||||
} = {}
|
|
||||||
): Promise<VectorSearchResult[]> {
|
|
||||||
const {
|
|
||||||
documentId,
|
|
||||||
limit = 10,
|
|
||||||
similarityThreshold = 0.7,
|
|
||||||
filters = {}
|
|
||||||
} = options;
|
|
||||||
|
|
||||||
// Ensure embedding is properly formatted
|
|
||||||
const embeddingArray = Array.isArray(queryEmbedding) ? queryEmbedding : [];
|
|
||||||
|
|
||||||
// Validate embedding dimensions
|
|
||||||
if (embeddingArray.length !== 1536) {
|
|
||||||
logger.warn(`Query embedding dimension mismatch: expected 1536, got ${embeddingArray.length}`);
|
|
||||||
// Pad or truncate to 1536 dimensions if necessary
|
|
||||||
const paddedEmbedding = new Array(1536).fill(0);
|
|
||||||
embeddingArray.forEach((val, index) => {
|
|
||||||
if (index < 1536) paddedEmbedding[index] = val;
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
let query = `
|
|
||||||
SELECT
|
|
||||||
dc.document_id,
|
|
||||||
1 - (dc.embedding <=> $1::vector) as similarity_score,
|
|
||||||
dc.content as chunk_content,
|
|
||||||
dc.metadata
|
|
||||||
FROM document_chunks dc
|
|
||||||
WHERE dc.embedding IS NOT NULL
|
|
||||||
`;
|
|
||||||
|
|
||||||
const params: any[] = [embeddingArray];
|
|
||||||
let paramIndex = 2;
|
|
||||||
|
|
||||||
if (documentId) {
|
|
||||||
query += ` AND dc.document_id = $${paramIndex}`;
|
|
||||||
params.push(documentId);
|
|
||||||
paramIndex++;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Add metadata filters
|
|
||||||
Object.entries(filters).forEach(([key, value]) => {
|
|
||||||
query += ` AND dc.metadata->>'${key}' = $${paramIndex}`;
|
|
||||||
params.push(value);
|
|
||||||
paramIndex++;
|
|
||||||
});
|
|
||||||
|
|
||||||
query += `
|
|
||||||
AND 1 - (dc.embedding <=> $1::vector) >= $${paramIndex}
|
|
||||||
ORDER BY dc.embedding <=> $1::vector
|
|
||||||
LIMIT $${paramIndex + 1}
|
|
||||||
`;
|
|
||||||
params.push(similarityThreshold, limit);
|
|
||||||
|
|
||||||
try {
|
|
||||||
const result = await pool.query(query, params);
|
|
||||||
|
|
||||||
return result.rows.map((row: any) => ({
|
|
||||||
documentId: row.document_id,
|
|
||||||
similarityScore: parseFloat(row.similarity_score),
|
|
||||||
chunkContent: row.chunk_content,
|
|
||||||
metadata: row.metadata
|
|
||||||
}));
|
|
||||||
} catch (error) {
|
|
||||||
logger.error('Vector search failed', error);
|
|
||||||
throw error;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Get document chunks by document ID
|
|
||||||
*/
|
|
||||||
static async getDocumentChunks(documentId: string): Promise<DocumentChunk[]> {
|
|
||||||
try {
|
|
||||||
const result = await pool.query(`
|
|
||||||
SELECT
|
|
||||||
id,
|
|
||||||
document_id,
|
|
||||||
content,
|
|
||||||
metadata,
|
|
||||||
embedding,
|
|
||||||
chunk_index,
|
|
||||||
section,
|
|
||||||
page_number,
|
|
||||||
created_at,
|
|
||||||
updated_at
|
|
||||||
FROM document_chunks
|
|
||||||
WHERE document_id = $1
|
|
||||||
ORDER BY chunk_index
|
|
||||||
`, [documentId]);
|
|
||||||
|
|
||||||
return result.rows.map((row: any) => ({
|
|
||||||
id: row.id,
|
|
||||||
documentId: row.document_id,
|
|
||||||
content: row.content,
|
|
||||||
metadata: row.metadata || {},
|
|
||||||
embedding: row.embedding || [],
|
|
||||||
chunkIndex: row.chunk_index,
|
|
||||||
section: row.section,
|
|
||||||
pageNumber: row.page_number,
|
|
||||||
createdAt: row.created_at,
|
|
||||||
updatedAt: row.updated_at
|
|
||||||
}));
|
|
||||||
} catch (error) {
|
|
||||||
logger.error('Failed to get document chunks', error);
|
|
||||||
throw error;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Find similar documents
|
|
||||||
*/
|
|
||||||
static async findSimilarDocuments(
|
|
||||||
documentId: string,
|
|
||||||
limit: number = 10,
|
|
||||||
similarityThreshold: number = 0.6
|
|
||||||
): Promise<DocumentSimilarity[]> {
|
|
||||||
try {
|
|
||||||
// Get document chunks
|
|
||||||
const documentChunks = await this.getDocumentChunks(documentId);
|
|
||||||
if (documentChunks.length === 0) return [];
|
|
||||||
|
|
||||||
// Use the first chunk as reference
|
|
||||||
const referenceChunk = documentChunks[0];
|
|
||||||
if (!referenceChunk || !referenceChunk.embedding) return [];
|
|
||||||
|
|
||||||
const result = await pool.query(`
|
|
||||||
SELECT
|
|
||||||
id,
|
|
||||||
source_document_id,
|
|
||||||
target_document_id,
|
|
||||||
similarity_score,
|
|
||||||
similarity_type,
|
|
||||||
metadata,
|
|
||||||
created_at
|
|
||||||
FROM document_similarities
|
|
||||||
WHERE source_document_id = $1
|
|
||||||
AND similarity_score >= $2
|
|
||||||
ORDER BY similarity_score DESC
|
|
||||||
LIMIT $3
|
|
||||||
`, [documentId, similarityThreshold, limit]);
|
|
||||||
|
|
||||||
return result.rows.map((row: any) => ({
|
|
||||||
id: row.id,
|
|
||||||
sourceDocumentId: row.source_document_id,
|
|
||||||
targetDocumentId: row.target_document_id,
|
|
||||||
similarityScore: parseFloat(row.similarity_score),
|
|
||||||
similarityType: row.similarity_type,
|
|
||||||
metadata: row.metadata || {},
|
|
||||||
createdAt: row.created_at
|
|
||||||
}));
|
|
||||||
} catch (error) {
|
|
||||||
logger.error('Failed to find similar documents', error);
|
|
||||||
throw error;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Update document similarities
|
|
||||||
*/
|
|
||||||
static async updateDocumentSimilarities(): Promise<void> {
|
|
||||||
try {
|
|
||||||
await pool.query(`
|
|
||||||
SELECT update_document_similarities();
|
|
||||||
`);
|
|
||||||
logger.info('Document similarities updated');
|
|
||||||
} catch (error) {
|
|
||||||
logger.error('Failed to update document similarities', error);
|
|
||||||
throw error;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Store industry embedding
|
|
||||||
*/
|
|
||||||
static async storeIndustryEmbedding(industry: Omit<IndustryEmbedding, 'id' | 'createdAt' | 'updatedAt'>): Promise<void> {
|
|
||||||
try {
|
|
||||||
// Ensure embedding is properly formatted
|
|
||||||
const embeddingArray = Array.isArray(industry.embedding) ? industry.embedding : [];
|
|
||||||
|
|
||||||
// Validate embedding dimensions
|
|
||||||
if (embeddingArray.length !== 1536) {
|
|
||||||
logger.warn(`Industry embedding dimension mismatch: expected 1536, got ${embeddingArray.length}`);
|
|
||||||
// Pad or truncate to 1536 dimensions if necessary
|
|
||||||
const paddedEmbedding = new Array(1536).fill(0);
|
|
||||||
embeddingArray.forEach((val, index) => {
|
|
||||||
if (index < 1536) paddedEmbedding[index] = val;
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
await pool.query(`
|
|
||||||
INSERT INTO industry_embeddings (
|
|
||||||
id, industry_name, industry_description, embedding,
|
|
||||||
document_count, average_similarity
|
|
||||||
) VALUES ($1, $2, $3, $4::vector, $5, $6)
|
|
||||||
ON CONFLICT (industry_name) DO UPDATE SET
|
|
||||||
industry_description = EXCLUDED.industry_description,
|
|
||||||
embedding = EXCLUDED.embedding,
|
|
||||||
document_count = EXCLUDED.document_count,
|
|
||||||
average_similarity = EXCLUDED.average_similarity,
|
|
||||||
updated_at = CURRENT_TIMESTAMP
|
|
||||||
`, [
|
|
||||||
uuidv4(),
|
|
||||||
industry.industryName,
|
|
||||||
industry.industryDescription,
|
|
||||||
embeddingArray,
|
|
||||||
industry.documentCount,
|
|
||||||
industry.averageSimilarity
|
|
||||||
]);
|
|
||||||
|
|
||||||
logger.info(`Stored industry embedding for: ${industry.industryName}`);
|
|
||||||
} catch (error) {
|
|
||||||
logger.error('Failed to store industry embedding', error);
|
|
||||||
throw error;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Search by industry
|
|
||||||
*/
|
|
||||||
static async searchByIndustry(
|
|
||||||
industryName: string,
|
|
||||||
queryEmbedding: number[],
|
|
||||||
limit: number = 20
|
|
||||||
): Promise<VectorSearchResult[]> {
|
|
||||||
try {
|
|
||||||
// Ensure embedding is properly formatted
|
|
||||||
const embeddingArray = Array.isArray(queryEmbedding) ? queryEmbedding : [];
|
|
||||||
|
|
||||||
// Validate embedding dimensions
|
|
||||||
if (embeddingArray.length !== 1536) {
|
|
||||||
logger.warn(`Industry search embedding dimension mismatch: expected 1536, got ${embeddingArray.length}`);
|
|
||||||
// Pad or truncate to 1536 dimensions if necessary
|
|
||||||
const paddedEmbedding = new Array(1536).fill(0);
|
|
||||||
embeddingArray.forEach((val, index) => {
|
|
||||||
if (index < 1536) paddedEmbedding[index] = val;
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
const result = await pool.query(`
|
|
||||||
SELECT
|
|
||||||
dc.document_id,
|
|
||||||
1 - (dc.embedding <=> $1::vector) as similarity_score,
|
|
||||||
dc.content as chunk_content,
|
|
||||||
dc.metadata
|
|
||||||
FROM document_chunks dc
|
|
||||||
WHERE dc.embedding IS NOT NULL
|
|
||||||
AND dc.metadata->>'industry' = $2
|
|
||||||
ORDER BY dc.embedding <=> $1::vector
|
|
||||||
LIMIT $3
|
|
||||||
`, [embeddingArray, industryName.toLowerCase(), limit]);
|
|
||||||
|
|
||||||
return result.rows.map((row: any) => ({
|
|
||||||
documentId: row.document_id,
|
|
||||||
similarityScore: parseFloat(row.similarity_score),
|
|
||||||
chunkContent: row.chunk_content,
|
|
||||||
metadata: row.metadata || {}
|
|
||||||
}));
|
|
||||||
} catch (error) {
|
|
||||||
logger.error('Failed to search by industry', error);
|
|
||||||
throw error;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Track search query for analytics
|
|
||||||
*/
|
|
||||||
static async trackSearchQuery(
|
|
||||||
userId: string,
|
|
||||||
queryText: string,
|
|
||||||
queryEmbedding: number[],
|
|
||||||
searchResults: VectorSearchResult[],
|
|
||||||
options: {
|
|
||||||
filters?: Record<string, any>;
|
|
||||||
limitCount?: number;
|
|
||||||
similarityThreshold?: number;
|
|
||||||
processingTimeMs?: number;
|
|
||||||
} = {}
|
|
||||||
): Promise<void> {
|
|
||||||
try {
|
|
||||||
// Ensure embedding is properly formatted
|
|
||||||
const embeddingArray = Array.isArray(queryEmbedding) ? queryEmbedding : [];
|
|
||||||
|
|
||||||
// Validate embedding dimensions
|
|
||||||
if (embeddingArray.length !== 1536) {
|
|
||||||
logger.warn(`Search tracking embedding dimension mismatch: expected 1536, got ${embeddingArray.length}`);
|
|
||||||
// Pad or truncate to 1536 dimensions if necessary
|
|
||||||
const paddedEmbedding = new Array(1536).fill(0);
|
|
||||||
embeddingArray.forEach((val, index) => {
|
|
||||||
if (index < 1536) paddedEmbedding[index] = val;
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
await pool.query(`
|
|
||||||
INSERT INTO vector_similarity_searches (
|
|
||||||
id, user_id, query_text, query_embedding, search_results,
|
|
||||||
filters, limit_count, similarity_threshold, processing_time_ms
|
|
||||||
) VALUES ($1, $2, $3, $4::vector, $5, $6, $7, $8, $9)
|
|
||||||
`, [
|
|
||||||
uuidv4(),
|
|
||||||
userId,
|
|
||||||
queryText,
|
|
||||||
embeddingArray,
|
|
||||||
JSON.stringify(searchResults),
|
|
||||||
JSON.stringify(options.filters || {}),
|
|
||||||
options.limitCount || 10,
|
|
||||||
options.similarityThreshold || 0.7,
|
|
||||||
options.processingTimeMs || 0
|
|
||||||
]);
|
|
||||||
|
|
||||||
logger.debug('Search query tracked for analytics');
|
|
||||||
} catch (error) {
|
|
||||||
logger.error('Failed to track search query', error);
|
|
||||||
// Don't throw - analytics failure shouldn't break search
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Get search analytics
|
|
||||||
*/
|
|
||||||
static async getSearchAnalytics(userId: string, days: number = 30): Promise<any[]> {
|
|
||||||
try {
|
|
||||||
const result = await pool.query(`
|
|
||||||
SELECT
|
|
||||||
query_text,
|
|
||||||
COUNT(*) as search_count,
|
|
||||||
AVG(processing_time_ms) as avg_processing_time,
|
|
||||||
AVG(similarity_threshold) as avg_similarity_threshold,
|
|
||||||
MAX(created_at) as last_search
|
|
||||||
FROM vector_similarity_searches
|
|
||||||
WHERE user_id = $1
|
|
||||||
AND created_at >= NOW() - INTERVAL '${days} days'
|
|
||||||
GROUP BY query_text
|
|
||||||
ORDER BY search_count DESC
|
|
||||||
LIMIT 20
|
|
||||||
`, [userId]);
|
|
||||||
|
|
||||||
return result.rows;
|
|
||||||
} catch (error) {
|
|
||||||
logger.error('Failed to get search analytics', error);
|
|
||||||
throw error;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Delete document chunks
|
|
||||||
*/
|
|
||||||
static async deleteDocumentChunks(documentId: string): Promise<void> {
|
|
||||||
try {
|
|
||||||
await pool.query(`
|
|
||||||
DELETE FROM document_chunks
|
|
||||||
WHERE document_id = $1
|
|
||||||
`, [documentId]);
|
|
||||||
|
|
||||||
logger.info(`Deleted chunks for document: ${documentId}`);
|
|
||||||
} catch (error) {
|
|
||||||
logger.error('Failed to delete document chunks', error);
|
|
||||||
throw error;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Get vector database statistics
|
|
||||||
*/
|
|
||||||
static async getVectorDatabaseStats(): Promise<{
|
|
||||||
totalChunks: number;
|
|
||||||
totalDocuments: number;
|
|
||||||
totalSearches: number;
|
|
||||||
averageSimilarity: number;
|
|
||||||
}> {
|
|
||||||
try {
|
|
||||||
const [chunksResult, documentsResult, searchesResult, similarityResult] = await Promise.all([
|
|
||||||
pool.query('SELECT COUNT(*) as count FROM document_chunks'),
|
|
||||||
pool.query('SELECT COUNT(DISTINCT document_id) as count FROM document_chunks'),
|
|
||||||
pool.query('SELECT COUNT(*) as count FROM vector_similarity_searches'),
|
|
||||||
pool.query(`
|
|
||||||
SELECT AVG(similarity_score) as avg_similarity
|
|
||||||
FROM document_similarities
|
|
||||||
WHERE similarity_score > 0
|
|
||||||
`)
|
|
||||||
]);
|
|
||||||
|
|
||||||
return {
|
|
||||||
totalChunks: parseInt(chunksResult.rows[0]?.count || '0'),
|
|
||||||
totalDocuments: parseInt(documentsResult.rows[0]?.count || '0'),
|
|
||||||
totalSearches: parseInt(searchesResult.rows[0]?.count || '0'),
|
|
||||||
averageSimilarity: parseFloat(similarityResult.rows[0]?.avg_similarity || '0')
|
|
||||||
};
|
|
||||||
} catch (error) {
|
|
||||||
logger.error('Failed to get vector database stats', error);
|
|
||||||
throw error;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Get all chunks (for testing/debugging)
|
|
||||||
*/
|
|
||||||
static async getAllChunks(): Promise<DocumentChunk[]> {
|
|
||||||
try {
|
|
||||||
const result = await pool.query(`
|
|
||||||
SELECT
|
|
||||||
id,
|
|
||||||
document_id,
|
|
||||||
content,
|
|
||||||
metadata,
|
|
||||||
embedding,
|
|
||||||
chunk_index,
|
|
||||||
section,
|
|
||||||
page_number,
|
|
||||||
created_at,
|
|
||||||
updated_at
|
|
||||||
FROM document_chunks
|
|
||||||
ORDER BY document_id, chunk_index
|
|
||||||
LIMIT 1000
|
|
||||||
`);
|
|
||||||
|
|
||||||
return result.rows.map((row: any) => ({
|
|
||||||
id: row.id,
|
|
||||||
documentId: row.document_id,
|
|
||||||
content: row.content,
|
|
||||||
metadata: row.metadata || {},
|
|
||||||
embedding: row.embedding || [],
|
|
||||||
chunkIndex: row.chunk_index,
|
|
||||||
section: row.section,
|
|
||||||
pageNumber: row.page_number,
|
|
||||||
createdAt: row.created_at,
|
|
||||||
updatedAt: row.updated_at
|
|
||||||
}));
|
|
||||||
} catch (error) {
|
|
||||||
logger.error('Failed to get all chunks', error);
|
|
||||||
throw error;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Get total chunk count
|
|
||||||
*/
|
|
||||||
static async getTotalChunkCount(): Promise<number> {
|
|
||||||
try {
|
|
||||||
const result = await pool.query('SELECT COUNT(*) as count FROM document_chunks');
|
|
||||||
return parseInt(result.rows[0]?.count || '0');
|
|
||||||
} catch (error) {
|
|
||||||
logger.error('Failed to get total chunk count', error);
|
|
||||||
throw error;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Get total document count
|
|
||||||
*/
|
|
||||||
static async getTotalDocumentCount(): Promise<number> {
|
|
||||||
try {
|
|
||||||
const result = await pool.query('SELECT COUNT(DISTINCT document_id) as count FROM document_chunks');
|
|
||||||
return parseInt(result.rows[0]?.count || '0');
|
|
||||||
} catch (error) {
|
|
||||||
logger.error('Failed to get total document count', error);
|
|
||||||
throw error;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Get average chunk size
|
|
||||||
*/
|
|
||||||
static async getAverageChunkSize(): Promise<number> {
|
|
||||||
try {
|
|
||||||
const result = await pool.query('SELECT AVG(LENGTH(content)) as avg_size FROM document_chunks');
|
|
||||||
return Math.round(parseFloat(result.rows[0]?.avg_size || '0'));
|
|
||||||
} catch (error) {
|
|
||||||
logger.error('Failed to get average chunk size', error);
|
|
||||||
throw error;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -63,6 +63,7 @@ export interface ProcessingJob {
|
|||||||
}
|
}
|
||||||
|
|
||||||
export type ProcessingStatus =
|
export type ProcessingStatus =
|
||||||
|
| 'uploading'
|
||||||
| 'uploaded'
|
| 'uploaded'
|
||||||
| 'extracting_text'
|
| 'extracting_text'
|
||||||
| 'processing_llm'
|
| 'processing_llm'
|
||||||
|
|||||||
@@ -23,9 +23,13 @@ const router = express.Router();
|
|||||||
router.use(verifyFirebaseToken);
|
router.use(verifyFirebaseToken);
|
||||||
router.use(addCorrelationId);
|
router.use(addCorrelationId);
|
||||||
|
|
||||||
// Essential document management routes (keeping these)
|
// NEW Firebase Storage direct upload routes
|
||||||
|
router.post('/upload-url', documentController.getUploadUrl);
|
||||||
|
router.post('/:id/confirm-upload', validateUUID('id'), documentController.confirmUpload);
|
||||||
|
|
||||||
|
// LEGACY multipart upload routes (keeping for backward compatibility)
|
||||||
router.post('/upload', handleFileUpload, documentController.uploadDocument);
|
router.post('/upload', handleFileUpload, documentController.uploadDocument);
|
||||||
router.post('/', handleFileUpload, documentController.uploadDocument); // Add direct POST to /documents for frontend compatibility
|
router.post('/', handleFileUpload, documentController.uploadDocument);
|
||||||
router.get('/', documentController.getDocuments);
|
router.get('/', documentController.getDocuments);
|
||||||
|
|
||||||
// Analytics endpoints (MUST come before /:id routes to avoid conflicts)
|
// Analytics endpoints (MUST come before /:id routes to avoid conflicts)
|
||||||
|
|||||||
@@ -483,6 +483,37 @@ class FileStorageService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Generate signed upload URL for direct client uploads
|
||||||
|
*/
|
||||||
|
async generateSignedUploadUrl(filePath: string, contentType: string, expirationMinutes: number = 60): Promise<string> {
|
||||||
|
try {
|
||||||
|
const bucket = this.storage.bucket(this.bucketName);
|
||||||
|
const file = bucket.file(filePath);
|
||||||
|
|
||||||
|
// Generate signed upload URL with retry logic
|
||||||
|
const [signedUrl] = await this.retryOperation(
|
||||||
|
async () => file.getSignedUrl({
|
||||||
|
version: 'v4',
|
||||||
|
action: 'write',
|
||||||
|
expires: Date.now() + (expirationMinutes * 60 * 1000),
|
||||||
|
contentType: contentType,
|
||||||
|
}),
|
||||||
|
'generate signed upload URL from GCS'
|
||||||
|
);
|
||||||
|
|
||||||
|
logger.info(`Generated signed upload URL for file: ${filePath}`, {
|
||||||
|
contentType,
|
||||||
|
expirationMinutes,
|
||||||
|
});
|
||||||
|
|
||||||
|
return signedUrl;
|
||||||
|
} catch (error) {
|
||||||
|
logger.error(`Error generating signed upload URL for file: ${filePath}`, error);
|
||||||
|
throw new Error(`Failed to generate upload URL: ${error instanceof Error ? error.message : 'Unknown error'}`);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Copy file within Google Cloud Storage
|
* Copy file within Google Cloud Storage
|
||||||
*/
|
*/
|
||||||
|
|||||||
23
cors.json
Normal file
23
cors.json
Normal file
@@ -0,0 +1,23 @@
|
|||||||
|
[
|
||||||
|
{
|
||||||
|
"origin": [
|
||||||
|
"https://cim-summarizer.web.app",
|
||||||
|
"https://cim-summarizer.firebaseapp.com",
|
||||||
|
"http://localhost:3000",
|
||||||
|
"http://localhost:5173"
|
||||||
|
],
|
||||||
|
"method": [
|
||||||
|
"GET",
|
||||||
|
"POST",
|
||||||
|
"PUT",
|
||||||
|
"DELETE",
|
||||||
|
"OPTIONS"
|
||||||
|
],
|
||||||
|
"responseHeader": [
|
||||||
|
"Content-Type",
|
||||||
|
"Authorization",
|
||||||
|
"X-Requested-With"
|
||||||
|
],
|
||||||
|
"maxAgeSeconds": 3600
|
||||||
|
}
|
||||||
|
]
|
||||||
6
firebase.json
Normal file
6
firebase.json
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
{
|
||||||
|
"storage": {
|
||||||
|
"rules": "storage.rules",
|
||||||
|
"cors": "storage.cors.json"
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -63,6 +63,10 @@
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
"rewrites": [
|
"rewrites": [
|
||||||
|
{
|
||||||
|
"source": "/api/**",
|
||||||
|
"function": "api"
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"source": "**",
|
"source": "**",
|
||||||
"destination": "/index.html"
|
"destination": "/index.html"
|
||||||
|
|||||||
@@ -387,19 +387,6 @@ const Dashboard: React.FC = () => {
|
|||||||
<span className="text-sm text-white">
|
<span className="text-sm text-white">
|
||||||
Welcome, {user?.name || user?.email}
|
Welcome, {user?.name || user?.email}
|
||||||
</span>
|
</span>
|
||||||
{/* Debug buttons - show in production for troubleshooting */}
|
|
||||||
<button
|
|
||||||
onClick={handleDebugAuth}
|
|
||||||
className="bg-yellow-500 hover:bg-yellow-600 text-white px-3 py-1 rounded text-sm"
|
|
||||||
>
|
|
||||||
Debug Auth
|
|
||||||
</button>
|
|
||||||
<button
|
|
||||||
onClick={handleTestAPIAuth}
|
|
||||||
className="bg-blue-500 hover:bg-blue-600 text-white px-3 py-1 rounded text-sm"
|
|
||||||
>
|
|
||||||
Test API
|
|
||||||
</button>
|
|
||||||
<LogoutButton variant="button" className="bg-error-500 hover:bg-error-600 text-white" />
|
<LogoutButton variant="button" className="bg-error-500 hover:bg-error-600 text-white" />
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|||||||
@@ -14,10 +14,10 @@ interface UploadedFile {
|
|||||||
progress: number;
|
progress: number;
|
||||||
error?: string;
|
error?: string;
|
||||||
documentId?: string; // Real document ID from backend
|
documentId?: string; // Real document ID from backend
|
||||||
// GCS-specific fields
|
// Firebase Storage specific fields
|
||||||
gcsError?: boolean;
|
storageError?: boolean;
|
||||||
storageType?: 'gcs' | 'local';
|
storageType?: 'firebase' | 'local';
|
||||||
gcsUrl?: string;
|
storageUrl?: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
interface DocumentUploadProps {
|
interface DocumentUploadProps {
|
||||||
@@ -92,17 +92,15 @@ const DocumentUpload: React.FC<DocumentUploadProps> = ({
|
|||||||
|
|
||||||
try {
|
try {
|
||||||
// Upload the document with optimized agentic RAG processing (no strategy selection needed)
|
// Upload the document with optimized agentic RAG processing (no strategy selection needed)
|
||||||
const document = await documentService.uploadDocument(
|
const result = await documentService.uploadDocument(
|
||||||
file,
|
file,
|
||||||
(progress) => {
|
(progress) => {
|
||||||
setUploadedFiles(prev =>
|
setUploadedFiles(prev =>
|
||||||
prev.map(f =>
|
prev.map(f =>
|
||||||
f.id === uploadedFile.id
|
f.id === uploadedFile.id ? { ...f, progress } : f
|
||||||
? { ...f, progress }
|
|
||||||
: f
|
|
||||||
)
|
)
|
||||||
);
|
);
|
||||||
},
|
},
|
||||||
abortController.signal
|
abortController.signal
|
||||||
);
|
);
|
||||||
|
|
||||||
@@ -141,13 +139,13 @@ const DocumentUpload: React.FC<DocumentUploadProps> = ({
|
|||||||
} else {
|
} else {
|
||||||
console.error('Upload failed:', error);
|
console.error('Upload failed:', error);
|
||||||
|
|
||||||
// Handle GCS-specific errors
|
// Handle storage-specific errors
|
||||||
let errorMessage = 'Upload failed';
|
let errorMessage = 'Upload failed';
|
||||||
let isGCSError = false;
|
let isStorageError = false;
|
||||||
|
|
||||||
if (GCSErrorHandler.isGCSError(error)) {
|
if (GCSErrorHandler.isGCSError(error)) {
|
||||||
errorMessage = GCSErrorHandler.getErrorMessage(error as GCSError);
|
errorMessage = GCSErrorHandler.getErrorMessage(error as GCSError);
|
||||||
isGCSError = true;
|
isStorageError = true;
|
||||||
} else if (error instanceof Error) {
|
} else if (error instanceof Error) {
|
||||||
errorMessage = error.message;
|
errorMessage = error.message;
|
||||||
}
|
}
|
||||||
@@ -159,8 +157,8 @@ const DocumentUpload: React.FC<DocumentUploadProps> = ({
|
|||||||
...f,
|
...f,
|
||||||
status: 'error',
|
status: 'error',
|
||||||
error: errorMessage,
|
error: errorMessage,
|
||||||
// Add GCS error indicator
|
// Add storage error indicator
|
||||||
...(isGCSError && { gcsError: true })
|
...(isStorageError && { storageError: true })
|
||||||
}
|
}
|
||||||
: f
|
: f
|
||||||
)
|
)
|
||||||
@@ -297,19 +295,19 @@ const DocumentUpload: React.FC<DocumentUploadProps> = ({
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
const getStatusText = (status: UploadedFile['status'], error?: string, gcsError?: boolean) => {
|
const getStatusText = (status: UploadedFile['status'], error?: string, storageError?: boolean) => {
|
||||||
switch (status) {
|
switch (status) {
|
||||||
case 'uploading':
|
case 'uploading':
|
||||||
return 'Uploading to Google Cloud Storage...';
|
return 'Uploading to Firebase Storage...';
|
||||||
case 'uploaded':
|
case 'uploaded':
|
||||||
return 'Uploaded to GCS ✓';
|
return 'Uploaded to Firebase Storage ✓';
|
||||||
case 'processing':
|
case 'processing':
|
||||||
return 'Processing with Optimized Agentic RAG...';
|
return 'Processing with Document AI + Optimized Agentic RAG...';
|
||||||
case 'completed':
|
case 'completed':
|
||||||
return 'Completed ✓';
|
return 'Completed ✓ (PDF automatically deleted)';
|
||||||
case 'error':
|
case 'error':
|
||||||
if (error === 'Upload cancelled') return 'Cancelled';
|
if (error === 'Upload cancelled') return 'Cancelled';
|
||||||
if (gcsError) return 'GCS Error';
|
if (storageError) return 'Firebase Storage Error';
|
||||||
return 'Error';
|
return 'Error';
|
||||||
default:
|
default:
|
||||||
return '';
|
return '';
|
||||||
@@ -323,10 +321,10 @@ const DocumentUpload: React.FC<DocumentUploadProps> = ({
|
|||||||
<div className="flex items-center">
|
<div className="flex items-center">
|
||||||
<CheckCircle className="h-5 w-5 text-blue-600 mr-2" />
|
<CheckCircle className="h-5 w-5 text-blue-600 mr-2" />
|
||||||
<div>
|
<div>
|
||||||
<h3 className="text-sm font-medium text-blue-800">Optimized Agentic RAG Processing</h3>
|
<h3 className="text-sm font-medium text-blue-800">Document AI + Optimized Agentic RAG Processing</h3>
|
||||||
<p className="text-sm text-blue-700 mt-1">
|
<p className="text-sm text-blue-700 mt-1">
|
||||||
All documents are automatically processed using our advanced optimized agentic RAG system,
|
All documents are automatically processed using Google Document AI for extraction and our advanced optimized agentic RAG system for analysis,
|
||||||
which includes intelligent chunking, vectorization, and multi-agent analysis for the best results.
|
including intelligent chunking, vectorization, and multi-agent CIM review. PDFs are automatically deleted after processing.
|
||||||
</p>
|
</p>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
@@ -351,7 +349,7 @@ const DocumentUpload: React.FC<DocumentUploadProps> = ({
|
|||||||
Drag and drop PDF files here, or click to browse
|
Drag and drop PDF files here, or click to browse
|
||||||
</p>
|
</p>
|
||||||
<p className="text-xs text-gray-500">
|
<p className="text-xs text-gray-500">
|
||||||
Maximum file size: 50MB • Supported format: PDF • Stored securely in Google Cloud Storage • Automatic Optimized Agentic RAG Processing
|
Maximum file size: 50MB • Supported format: PDF • Stored securely in Firebase Storage • Automatic Document AI + Optimized Agentic RAG Processing • PDFs deleted after processing
|
||||||
</p>
|
</p>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
@@ -379,8 +377,8 @@ const DocumentUpload: React.FC<DocumentUploadProps> = ({
|
|||||||
<div>
|
<div>
|
||||||
<h4 className="text-sm font-medium text-success-800">Upload Complete</h4>
|
<h4 className="text-sm font-medium text-success-800">Upload Complete</h4>
|
||||||
<p className="text-sm text-success-700 mt-1">
|
<p className="text-sm text-success-700 mt-1">
|
||||||
Files have been uploaded successfully to Google Cloud Storage! You can now navigate away from this page.
|
Files have been uploaded successfully to Firebase Storage! You can now navigate away from this page.
|
||||||
Processing will continue in the background using Optimized Agentic RAG and you can check the status in the Documents tab.
|
Processing will continue in the background using Document AI + Optimized Agentic RAG. PDFs will be automatically deleted after processing to save costs.
|
||||||
</p>
|
</p>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
@@ -426,10 +424,10 @@ const DocumentUpload: React.FC<DocumentUploadProps> = ({
|
|||||||
<div className="flex items-center space-x-1">
|
<div className="flex items-center space-x-1">
|
||||||
{getStatusIcon(file.status)}
|
{getStatusIcon(file.status)}
|
||||||
<span className="text-xs text-gray-600">
|
<span className="text-xs text-gray-600">
|
||||||
{getStatusText(file.status, file.error, file.gcsError)}
|
{getStatusText(file.status, file.error, file.storageError)}
|
||||||
</span>
|
</span>
|
||||||
{/* GCS indicator */}
|
{/* Firebase Storage indicator */}
|
||||||
{file.storageType === 'gcs' && (
|
{file.storageType === 'firebase' && (
|
||||||
<Cloud className="h-3 w-3 text-blue-500" />
|
<Cloud className="h-3 w-3 text-blue-500" />
|
||||||
)}
|
)}
|
||||||
</div>
|
</div>
|
||||||
@@ -452,4 +450,4 @@ const DocumentUpload: React.FC<DocumentUploadProps> = ({
|
|||||||
);
|
);
|
||||||
};
|
};
|
||||||
|
|
||||||
export default DocumentUpload;
|
export default DocumentUpload;
|
||||||
@@ -60,7 +60,7 @@ export interface Document {
|
|||||||
file_path: string;
|
file_path: string;
|
||||||
file_size: number;
|
file_size: number;
|
||||||
uploaded_at: string;
|
uploaded_at: string;
|
||||||
status: 'uploaded' | 'extracting_text' | 'processing_llm' | 'generating_pdf' | 'completed' | 'failed';
|
status: 'uploading' | 'uploaded' | 'extracting_text' | 'processing_llm' | 'generating_pdf' | 'completed' | 'failed';
|
||||||
extracted_text?: string;
|
extracted_text?: string;
|
||||||
generated_summary?: string;
|
generated_summary?: string;
|
||||||
summary_markdown_path?: string;
|
summary_markdown_path?: string;
|
||||||
@@ -219,7 +219,7 @@ export class GCSErrorHandler {
|
|||||||
|
|
||||||
class DocumentService {
|
class DocumentService {
|
||||||
/**
|
/**
|
||||||
* Upload a document for processing
|
* Upload a document using Firebase Storage direct upload (new method)
|
||||||
*/
|
*/
|
||||||
async uploadDocument(
|
async uploadDocument(
|
||||||
file: File,
|
file: File,
|
||||||
@@ -233,7 +233,137 @@ class DocumentService {
|
|||||||
throw new Error('Authentication required. Please log in to upload documents.');
|
throw new Error('Authentication required. Please log in to upload documents.');
|
||||||
}
|
}
|
||||||
|
|
||||||
console.log('📤 Starting document upload...');
|
console.log('📤 Starting Firebase Storage direct upload...');
|
||||||
|
console.log('📤 File:', file.name, 'Size:', file.size, 'Type:', file.type);
|
||||||
|
console.log('📤 Token available:', !!token);
|
||||||
|
|
||||||
|
// Step 1: Get signed upload URL
|
||||||
|
onProgress?.(5); // 5% - Getting upload URL
|
||||||
|
|
||||||
|
console.log('🌐 Making request to upload-url endpoint');
|
||||||
|
console.log('🌐 Base URL:', API_BASE_URL);
|
||||||
|
console.log('🌐 Full URL would be:', `${API_BASE_URL}/documents/upload-url`);
|
||||||
|
console.log('🌐 Request payload:', { fileName: file.name, fileSize: file.size, contentType: file.type });
|
||||||
|
|
||||||
|
const uploadUrlResponse = await apiClient.post('/documents/upload-url', {
|
||||||
|
fileName: file.name,
|
||||||
|
fileSize: file.size,
|
||||||
|
contentType: file.type
|
||||||
|
}, { signal });
|
||||||
|
|
||||||
|
const { documentId, uploadUrl } = uploadUrlResponse.data;
|
||||||
|
console.log('✅ Got signed upload URL for document:', documentId);
|
||||||
|
|
||||||
|
// Step 2: Upload directly to Firebase Storage
|
||||||
|
onProgress?.(10); // 10% - Starting direct upload
|
||||||
|
|
||||||
|
await this.uploadToFirebaseStorage(file, uploadUrl, onProgress, signal);
|
||||||
|
console.log('✅ File uploaded to Firebase Storage');
|
||||||
|
|
||||||
|
// Step 3: Confirm upload and trigger processing
|
||||||
|
onProgress?.(95); // 95% - Confirming upload
|
||||||
|
|
||||||
|
const confirmResponse = await apiClient.post(`/documents/${documentId}/confirm-upload`, {}, { signal });
|
||||||
|
|
||||||
|
onProgress?.(100); // 100% - Complete
|
||||||
|
console.log('✅ Upload confirmed and processing started');
|
||||||
|
|
||||||
|
return {
|
||||||
|
id: documentId,
|
||||||
|
...confirmResponse.data
|
||||||
|
};
|
||||||
|
|
||||||
|
} catch (error: any) {
|
||||||
|
console.error('❌ Firebase Storage upload failed:', error);
|
||||||
|
|
||||||
|
// Handle specific error cases
|
||||||
|
if (error.name === 'AbortError') {
|
||||||
|
throw new Error('Upload was cancelled.');
|
||||||
|
}
|
||||||
|
|
||||||
|
if (error.response?.status === 401) {
|
||||||
|
throw new Error('Authentication required. Please log in again.');
|
||||||
|
}
|
||||||
|
|
||||||
|
if (error.response?.status === 400) {
|
||||||
|
throw new Error(error.response?.data?.error || 'Invalid request');
|
||||||
|
}
|
||||||
|
|
||||||
|
if (error.response?.status >= 500) {
|
||||||
|
throw new Error('Server error. Please try again later.');
|
||||||
|
}
|
||||||
|
|
||||||
|
// Generic error fallback
|
||||||
|
throw new Error(error.response?.data?.error || error.message || 'Upload failed');
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Upload a file directly to Firebase Storage with an HTTP PUT to a signed URL.
 *
 * @param file - File to send; its `type` is sent as the `Content-Type` header.
 * @param uploadUrl - Signed URL that accepts the PUT request.
 * @param onProgress - Optional callback receiving overall progress. The
 *   transfer itself is mapped onto the 10-90 range so the caller can use the
 *   remaining range for the steps before and after the transfer.
 * @param signal - Optional abort signal; aborting cancels the request.
 * @returns A promise that resolves once the server answers with a 2xx status.
 * @throws Error when the upload is cancelled, the network fails, or the
 *   server answers with a non-2xx status.
 */
private async uploadToFirebaseStorage(
  file: File,
  uploadUrl: string,
  onProgress?: (progress: number) => void,
  signal?: AbortSignal
): Promise<void> {
  return new Promise<void>((resolve, reject) => {
    // A signal that was aborted before this call never fires 'abort' again,
    // so without this guard a cancelled upload would still be sent.
    if (signal?.aborted) {
      reject(new Error('Upload was cancelled'));
      return;
    }

    const xhr = new XMLHttpRequest();

    const handleAbort = (): void => {
      xhr.abort();
      reject(new Error('Upload was cancelled'));
    };

    // Detach from the caller's signal once the request has finished, so the
    // listener (and the XHR it closes over) does not outlive the upload.
    const detachAbortListener = (): void => {
      signal?.removeEventListener('abort', handleAbort);
    };

    // Handle upload progress
    xhr.upload.addEventListener('progress', (event) => {
      if (event.lengthComputable && onProgress) {
        // Map Firebase Storage upload to 10%-90% of overall progress
        const uploadProgress = Math.round((event.loaded / event.total) * 80) + 10;
        onProgress(uploadProgress);
      }
    });

    // Handle completion
    xhr.addEventListener('load', () => {
      detachAbortListener();
      if (xhr.status >= 200 && xhr.status < 300) {
        resolve();
      } else {
        reject(new Error(`Firebase Storage upload failed: ${xhr.status} ${xhr.statusText}`));
      }
    });

    // Handle errors
    xhr.addEventListener('error', () => {
      detachAbortListener();
      reject(new Error('Firebase Storage upload failed: Network error'));
    });

    // Handle abort; `once` drops the listener automatically after it fires.
    if (signal) {
      signal.addEventListener('abort', handleAbort, { once: true });
    }

    // Start upload
    xhr.open('PUT', uploadUrl);
    xhr.setRequestHeader('Content-Type', file.type);
    xhr.send(file);
  });
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Legacy multipart upload method (kept for compatibility)
|
||||||
|
*/
|
||||||
|
async uploadDocumentLegacy(
|
||||||
|
file: File,
|
||||||
|
onProgress?: (progress: number) => void,
|
||||||
|
signal?: AbortSignal
|
||||||
|
): Promise<Document> {
|
||||||
|
try {
|
||||||
|
// Check authentication before upload
|
||||||
|
const token = await authService.getToken();
|
||||||
|
if (!token) {
|
||||||
|
throw new Error('Authentication required. Please log in to upload documents.');
|
||||||
|
}
|
||||||
|
|
||||||
|
console.log('📤 Starting legacy multipart upload...');
|
||||||
console.log('📤 File:', file.name, 'Size:', file.size, 'Type:', file.type);
|
console.log('📤 File:', file.name, 'Size:', file.size, 'Type:', file.type);
|
||||||
console.log('📤 Token available:', !!token);
|
console.log('📤 Token available:', !!token);
|
||||||
|
|
||||||
@@ -243,7 +373,7 @@ class DocumentService {
|
|||||||
// Always use optimized agentic RAG processing - no strategy selection needed
|
// Always use optimized agentic RAG processing - no strategy selection needed
|
||||||
formData.append('processingStrategy', 'optimized_agentic_rag');
|
formData.append('processingStrategy', 'optimized_agentic_rag');
|
||||||
|
|
||||||
const response = await apiClient.post('/documents', formData, {
|
const response = await apiClient.post('/documents/upload', formData, {
|
||||||
headers: {
|
headers: {
|
||||||
'Content-Type': 'multipart/form-data',
|
'Content-Type': 'multipart/form-data',
|
||||||
},
|
},
|
||||||
@@ -256,10 +386,10 @@ class DocumentService {
|
|||||||
},
|
},
|
||||||
});
|
});
|
||||||
|
|
||||||
console.log('✅ Document upload successful:', response.data);
|
console.log('✅ Legacy document upload successful:', response.data);
|
||||||
return response.data;
|
return response.data;
|
||||||
} catch (error: any) {
|
} catch (error: any) {
|
||||||
console.error('❌ Document upload failed:', error);
|
console.error('❌ Legacy document upload failed:', error);
|
||||||
|
|
||||||
// Provide more specific error messages
|
// Provide more specific error messages
|
||||||
if (error.response?.status === 401) {
|
if (error.response?.status === 401) {
|
||||||
|
|||||||
23
storage.cors.json
Normal file
23
storage.cors.json
Normal file
@@ -0,0 +1,23 @@
|
|||||||
|
[
|
||||||
|
{
|
||||||
|
"origin": [
|
||||||
|
"https://cim-summarizer.web.app",
|
||||||
|
"https://cim-summarizer.firebaseapp.com",
|
||||||
|
"http://localhost:3000",
|
||||||
|
"http://localhost:5173"
|
||||||
|
],
|
||||||
|
"method": [
|
||||||
|
"GET",
|
||||||
|
"POST",
|
||||||
|
"PUT",
|
||||||
|
"DELETE",
|
||||||
|
"OPTIONS"
|
||||||
|
],
|
||||||
|
"responseHeader": [
|
||||||
|
"Content-Type",
|
||||||
|
"Authorization",
|
||||||
|
"X-Requested-With"
|
||||||
|
],
|
||||||
|
"maxAgeSeconds": 3600
|
||||||
|
}
|
||||||
|
]
|
||||||
8
storage.rules
Normal file
8
storage.rules
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
rules_version = '2';
|
||||||
|
service firebase.storage {
|
||||||
|
match /b/{bucket}/o {
|
||||||
|
match /{allPaths=**} {
|
||||||
|
// Grants read AND write on every object path in the bucket to any
// signed-in user; the rule does not compare request.auth.uid with the path.
// NOTE(review): this lets one authenticated user read or overwrite another
// user's files — confirm this is intended, or scope the rule per user.
allow read, write: if request.auth != null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user