fix(core): Overhaul and fix the end-to-end document processing pipeline

This commit is contained in:
Jon
2025-08-01 11:13:03 -04:00
parent 6057d1d7fd
commit 95c92946de
17 changed files with 695 additions and 901 deletions

View File

@@ -63,6 +63,10 @@
}
],
"rewrites": [
{
"source": "/api/**",
"function": "api"
},
{
"source": "**",
"destination": "/index.html"

View File

@@ -387,19 +387,6 @@ const Dashboard: React.FC = () => {
<span className="text-sm text-white">
Welcome, {user?.name || user?.email}
</span>
{/* Debug buttons - show in production for troubleshooting */}
<button
onClick={handleDebugAuth}
className="bg-yellow-500 hover:bg-yellow-600 text-white px-3 py-1 rounded text-sm"
>
Debug Auth
</button>
<button
onClick={handleTestAPIAuth}
className="bg-blue-500 hover:bg-blue-600 text-white px-3 py-1 rounded text-sm"
>
Test API
</button>
<LogoutButton variant="button" className="bg-error-500 hover:bg-error-600 text-white" />
</div>
</div>

View File

@@ -14,10 +14,10 @@ interface UploadedFile {
progress: number;
error?: string;
documentId?: string; // Real document ID from backend
// GCS-specific fields
gcsError?: boolean;
storageType?: 'gcs' | 'local';
gcsUrl?: string;
// Firebase Storage specific fields
storageError?: boolean;
storageType?: 'firebase' | 'local';
storageUrl?: string;
}
interface DocumentUploadProps {
@@ -92,17 +92,15 @@ const DocumentUpload: React.FC<DocumentUploadProps> = ({
try {
// Upload the document with optimized agentic RAG processing (no strategy selection needed)
const document = await documentService.uploadDocument(
file,
const result = await documentService.uploadDocument(
file,
(progress) => {
setUploadedFiles(prev =>
prev.map(f =>
f.id === uploadedFile.id
? { ...f, progress }
: f
f.id === uploadedFile.id ? { ...f, progress } : f
)
);
},
},
abortController.signal
);
@@ -141,13 +139,13 @@ const DocumentUpload: React.FC<DocumentUploadProps> = ({
} else {
console.error('Upload failed:', error);
// Handle GCS-specific errors
// Handle storage-specific errors
let errorMessage = 'Upload failed';
let isGCSError = false;
let isStorageError = false;
if (GCSErrorHandler.isGCSError(error)) {
errorMessage = GCSErrorHandler.getErrorMessage(error as GCSError);
isGCSError = true;
isStorageError = true;
} else if (error instanceof Error) {
errorMessage = error.message;
}
@@ -159,8 +157,8 @@ const DocumentUpload: React.FC<DocumentUploadProps> = ({
...f,
status: 'error',
error: errorMessage,
// Add GCS error indicator
...(isGCSError && { gcsError: true })
// Add storage error indicator
...(isStorageError && { storageError: true })
}
: f
)
@@ -297,19 +295,19 @@ const DocumentUpload: React.FC<DocumentUploadProps> = ({
}
};
const getStatusText = (status: UploadedFile['status'], error?: string, gcsError?: boolean) => {
const getStatusText = (status: UploadedFile['status'], error?: string, storageError?: boolean) => {
switch (status) {
case 'uploading':
return 'Uploading to Google Cloud Storage...';
return 'Uploading to Firebase Storage...';
case 'uploaded':
return 'Uploaded to GCS ✓';
return 'Uploaded to Firebase Storage ✓';
case 'processing':
return 'Processing with Optimized Agentic RAG...';
return 'Processing with Document AI + Optimized Agentic RAG...';
case 'completed':
return 'Completed ✓';
return 'Completed ✓ (PDF automatically deleted)';
case 'error':
if (error === 'Upload cancelled') return 'Cancelled';
if (gcsError) return 'GCS Error';
if (storageError) return 'Firebase Storage Error';
return 'Error';
default:
return '';
@@ -323,10 +321,10 @@ const DocumentUpload: React.FC<DocumentUploadProps> = ({
<div className="flex items-center">
<CheckCircle className="h-5 w-5 text-blue-600 mr-2" />
<div>
<h3 className="text-sm font-medium text-blue-800">Optimized Agentic RAG Processing</h3>
<h3 className="text-sm font-medium text-blue-800">Document AI + Optimized Agentic RAG Processing</h3>
<p className="text-sm text-blue-700 mt-1">
All documents are automatically processed using our advanced optimized agentic RAG system,
which includes intelligent chunking, vectorization, and multi-agent analysis for the best results.
All documents are automatically processed using Google Document AI for extraction and our advanced optimized agentic RAG system for analysis,
including intelligent chunking, vectorization, and multi-agent CIM review. PDFs are automatically deleted after processing.
</p>
</div>
</div>
@@ -351,7 +349,7 @@ const DocumentUpload: React.FC<DocumentUploadProps> = ({
Drag and drop PDF files here, or click to browse
</p>
<p className="text-xs text-gray-500">
Maximum file size: 50MB Supported format: PDF Stored securely in Google Cloud Storage Automatic Optimized Agentic RAG Processing
Maximum file size: 50MB Supported format: PDF Stored securely in Firebase Storage Automatic Document AI + Optimized Agentic RAG Processing PDFs deleted after processing
</p>
</div>
@@ -379,8 +377,8 @@ const DocumentUpload: React.FC<DocumentUploadProps> = ({
<div>
<h4 className="text-sm font-medium text-success-800">Upload Complete</h4>
<p className="text-sm text-success-700 mt-1">
Files have been uploaded successfully to Google Cloud Storage! You can now navigate away from this page.
Processing will continue in the background using Optimized Agentic RAG and you can check the status in the Documents tab.
Files have been uploaded successfully to Firebase Storage! You can now navigate away from this page.
Processing will continue in the background using Document AI + Optimized Agentic RAG. PDFs will be automatically deleted after processing to save costs.
</p>
</div>
</div>
@@ -426,10 +424,10 @@ const DocumentUpload: React.FC<DocumentUploadProps> = ({
<div className="flex items-center space-x-1">
{getStatusIcon(file.status)}
<span className="text-xs text-gray-600">
{getStatusText(file.status, file.error, file.gcsError)}
{getStatusText(file.status, file.error, file.storageError)}
</span>
{/* GCS indicator */}
{file.storageType === 'gcs' && (
{/* Firebase Storage indicator */}
{file.storageType === 'firebase' && (
<Cloud className="h-3 w-3 text-blue-500" />
)}
</div>
@@ -452,4 +450,4 @@ const DocumentUpload: React.FC<DocumentUploadProps> = ({
);
};
export default DocumentUpload;
export default DocumentUpload;

View File

@@ -60,7 +60,7 @@ export interface Document {
file_path: string;
file_size: number;
uploaded_at: string;
status: 'uploaded' | 'extracting_text' | 'processing_llm' | 'generating_pdf' | 'completed' | 'failed';
status: 'uploading' | 'uploaded' | 'extracting_text' | 'processing_llm' | 'generating_pdf' | 'completed' | 'failed';
extracted_text?: string;
generated_summary?: string;
summary_markdown_path?: string;
@@ -219,7 +219,7 @@ export class GCSErrorHandler {
class DocumentService {
/**
* Upload a document for processing
* Upload a document using Firebase Storage direct upload (new method)
*/
async uploadDocument(
file: File,
@@ -233,7 +233,137 @@ class DocumentService {
throw new Error('Authentication required. Please log in to upload documents.');
}
console.log('📤 Starting document upload...');
console.log('📤 Starting Firebase Storage direct upload...');
console.log('📤 File:', file.name, 'Size:', file.size, 'Type:', file.type);
console.log('📤 Token available:', !!token);
// Step 1: Get signed upload URL
onProgress?.(5); // 5% - Getting upload URL
console.log('🌐 Making request to upload-url endpoint');
console.log('🌐 Base URL:', API_BASE_URL);
console.log('🌐 Full URL would be:', `${API_BASE_URL}/documents/upload-url`);
console.log('🌐 Request payload:', { fileName: file.name, fileSize: file.size, contentType: file.type });
const uploadUrlResponse = await apiClient.post('/documents/upload-url', {
fileName: file.name,
fileSize: file.size,
contentType: file.type
}, { signal });
const { documentId, uploadUrl } = uploadUrlResponse.data;
console.log('✅ Got signed upload URL for document:', documentId);
// Step 2: Upload directly to Firebase Storage
onProgress?.(10); // 10% - Starting direct upload
await this.uploadToFirebaseStorage(file, uploadUrl, onProgress, signal);
console.log('✅ File uploaded to Firebase Storage');
// Step 3: Confirm upload and trigger processing
onProgress?.(95); // 95% - Confirming upload
const confirmResponse = await apiClient.post(`/documents/${documentId}/confirm-upload`, {}, { signal });
onProgress?.(100); // 100% - Complete
console.log('✅ Upload confirmed and processing started');
return {
id: documentId,
...confirmResponse.data
};
} catch (error: any) {
console.error('❌ Firebase Storage upload failed:', error);
// Handle specific error cases
if (error.name === 'AbortError') {
throw new Error('Upload was cancelled.');
}
if (error.response?.status === 401) {
throw new Error('Authentication required. Please log in again.');
}
if (error.response?.status === 400) {
throw new Error(error.response?.data?.error || 'Invalid request');
}
if (error.response?.status >= 500) {
throw new Error('Server error. Please try again later.');
}
// Generic error fallback
throw new Error(error.response?.data?.error || error.message || 'Upload failed');
}
}
/**
 * Upload a file directly to Firebase Storage by issuing an HTTP PUT to a signed URL.
 *
 * Byte-level upload progress is mapped onto the 10%-90% band of the overall
 * progress value passed to `onProgress`.
 *
 * @param file - File sent as the request body; its MIME type is sent as `Content-Type`.
 * @param uploadUrl - Signed URL that receives the PUT request.
 * @param onProgress - Optional callback receiving the mapped progress value (10-90).
 * @param signal - Optional abort signal; aborting it cancels the in-flight request.
 * @returns A promise that resolves once the server answers with a 2xx status.
 * @throws Error (as a rejection) when the signal is or becomes aborted, on a
 *   network failure, or when the server answers with a non-2xx status.
 */
private async uploadToFirebaseStorage(
  file: File,
  uploadUrl: string,
  onProgress?: (progress: number) => void,
  signal?: AbortSignal
): Promise<void> {
  return new Promise<void>((resolve, reject) => {
    // The 'abort' event never fires on a signal that was aborted before the
    // listener was attached, so that case has to be checked explicitly;
    // otherwise a cancelled upload would still be sent.
    if (signal?.aborted) {
      reject(new Error('Upload was cancelled'));
      return;
    }

    const xhr = new XMLHttpRequest();

    // Cancels the request when the caller's signal is aborted.
    const handleSignalAbort = (): void => {
      xhr.abort();
      reject(new Error('Upload was cancelled'));
    };

    // Handle upload progress
    xhr.upload.addEventListener('progress', (event) => {
      // A zero-byte body would give 0 / 0 = NaN, so skip events without a positive total.
      if (event.lengthComputable && event.total > 0 && onProgress) {
        // Map Firebase Storage upload to 10%-90% of overall progress
        const uploadProgress = Math.round((event.loaded / event.total) * 80) + 10;
        onProgress(uploadProgress);
      }
    });

    // Handle completion
    xhr.addEventListener('load', () => {
      if (xhr.status >= 200 && xhr.status < 300) {
        resolve();
      } else {
        reject(new Error(`Firebase Storage upload failed: ${xhr.status} ${xhr.statusText}`));
      }
    });

    // Handle errors
    xhr.addEventListener('error', () => {
      reject(new Error('Firebase Storage upload failed: Network error'));
    });

    // 'loadend' follows load, error and abort alike, so this is the single
    // place to detach from the caller's signal once the request has settled.
    // Without it the listener (and the XHR it closes over) would stay
    // registered on the signal for as long as the signal lives.
    xhr.addEventListener('loadend', () => {
      signal?.removeEventListener('abort', handleSignalAbort);
    });

    // Handle abort
    if (signal) {
      signal.addEventListener('abort', handleSignalAbort, { once: true });
    }

    // Start upload
    xhr.open('PUT', uploadUrl);
    xhr.setRequestHeader('Content-Type', file.type);
    xhr.send(file);
  });
}
/**
* Legacy multipart upload method (kept for compatibility)
*/
async uploadDocumentLegacy(
file: File,
onProgress?: (progress: number) => void,
signal?: AbortSignal
): Promise<Document> {
try {
// Check authentication before upload
const token = await authService.getToken();
if (!token) {
throw new Error('Authentication required. Please log in to upload documents.');
}
console.log('📤 Starting legacy multipart upload...');
console.log('📤 File:', file.name, 'Size:', file.size, 'Type:', file.type);
console.log('📤 Token available:', !!token);
@@ -243,7 +373,7 @@ class DocumentService {
// Always use optimized agentic RAG processing - no strategy selection needed
formData.append('processingStrategy', 'optimized_agentic_rag');
const response = await apiClient.post('/documents', formData, {
const response = await apiClient.post('/documents/upload', formData, {
headers: {
'Content-Type': 'multipart/form-data',
},
@@ -256,10 +386,10 @@ class DocumentService {
},
});
console.log('✅ Document upload successful:', response.data);
console.log('✅ Legacy document upload successful:', response.data);
return response.data;
} catch (error: any) {
console.error('❌ Document upload failed:', error);
console.error('❌ Legacy document upload failed:', error);
// Provide more specific error messages
if (error.response?.status === 401) {