Major release with significant performance improvements and new processing strategy. ## Core Changes - Implemented simple_full_document processing strategy (default) - Full document → LLM approach: 1-2 passes, ~5-6 minutes processing time - Achieved 100% completeness with 2 API calls (down from 5+) - Removed redundant Document AI passes for faster processing ## Financial Data Extraction - Enhanced deterministic financial table parser - Improved FY3/FY2/FY1/LTM identification from varying CIM formats - Automatic merging of parser results with LLM extraction ## Code Quality & Infrastructure - Cleaned up debug logging (removed emoji markers from production code) - Fixed Firebase Secrets configuration (using modern defineSecret approach) - Updated OpenAI API key - Resolved deployment conflicts (secrets vs environment variables) - Added .env files to Firebase ignore list ## Deployment - Firebase Functions v2 deployment successful - All 7 required secrets verified and configured - Function URL: https://api-y56ccs6wva-uc.a.run.app ## Performance Improvements - Processing time: ~5-6 minutes (down from 23+ minutes) - API calls: 1-2 (down from 5+) - Completeness: 100% achievable - LLM Model: claude-3-7-sonnet-latest ## Breaking Changes - Default processing strategy changed to 'simple_full_document' - RAG processor available as alternative strategy 'document_ai_agentic_rag' ## Files Changed - 36 files changed, 5642 insertions(+), 4451 deletions(-) - Removed deprecated documentation files - Cleaned up unused services and models This release represents a major refactoring focused on speed, accuracy, and maintainability.
78 lines
2.9 KiB
PL/PgSQL
78 lines
2.9 KiB
PL/PgSQL
-- Processing Jobs Table
--
-- This table stores document processing jobs that need to be executed
-- Replaces the in-memory job queue with persistent database storage

-- uuid_generate_v4() is provided by the uuid-ossp extension; create it here
-- so the migration is self-contained and does not fail on a fresh database.
CREATE EXTENSION IF NOT EXISTS "uuid-ossp";

CREATE TABLE IF NOT EXISTS processing_jobs (
    -- Primary key (surrogate UUID)
    id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),

    -- Job data
    document_id UUID NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
    user_id TEXT NOT NULL,

    -- Job status and progress
    -- Constraint is named so violations are greppable in error messages.
    status TEXT NOT NULL
        CONSTRAINT processing_jobs_status_check
        CHECK (status IN ('pending', 'processing', 'completed', 'failed', 'retrying')),
    attempts INTEGER NOT NULL DEFAULT 0,
    max_attempts INTEGER NOT NULL DEFAULT 3,

    -- Processing options (stored as JSONB)
    options JSONB,

    -- Timestamps
    created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(),
    started_at TIMESTAMP WITH TIME ZONE,
    completed_at TIMESTAMP WITH TIME ZONE,
    -- NOT NULL for consistency with created_at; the default covers INSERTs
    -- and the BEFORE UPDATE trigger keeps it current afterwards.
    updated_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(),

    -- Error tracking
    error TEXT,
    last_error_at TIMESTAMP WITH TIME ZONE,

    -- Result storage
    result JSONB
);
|
|
|
|
-- Indexes for efficient querying

-- Filter jobs by lifecycle state (workers polling by status).
CREATE INDEX IF NOT EXISTS idx_processing_jobs_status ON processing_jobs(status);
-- Time-ordered scans and cleanup of old jobs.
CREATE INDEX IF NOT EXISTS idx_processing_jobs_created_at ON processing_jobs(created_at);
-- FK lookup: jobs for a given document; also speeds up ON DELETE CASCADE.
CREATE INDEX IF NOT EXISTS idx_processing_jobs_document_id ON processing_jobs(document_id);
-- Per-user job listings.
CREATE INDEX IF NOT EXISTS idx_processing_jobs_user_id ON processing_jobs(user_id);
-- Partial index for the hot path: oldest-first dequeue of pending jobs only.
CREATE INDEX IF NOT EXISTS idx_processing_jobs_pending ON processing_jobs(status, created_at) WHERE status = 'pending';
|
|
|
|
-- Trigger function: stamp updated_at with the current time on every row
-- update, so callers never have to remember to set it themselves.
CREATE OR REPLACE FUNCTION update_processing_jobs_updated_at()
RETURNS TRIGGER AS $$
BEGIN
    -- ":=" is the idiomatic PL/pgSQL assignment operator (equivalent to "=").
    NEW.updated_at := NOW();
    RETURN NEW;
END;
$$ LANGUAGE plpgsql;
|
|
|
|
-- Trigger to call the update function
-- DROP first so re-running this migration is idempotent; BEFORE UPDATE so
-- the new timestamp is written as part of the same row modification.
DROP TRIGGER IF EXISTS set_processing_jobs_updated_at ON processing_jobs;
CREATE TRIGGER set_processing_jobs_updated_at
BEFORE UPDATE ON processing_jobs
FOR EACH ROW
EXECUTE FUNCTION update_processing_jobs_updated_at();
|
|
|
|
-- Security: the statement below enables row-level security and is left
-- commented out on purpose. Uncomment it — and add the corresponding GRANT
-- statements for the appropriate role(s) — once access policies are defined.
-- ALTER TABLE processing_jobs ENABLE ROW LEVEL SECURITY;
|
-- Optional: Create a view for monitoring
-- NOTE(review): avg_duration_seconds mixes finished-job durations with the
-- current *age* of unfinished jobs (via COALESCE(completed_at, NOW())).
-- Presumably intentional for a live monitoring view — confirm before using
-- it as a true completed-job average.
CREATE OR REPLACE VIEW processing_jobs_summary AS
SELECT
    status,
    COUNT(*) as count,
    -- Seconds from creation to completion (or to "now" for unfinished jobs).
    AVG(EXTRACT(EPOCH FROM (COALESCE(completed_at, NOW()) - created_at))) as avg_duration_seconds,
    MAX(created_at) as latest_created_at
FROM processing_jobs
GROUP BY status;
|
|
|
|
-- Comments for documentation
COMMENT ON TABLE processing_jobs IS 'Stores document processing jobs for async background processing';
COMMENT ON COLUMN processing_jobs.document_id IS 'Document being processed; job row is removed when the document is deleted (ON DELETE CASCADE)';
COMMENT ON COLUMN processing_jobs.user_id IS 'Identifier of the user that owns the job';
COMMENT ON COLUMN processing_jobs.status IS 'Current status: pending, processing, completed, failed, retrying';
COMMENT ON COLUMN processing_jobs.attempts IS 'Number of processing attempts made';
COMMENT ON COLUMN processing_jobs.max_attempts IS 'Maximum number of retry attempts allowed';
COMMENT ON COLUMN processing_jobs.options IS 'Processing options and configuration (JSON)';
COMMENT ON COLUMN processing_jobs.error IS 'Last error message if processing failed';
COMMENT ON COLUMN processing_jobs.result IS 'Processing result payload (JSON)';