Files
cim_summary/backend/sql/create_processing_jobs_table.sql
admin 9c916d12f4 feat: Production release v2.0.0 - Simple Document Processor
Major release with significant performance improvements and new processing strategy.

## Core Changes
- Implemented simple_full_document processing strategy (default)
- Full document → LLM approach: 1-2 passes, ~5-6 minutes processing time
- Achieved 100% completeness with 2 API calls (down from 5+)
- Removed redundant Document AI passes for faster processing

## Financial Data Extraction
- Enhanced deterministic financial table parser
- Improved FY3/FY2/FY1/LTM identification from varying CIM formats
- Automatic merging of parser results with LLM extraction

## Code Quality & Infrastructure
- Cleaned up debug logging (removed emoji markers from production code)
- Fixed Firebase Secrets configuration (using modern defineSecret approach)
- Updated OpenAI API key
- Resolved deployment conflicts (secrets vs environment variables)
- Added .env files to Firebase ignore list

## Deployment
- Firebase Functions v2 deployment successful
- All 7 required secrets verified and configured
- Function URL: https://api-y56ccs6wva-uc.a.run.app

## Performance Improvements
- Processing time: ~5-6 minutes (down from 23+ minutes)
- API calls: 1-2 (down from 5+)
- Completeness: 100% achievable
- LLM Model: claude-3-7-sonnet-latest

## Breaking Changes
- Default processing strategy changed to 'simple_full_document'
- RAG processor available as alternative strategy 'document_ai_agentic_rag'

## Files Changed
- 36 files changed, 5642 insertions(+), 4451 deletions(-)
- Removed deprecated documentation files
- Cleaned up unused services and models

This release represents a major refactoring focused on speed, accuracy, and maintainability.
2025-11-09 21:07:22 -05:00

78 lines
2.9 KiB
PL/PgSQL

-- Processing Jobs Table
-- Stores document processing jobs that need to be executed.
-- Replaces the in-memory job queue with persistent database storage.

-- uuid_generate_v4() (used for the id default below) is provided by the
-- uuid-ossp extension. Ensure it exists so this script also runs on a fresh
-- database; this is a no-op when the extension is already installed.
CREATE EXTENSION IF NOT EXISTS "uuid-ossp";

CREATE TABLE IF NOT EXISTS processing_jobs (
    -- Primary key
    id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),

    -- Job data
    -- Deleting a document also deletes its jobs (ON DELETE CASCADE).
    document_id UUID NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
    user_id TEXT NOT NULL,

    -- Job status and progress
    -- New rows default to 'pending' when the writer does not supply a status.
    -- The constraint name matches the one PostgreSQL would auto-generate, so
    -- it is explicit here without renaming anything.
    status TEXT NOT NULL DEFAULT 'pending'
        CONSTRAINT processing_jobs_status_check
        CHECK (status IN ('pending', 'processing', 'completed', 'failed', 'retrying')),
    attempts INTEGER NOT NULL DEFAULT 0,
    max_attempts INTEGER NOT NULL DEFAULT 3,

    -- Processing options (stored as JSONB)
    options JSONB,

    -- Timestamps
    created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(),
    started_at TIMESTAMP WITH TIME ZONE,
    completed_at TIMESTAMP WITH TIME ZONE,
    -- NOT NULL for consistency with created_at: the column always has a
    -- default on insert.
    updated_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(),

    -- Error tracking
    error TEXT,
    last_error_at TIMESTAMP WITH TIME ZONE,

    -- Result storage
    result JSONB
);
-- Secondary indexes on processing_jobs. Each uses IF NOT EXISTS so the
-- script can be re-run without error.

-- Lookups by related document and by owning user
CREATE INDEX IF NOT EXISTS idx_processing_jobs_document_id
    ON processing_jobs (document_id);

CREATE INDEX IF NOT EXISTS idx_processing_jobs_user_id
    ON processing_jobs (user_id);

-- Filtering by status and ordering/filtering by creation time
CREATE INDEX IF NOT EXISTS idx_processing_jobs_status
    ON processing_jobs (status);

CREATE INDEX IF NOT EXISTS idx_processing_jobs_created_at
    ON processing_jobs (created_at);

-- Partial index containing only rows whose status is 'pending'
-- (presumably for picking up the oldest pending jobs; confirm against the
-- queue query in the application code).
CREATE INDEX IF NOT EXISTS idx_processing_jobs_pending
    ON processing_jobs (status, created_at)
    WHERE status = 'pending';
-- Trigger function: stamps the row being written with the current
-- transaction timestamp in updated_at and returns the modified row.
CREATE OR REPLACE FUNCTION update_processing_jobs_updated_at()
RETURNS TRIGGER
LANGUAGE plpgsql
AS $$
BEGIN
    -- CURRENT_TIMESTAMP is the transaction start time, same as NOW().
    NEW.updated_at := CURRENT_TIMESTAMP;
    RETURN NEW;
END;
$$;
-- Row-level BEFORE UPDATE trigger that runs
-- update_processing_jobs_updated_at() for every updated row.
-- Any existing trigger of the same name is dropped first so the script can
-- be re-run.
DROP TRIGGER IF EXISTS set_processing_jobs_updated_at
    ON processing_jobs;

CREATE TRIGGER set_processing_jobs_updated_at
    BEFORE UPDATE
    ON processing_jobs
    FOR EACH ROW
    EXECUTE FUNCTION update_processing_jobs_updated_at();
-- Optional: row-level security (left disabled; uncomment to enable, then add the policies and grants your roles need)
-- ALTER TABLE processing_jobs ENABLE ROW LEVEL SECURITY;
-- Optional monitoring view: one row per job status with
--   count                - number of jobs in that status
--   avg_duration_seconds - mean seconds from created_at to completed_at;
--                          jobs with no completed_at are measured up to the
--                          current time
--   latest_created_at    - creation time of the newest job in that status
CREATE OR REPLACE VIEW processing_jobs_summary AS
SELECT
    status,
    COUNT(*) AS count,
    AVG(
        EXTRACT(EPOCH FROM (COALESCE(completed_at, NOW()) - created_at))
    ) AS avg_duration_seconds,
    MAX(created_at) AS latest_created_at
FROM processing_jobs
GROUP BY status;
-- Catalog comments (visible via \d+ and other schema introspection tools)

-- Table
COMMENT ON TABLE processing_jobs
    IS 'Stores document processing jobs for async background processing';

-- Columns
COMMENT ON COLUMN processing_jobs.status
    IS 'Current status: pending, processing, completed, failed, retrying';

COMMENT ON COLUMN processing_jobs.attempts
    IS 'Number of processing attempts made';

COMMENT ON COLUMN processing_jobs.max_attempts
    IS 'Maximum number of retry attempts allowed';

COMMENT ON COLUMN processing_jobs.options
    IS 'Processing options and configuration (JSON)';

COMMENT ON COLUMN processing_jobs.error
    IS 'Last error message if processing failed';