/*
Major release with significant performance improvements and new processing strategy.

## Core Changes
- Implemented simple_full_document processing strategy (default)
- Full document → LLM approach: 1-2 passes, ~5-6 minutes processing time
- Achieved 100% completeness with 2 API calls (down from 5+)
- Removed redundant Document AI passes for faster processing

## Financial Data Extraction
- Enhanced deterministic financial table parser
- Improved FY3/FY2/FY1/LTM identification from varying CIM formats
- Automatic merging of parser results with LLM extraction

## Code Quality & Infrastructure
- Cleaned up debug logging (removed emoji markers from production code)
- Fixed Firebase Secrets configuration (using modern defineSecret approach)
- Updated OpenAI API key
- Resolved deployment conflicts (secrets vs environment variables)
- Added .env files to Firebase ignore list

## Deployment
- Firebase Functions v2 deployment successful
- All 7 required secrets verified and configured
- Function URL: https://api-y56ccs6wva-uc.a.run.app

## Performance Improvements
- Processing time: ~5-6 minutes (down from 23+ minutes)
- API calls: 1-2 (down from 5+)
- Completeness: 100% achievable
- LLM Model: claude-3-7-sonnet-latest

## Breaking Changes
- Default processing strategy changed to 'simple_full_document'
- RAG processor available as alternative strategy 'document_ai_agentic_rag'

## Files Changed
- 36 files changed, 5642 insertions(+), 4451 deletions(-)
- Removed deprecated documentation files
- Cleaned up unused services and models

This release represents a major refactoring focused on speed, accuracy, and maintainability.
*/
-- 58 lines · 1.8 KiB · PL/pgSQL
-- Enable the pgvector extension.
-- It supplies the VECTOR column type, the <=> distance operator and the
-- ivfflat index access method that the statements below rely on.
CREATE EXTENSION IF NOT EXISTS vector;

-- 1. Create document_chunks table
--
-- Each row stores one chunk belonging to a row of documents: its text
-- content, an optional embedding vector, optional JSONB metadata, an integer
-- chunk_index, and creation / update timestamps that default to the insert time.
CREATE TABLE IF NOT EXISTS document_chunks (
    id          UUID         DEFAULT gen_random_uuid(),
    document_id UUID         NOT NULL,
    content     TEXT         NOT NULL,
    embedding   VECTOR(1536),            -- OpenAI text-embedding-3-small uses 1536 dimensions
    metadata    JSONB,
    chunk_index INTEGER      NOT NULL,
    created_at  TIMESTAMPTZ  DEFAULT CURRENT_TIMESTAMP,
    updated_at  TIMESTAMPTZ  DEFAULT CURRENT_TIMESTAMP,

    PRIMARY KEY (id),
    -- Deleting a documents row also deletes all of its chunks.
    FOREIGN KEY (document_id) REFERENCES documents (id) ON DELETE CASCADE
);

-- B-tree indexes on the parent-document key and on the creation timestamp.
CREATE INDEX IF NOT EXISTS idx_document_chunks_document_id
    ON document_chunks USING btree (document_id);

CREATE INDEX IF NOT EXISTS idx_document_chunks_created_at
    ON document_chunks USING btree (created_at);

-- Use IVFFlat index for faster similarity search.
-- vector_cosine_ops indexes cosine distance, i.e. the <=> operator.
--
-- The index is named and guarded with IF NOT EXISTS, like the other indexes
-- in this script, so that re-running the script does not build another
-- duplicate index on the same column.
-- NOTE(review): an index created by an earlier run of the unnamed form has an
-- auto-generated name and is not detected by this guard; drop it manually if
-- one exists.
-- NOTE(review): pgvector recommends building IVFFlat indexes after the table
-- holds data and sizing lists from the row count; lists = 100 is kept as-is.
CREATE INDEX IF NOT EXISTS idx_document_chunks_embedding
    ON document_chunks USING ivfflat (embedding vector_cosine_ops)
    WITH (lists = 100);

-- 2. Create match_document_chunks function
--
-- Returns the chunks most similar to query_embedding, most similar first.
--
-- Parameters:
--   query_embedding  1536-dimension vector to compare each chunk against.
--   match_threshold  exclusive lower bound on similarity; only rows with
--                    similarity > match_threshold are returned.
--   match_count      maximum number of rows returned.
--
-- Returns one row per matching chunk: id, document_id, content, metadata,
-- chunk_index and similarity, where similarity = 1 - cosine distance (<=>).
-- Rows whose embedding is NULL never match, because the comparison is NULL.
--
-- Ordering is done on the raw distance expression, ascending, rather than on
-- the derived similarity alias, descending. Both give the same ranking, but
-- pgvector only uses an approximate index (such as an ivfflat index with
-- vector_cosine_ops) for ORDER BY <distance operator> ASC combined with LIMIT.
-- NOTE(review): when the planner does use the index, results are approximate
-- and the threshold is applied after the index scan, so fewer than
-- match_count rows may be returned; tune ivfflat.probes if recall matters.
CREATE OR REPLACE FUNCTION match_document_chunks (
    query_embedding vector(1536),
    match_threshold float,
    match_count int
)
RETURNS TABLE (
    id UUID,
    document_id UUID,
    content text,
    metadata JSONB,
    chunk_index INT,
    similarity float
)
LANGUAGE sql STABLE
AS $$
    SELECT
        dc.id,
        dc.document_id,
        dc.content,
        dc.metadata,
        dc.chunk_index,
        1 - (dc.embedding <=> query_embedding) AS similarity
    FROM document_chunks AS dc
    -- Same predicate as the returned similarity, so every returned row
    -- satisfies similarity > match_threshold exactly.
    WHERE 1 - (dc.embedding <=> query_embedding) > match_threshold
    ORDER BY (dc.embedding <=> query_embedding) ASC
    LIMIT match_count;
$$;

-- 3. Create trigger for updated_at
--
-- Calls update_updated_at_column() before every row-level UPDATE on
-- document_chunks.
-- NOTE(review): update_updated_at_column() is not defined in the visible part
-- of this script and must already exist; presumably it sets NEW.updated_at,
-- confirm against its definition.
--
-- CREATE TRIGGER has no IF NOT EXISTS form and fails when the trigger already
-- exists, so the trigger is dropped first to keep the script re-runnable.
DROP TRIGGER IF EXISTS update_document_chunks_updated_at ON document_chunks;

CREATE TRIGGER update_document_chunks_updated_at
    BEFORE UPDATE ON document_chunks
    FOR EACH ROW
    EXECUTE FUNCTION update_updated_at_column();