New processing strategy `single_pass_quality_check` replaces the multi-pass agentic RAG pipeline (15-25 min) with a streamlined 2-call approach: 1. Full-document LLM extraction (Sonnet) — single call with complete CIM text 2. Delta quality-check (Haiku) — reviews extraction, returns only corrections Key changes: - New singlePassProcessor.ts with extraction + quality check flow - llmService: qualityCheckCIMDocument() with delta-only corrections array - llmService: improved prompt requiring professional inferences for qualitative fields instead of defaulting to "Not specified in CIM" - Removed deterministic financial parser from single-pass flow (LLM outperforms it — parser matched footnotes and narrative text as financials) - Default strategy changed to single_pass_quality_check - Completeness scoring with diagnostic logging of empty fields Tested on 2 real CIMs: 100% completeness, correct financials, ~150s each. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
66 lines
2.2 KiB
PL/PgSQL
66 lines
2.2 KiB
PL/PgSQL
-- Fix vector search timeout by pre-filtering on document_id BEFORE vector search
|
|
-- When document_id is provided, this avoids the full IVFFlat index scan (26K+ rows)
|
|
-- and instead computes distances on only ~80 chunks per document.
|
|
|
|
-- Remove every earlier overload before the function is recreated below.
--   * The three-argument form has to go because the new definition gives its
--     fourth parameter a default; if both existed, a three-argument call
--     would match either one.
--   * Dropping the four-argument form first lets the definition below change
--     its result columns, which CREATE OR REPLACE on its own does not permit.
-- IF EXISTS keeps the migration re-runnable on databases that never had them.
DROP FUNCTION IF EXISTS match_document_chunks(
  vector(1536),
  double precision,
  integer
);

DROP FUNCTION IF EXISTS match_document_chunks(
  vector(1536),
  double precision,
  integer,
  text
);
|
|
|
|
-- Create optimized function that branches based on whether document_id is provided
|
|
-- match_document_chunks: cosine-similarity search over document_chunks.
--
-- Parameters
--   query_embedding     embedding to compare against document_chunks.embedding
--   match_threshold     only rows whose similarity (1 - cosine distance) is
--                       strictly greater than this value are returned
--   match_count         maximum number of rows returned (passed to LIMIT)
--   filter_document_id  optional; when non-NULL the search is restricted to
--                       chunks of that one document
--
-- Returns
--   One row per matching chunk (id, document_id, content, metadata,
--   chunk_index, similarity), ordered from most to least similar.
--
-- Notes
--   * Rows with a NULL embedding are never returned.
--   * The document-scoped branch uses WITH ... AS MATERIALIZED, which needs
--     PostgreSQL 12 or newer.
CREATE OR REPLACE FUNCTION match_document_chunks (
  query_embedding vector(1536),
  match_threshold float,
  match_count int,
  filter_document_id text DEFAULT NULL
)
RETURNS TABLE (
  id UUID,
  document_id VARCHAR(255),
  content text,
  metadata JSONB,
  chunk_index INT,
  similarity float
)
LANGUAGE plpgsql STABLE
AS $$
BEGIN
  IF filter_document_id IS NOT NULL THEN
    -- FAST PATH: restrict to one document first, then rank only those rows.
    --
    -- The pre-filter lives in a MATERIALIZED CTE on purpose. A flat
    -- "WHERE document_id = ... ORDER BY embedding <=> q LIMIT n" still has
    -- the shape an approximate (IVFFlat) index can serve, so the planner may
    -- pick that index and apply the document filter afterwards, which can
    -- return fewer rows than requested. Ordering by the precomputed
    -- "distance" column of a materialized CTE cannot use the vector index,
    -- so this branch always does an exact search over the document's chunks.
    -- NOTE(review): the original comments cite ~80 chunks per document and a
    -- btree index on document_id; neither is visible here, confirm.
    RETURN QUERY
    WITH scoped AS MATERIALIZED (
      SELECT
        dc.id,
        dc.document_id,
        dc.content,
        dc.metadata,
        dc.chunk_index,
        -- Cosine distance, computed once and reused for filter and ordering.
        dc.embedding <=> query_embedding AS distance
      FROM document_chunks dc
      WHERE dc.document_id = filter_document_id
        AND dc.embedding IS NOT NULL
    )
    SELECT
      scoped.id,
      scoped.document_id,
      scoped.content,
      scoped.metadata,
      scoped.chunk_index,
      1 - scoped.distance AS similarity
    FROM scoped
    WHERE 1 - scoped.distance > match_threshold
    ORDER BY scoped.distance
    LIMIT match_count;
  ELSE
    -- SLOW PATH: no document filter, so search every chunk.
    -- The ORDER BY is kept directly on "embedding <=> query_embedding"
    -- because that is the expression form a vector index can serve.
    RETURN QUERY
    SELECT
      dc.id,
      dc.document_id,
      dc.content,
      dc.metadata,
      dc.chunk_index,
      1 - (dc.embedding <=> query_embedding) AS similarity
    FROM document_chunks dc
    WHERE dc.embedding IS NOT NULL
      AND 1 - (dc.embedding <=> query_embedding) > match_threshold
    ORDER BY dc.embedding <=> query_embedding
    LIMIT match_count;
  END IF;
END;
$$;
|
|
|
|
-- Attach the catalog description to the four-argument function. The argument
-- list is spelled out so the statement still resolves to exactly one function
-- if another overload of match_document_chunks is ever present.
COMMENT ON FUNCTION match_document_chunks(vector(1536), float, int, text) IS 'Vector search with fast document-scoped path. When filter_document_id is provided, uses btree index to pre-filter (~80 rows) instead of scanning the full IVFFlat index (26K+ rows).';
|