Files
cim_summary/backend/sql/fix_vector_search_timeout.sql
admin 5cfb136484 Add single-pass CIM processor: 2 LLM calls, ~2.5 min processing
New processing strategy `single_pass_quality_check` replaces the multi-pass
agentic RAG pipeline (15-25 min) with a streamlined 2-call approach:

1. Full-document LLM extraction (Sonnet) — single call with complete CIM text
2. Delta quality-check (Haiku) — reviews extraction, returns only corrections

Key changes:
- New singlePassProcessor.ts with extraction + quality check flow
- llmService: qualityCheckCIMDocument() with delta-only corrections array
- llmService: improved prompt requiring professional inferences for qualitative
  fields instead of defaulting to "Not specified in CIM"
- Removed deterministic financial parser from single-pass flow (LLM outperforms
  it — parser matched footnotes and narrative text as financials)
- Default strategy changed to single_pass_quality_check
- Completeness scoring with diagnostic logging of empty fields

Tested on 2 real CIMs: 100% completeness, correct financials, ~150s each.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-23 22:28:45 -05:00

66 lines
2.2 KiB
PL/pgSQL

-- Fix vector search timeout by pre-filtering on document_id BEFORE vector search
-- When document_id is provided, this avoids the full IVFFlat index scan (26K+ rows)
-- and instead computes distances on only ~80 chunks per document.
--
-- Drop old function signatures first: CREATE OR REPLACE cannot change a
-- function's result shape (RETURNS TABLE columns), so any prior definition
-- with a different return type must be dropped explicitly. IF EXISTS keeps
-- this migration idempotent when the old overloads are already gone.
DROP FUNCTION IF EXISTS match_document_chunks(vector(1536), float, int);
DROP FUNCTION IF EXISTS match_document_chunks(vector(1536), float, int, text);
-- Optimized vector search: branches on whether a document_id filter is
-- supplied, so the planner can pick a plan suited to each case.
CREATE OR REPLACE FUNCTION match_document_chunks (
    query_embedding vector(1536),
    match_threshold float,
    match_count int,
    filter_document_id text DEFAULT NULL
)
RETURNS TABLE (
    id UUID,
    document_id VARCHAR(255),
    content text,
    metadata JSONB,
    chunk_index INT,
    similarity float
)
LANGUAGE plpgsql STABLE
AS $$
BEGIN
    IF filter_document_id IS NULL THEN
        -- SLOW PATH: no document filter, so search every document. The
        -- ORDER BY <=> ... LIMIT pattern lets the IVFFlat index drive the
        -- nearest-neighbour scan across the whole table.
        RETURN QUERY
        SELECT
            dc.id,
            dc.document_id,
            dc.content,
            dc.metadata,
            dc.chunk_index,
            1 - (dc.embedding <=> query_embedding) AS similarity
        FROM document_chunks dc
        WHERE dc.embedding IS NOT NULL
          AND 1 - (dc.embedding <=> query_embedding) > match_threshold
        ORDER BY dc.embedding <=> query_embedding
        LIMIT match_count;
    ELSE
        -- FAST PATH: restrict to one document via the btree index on
        -- document_id, then compute cosine distances on just that
        -- document's chunks (~80 rows). This sidesteps the full IVFFlat
        -- index scan entirely.
        RETURN QUERY
        SELECT
            dc.id,
            dc.document_id,
            dc.content,
            dc.metadata,
            dc.chunk_index,
            1 - (dc.embedding <=> query_embedding) AS similarity
        FROM document_chunks dc
        WHERE dc.document_id = filter_document_id
          AND dc.embedding IS NOT NULL
          AND 1 - (dc.embedding <=> query_embedding) > match_threshold
        ORDER BY dc.embedding <=> query_embedding
        LIMIT match_count;
    END IF;
END;
$$;
-- Pin the comment to the exact signature: the bare-name form is only valid
-- while a single overload of match_document_chunks exists, and this schema
-- has historically carried multiple overloads.
COMMENT ON FUNCTION match_document_chunks(vector(1536), float, int, text) IS 'Vector search with fast document-scoped path. When filter_document_id is provided, uses btree index to pre-filter (~80 rows) instead of scanning the full IVFFlat index (26K+ rows).';