-- Migration: vector storage and similarity search for document chunks.
--
-- Requires the pgvector extension. Two objects referenced below are not
-- defined in this script and must already exist when it runs:
--   * documents(id)               -- target of the document_id foreign key
--   * update_updated_at_column()  -- trigger function used in section 3
-- Every statement is written to be safe to re-run.

-- Enable the pgvector extension
CREATE EXTENSION IF NOT EXISTS vector;

-- 1. Create document_chunks table
CREATE TABLE IF NOT EXISTS document_chunks (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    document_id UUID NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
    content TEXT NOT NULL,
    embedding VECTOR(1536), -- OpenAI text-embedding-3-small uses 1536 dimensions
    metadata JSONB,
    chunk_index INTEGER NOT NULL,
    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
);

CREATE INDEX IF NOT EXISTS idx_document_chunks_document_id
    ON document_chunks(document_id);

CREATE INDEX IF NOT EXISTS idx_document_chunks_created_at
    ON document_chunks(created_at);

-- Use IVFFlat index for faster similarity search.
-- The index is named explicitly and guarded with IF NOT EXISTS so that
-- re-running this script does not add another duplicate index each time.
-- The name follows PostgreSQL's default <table>_<column>_idx pattern so an
-- index created by an earlier, unnamed run of this statement is recognised
-- and skipped rather than duplicated.
-- NOTE(review): pgvector recommends building IVFFlat indexes once the table
-- holds representative data; consider REINDEX after the initial bulk load.
CREATE INDEX IF NOT EXISTS document_chunks_embedding_idx
    ON document_chunks
    USING ivfflat (embedding vector_cosine_ops)
    WITH (lists = 100);

-- 2. Create match_document_chunks function
--
-- Returns at most match_count chunks whose cosine similarity to
-- query_embedding is strictly greater than match_threshold, most similar
-- first.
--   query_embedding  vector compared against document_chunks.embedding
--   match_threshold  exclusive lower bound on similarity
--   match_count      maximum number of rows returned
-- similarity is computed as 1 - (embedding <=> query_embedding), where <=> is
-- pgvector's cosine distance operator. Rows whose embedding is NULL never
-- match, because the comparison in WHERE evaluates to NULL for them.
CREATE OR REPLACE FUNCTION match_document_chunks (
    query_embedding vector(1536),
    match_threshold float,
    match_count int
)
RETURNS TABLE (
    id UUID,
    document_id UUID,
    content text,
    metadata JSONB,
    chunk_index INT,
    similarity float
)
LANGUAGE sql STABLE
AS $$
    SELECT
        document_chunks.id,
        document_chunks.document_id,
        document_chunks.content,
        document_chunks.metadata,
        document_chunks.chunk_index,
        1 - (document_chunks.embedding <=> query_embedding) AS similarity
    FROM document_chunks
    WHERE 1 - (document_chunks.embedding <=> query_embedding) > match_threshold
    -- Order by the raw distance, ascending. This is the same ordering as
    -- "similarity DESC", but pgvector can only serve the sort from the
    -- IVFFlat index when ORDER BY is the distance operator itself; sorting on
    -- the derived similarity expression forces a full scan and sort.
    -- When the index is used, results are approximate nearest neighbours.
    ORDER BY document_chunks.embedding <=> query_embedding ASC
    LIMIT match_count;
$$;

-- 3. Create trigger for updated_at
-- CREATE TRIGGER has no IF NOT EXISTS form, so drop any trigger left by a
-- previous run first; otherwise re-running this script fails here.
-- NOTE(review): update_updated_at_column() is defined elsewhere; presumably it
-- sets NEW.updated_at to the current time -- confirm against its definition.
DROP TRIGGER IF EXISTS update_document_chunks_updated_at ON document_chunks;

CREATE TRIGGER update_document_chunks_updated_at
    BEFORE UPDATE ON document_chunks
    FOR EACH ROW
    EXECUTE FUNCTION update_updated_at_column();