-- Supabase Vector Database Setup for CIM Document Processor -- This script creates the document_chunks table with vector search capabilities -- Enable the pgvector extension for vector operations CREATE EXTENSION IF NOT EXISTS vector; -- Create the document_chunks table CREATE TABLE IF NOT EXISTS document_chunks ( id UUID DEFAULT gen_random_uuid() PRIMARY KEY, document_id TEXT NOT NULL, content TEXT NOT NULL, embedding VECTOR(1536), -- OpenAI embedding dimensions metadata JSONB DEFAULT '{}', chunk_index INTEGER NOT NULL, created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(), updated_at TIMESTAMP WITH TIME ZONE DEFAULT NOW() ); -- Create indexes for better performance CREATE INDEX IF NOT EXISTS idx_document_chunks_document_id ON document_chunks(document_id); CREATE INDEX IF NOT EXISTS idx_document_chunks_chunk_index ON document_chunks(chunk_index); CREATE INDEX IF NOT EXISTS idx_document_chunks_embedding ON document_chunks USING ivfflat (embedding vector_cosine_ops) WITH (lists = 100); -- Create a function to automatically update the updated_at timestamp CREATE OR REPLACE FUNCTION update_updated_at_column() RETURNS TRIGGER AS $$ BEGIN NEW.updated_at = NOW(); RETURN NEW; END; $$ language 'plpgsql'; -- Create trigger to automatically update updated_at DROP TRIGGER IF EXISTS update_document_chunks_updated_at ON document_chunks; CREATE TRIGGER update_document_chunks_updated_at BEFORE UPDATE ON document_chunks FOR EACH ROW EXECUTE FUNCTION update_updated_at_column(); -- Create vector similarity search function CREATE OR REPLACE FUNCTION match_document_chunks( query_embedding VECTOR(1536), match_threshold FLOAT DEFAULT 0.7, match_count INTEGER DEFAULT 10 ) RETURNS TABLE ( id UUID, document_id TEXT, content TEXT, metadata JSONB, chunk_index INTEGER, similarity FLOAT ) LANGUAGE SQL STABLE AS $$ SELECT document_chunks.id, document_chunks.document_id, document_chunks.content, document_chunks.metadata, document_chunks.chunk_index, 1 - (document_chunks.embedding <=> query_embedding) AS similarity FROM document_chunks WHERE 1 - (document_chunks.embedding <=> query_embedding) > match_threshold ORDER BY document_chunks.embedding <=> query_embedding LIMIT match_count; $$; -- Create RLS policies for security ALTER TABLE document_chunks ENABLE ROW LEVEL SECURITY; -- Policy to allow authenticated users to read chunks CREATE POLICY "Users can view document chunks" ON document_chunks FOR SELECT USING (auth.role() = 'authenticated'); -- Policy to allow authenticated users to insert chunks CREATE POLICY "Users can insert document chunks" ON document_chunks FOR INSERT WITH CHECK (auth.role() = 'authenticated'); -- Policy to allow authenticated users to update their chunks CREATE POLICY "Users can update document chunks" ON document_chunks FOR UPDATE USING (auth.role() = 'authenticated'); -- Policy to allow authenticated users to delete chunks CREATE POLICY "Users can delete document chunks" ON document_chunks FOR DELETE USING (auth.role() = 'authenticated'); -- Grant necessary permissions GRANT USAGE ON SCHEMA public TO postgres, anon, authenticated, service_role; GRANT ALL ON TABLE document_chunks TO postgres, service_role; GRANT SELECT ON TABLE document_chunks TO anon, authenticated; GRANT INSERT, UPDATE, DELETE ON TABLE document_chunks TO authenticated, service_role; -- Grant execute permissions on the search function GRANT EXECUTE ON FUNCTION match_document_chunks TO postgres, anon, authenticated, service_role; -- Create some sample data for testing (optional) -- INSERT INTO document_chunks (document_id, content, chunk_index, metadata) -- VALUES -- ('test-doc-1', 'This is a test chunk of content for vector search.', 1, '{"test": true}'), -- ('test-doc-1', 'Another chunk of content from the same document.', 2, '{"test": true}'); -- Display table info SELECT column_name, data_type, is_nullable, column_default FROM information_schema.columns WHERE table_name = 'document_chunks' ORDER BY ordinal_position;