-- NOTE(review): the header below was pasted in from a commit message and file-viewer
-- metadata ("111 lines / 4.1 KiB / PL/PgSQL"); it is preserved as a comment so the
-- script remains valid SQL.
--
-- Cloud-Native Architecture: Firebase Functions deployment (no Docker), Supabase
-- database (replacing local PostgreSQL), Google Cloud Storage integration,
-- Document AI + Agentic RAG processing pipeline, Claude-3.5-Sonnet LLM integration.
-- Full BPCP CIM Review Template (7 sections): Deal Overview; Business Description;
-- Market & Industry Analysis; Financial Summary (with historical financials table);
-- Management Team Overview; Preliminary Investment Thesis; Key Questions & Next Steps.
-- Cloud Migration Improvements: PostgreSQL -> Supabase; local storage -> Google Cloud
-- Storage; Docker deployment -> Firebase Functions; schema mapping fixes
-- (camelCase/snake_case); enhanced error handling and logging; vector database with
-- fallback mechanisms.
-- End-to-End Cloud Workflow: 1. Upload PDF -> Document AI extraction; 2. Agentic RAG
-- processing -> structured CIM data; 3. Store in Supabase -> vector embeddings;
-- 4. Auto-generate PDF -> full BPCP template; 5. Download complete CIM review.
-- Supabase Vector Database Setup for CIM Document Processor
-- This script creates the document_chunks table with vector search capabilities.

-- Enable the pgvector extension, which provides the VECTOR type and the
-- distance operators (<=>, <->, <#>) used by the similarity search below.
CREATE EXTENSION IF NOT EXISTS vector;
-- Create the document_chunks table.
-- Each row stores one chunk of an uploaded document together with its embedding,
-- so the RAG pipeline can retrieve the most relevant chunks for a query.
CREATE TABLE IF NOT EXISTS document_chunks (
    id UUID DEFAULT gen_random_uuid() PRIMARY KEY,
    document_id TEXT NOT NULL,                          -- identifier of the parent document
    content TEXT NOT NULL,                              -- raw text of this chunk
    embedding VECTOR(1536),                             -- OpenAI embedding dimensions (1536); nullable until computed
    metadata JSONB DEFAULT '{}',                        -- free-form per-chunk metadata
    chunk_index INTEGER NOT NULL,                       -- ordinal position of the chunk within its document
    created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
    updated_at TIMESTAMP WITH TIME ZONE DEFAULT NOW()   -- maintained by trigger (see below)
);
-- Create indexes for better performance.
CREATE INDEX IF NOT EXISTS idx_document_chunks_document_id ON document_chunks(document_id);
CREATE INDEX IF NOT EXISTS idx_document_chunks_chunk_index ON document_chunks(chunk_index);

-- Approximate-nearest-neighbor index for cosine similarity search.
-- NOTE: ivfflat recall depends on the data present at index build time; with
-- lists = 100, consider rebuilding (REINDEX) after bulk-loading embeddings.
CREATE INDEX IF NOT EXISTS idx_document_chunks_embedding ON document_chunks USING ivfflat (embedding vector_cosine_ops) WITH (lists = 100);
-- Create a function to automatically update the updated_at timestamp.
-- Intended for use as a BEFORE UPDATE row-level trigger: stamps the row
-- being written with the current transaction time.
CREATE OR REPLACE FUNCTION update_updated_at_column()
RETURNS TRIGGER AS $$
BEGIN
    NEW.updated_at = NOW();
    RETURN NEW;
END;
$$ LANGUAGE plpgsql;  -- unquoted language name; quoted form ('plpgsql') is deprecated
-- Create trigger to automatically update updated_at on every row update.
-- DROP first so this script stays idempotent (CREATE TRIGGER has no IF NOT EXISTS).
DROP TRIGGER IF EXISTS update_document_chunks_updated_at ON document_chunks;
CREATE TRIGGER update_document_chunks_updated_at
    BEFORE UPDATE ON document_chunks
    FOR EACH ROW
    EXECUTE FUNCTION update_updated_at_column();
-- Create vector similarity search function.
-- Returns up to match_count chunks whose cosine similarity to query_embedding
-- exceeds match_threshold, most similar first.
--   similarity = 1 - cosine_distance, so it ranges from -1 to 1 (1 = identical direction).
-- Rows with a NULL embedding are excluded automatically (NULL comparisons are not > threshold).
CREATE OR REPLACE FUNCTION match_document_chunks(
    query_embedding VECTOR(1536),
    match_threshold FLOAT DEFAULT 0.7,  -- minimum similarity to include a chunk
    match_count INTEGER DEFAULT 10      -- maximum number of chunks returned
)
RETURNS TABLE (
    id UUID,
    document_id TEXT,
    content TEXT,
    metadata JSONB,
    chunk_index INTEGER,
    similarity FLOAT
)
LANGUAGE SQL STABLE
AS $$
    SELECT
        document_chunks.id,
        document_chunks.document_id,
        document_chunks.content,
        document_chunks.metadata,
        document_chunks.chunk_index,
        1 - (document_chunks.embedding <=> query_embedding) AS similarity
    FROM document_chunks
    WHERE 1 - (document_chunks.embedding <=> query_embedding) > match_threshold
    -- Ordering by raw distance (ascending) lets the ivfflat index drive the scan.
    ORDER BY document_chunks.embedding <=> query_embedding
    LIMIT match_count;
$$;
-- Create RLS policies for security.
-- NOTE(review): these policies grant every authenticated user full access to all
-- chunks — there is no per-user/per-document ownership check. Confirm this matches
-- the intended multi-tenancy model; if not, add a document ownership predicate.
ALTER TABLE document_chunks ENABLE ROW LEVEL SECURITY;

-- Policy to allow authenticated users to read chunks.
-- TO authenticated restricts evaluation to that role up front (Supabase best practice);
-- the auth.role() check is kept for defense in depth.
CREATE POLICY "Users can view document chunks" ON document_chunks
    FOR SELECT TO authenticated USING (auth.role() = 'authenticated');

-- Policy to allow authenticated users to insert chunks.
CREATE POLICY "Users can insert document chunks" ON document_chunks
    FOR INSERT TO authenticated WITH CHECK (auth.role() = 'authenticated');

-- Policy to allow authenticated users to update chunks.
CREATE POLICY "Users can update document chunks" ON document_chunks
    FOR UPDATE TO authenticated USING (auth.role() = 'authenticated');

-- Policy to allow authenticated users to delete chunks.
CREATE POLICY "Users can delete document chunks" ON document_chunks
    FOR DELETE TO authenticated USING (auth.role() = 'authenticated');
-- Grant necessary permissions.
-- anon gets read-only access; authenticated and service_role may also write.
GRANT USAGE ON SCHEMA public TO postgres, anon, authenticated, service_role;
GRANT ALL ON TABLE document_chunks TO postgres, service_role;
GRANT SELECT ON TABLE document_chunks TO anon, authenticated;
GRANT INSERT, UPDATE, DELETE ON TABLE document_chunks TO authenticated, service_role;

-- Grant execute permissions on the search function.
-- The argument signature is spelled out so this GRANT stays unambiguous if an
-- overload of match_document_chunks is ever added.
GRANT EXECUTE ON FUNCTION match_document_chunks(VECTOR(1536), FLOAT, INTEGER) TO postgres, anon, authenticated, service_role;
-- Create some sample data for testing (optional — uncomment to use).
-- INSERT INTO document_chunks (document_id, content, chunk_index, metadata)
-- VALUES
--     ('test-doc-1', 'This is a test chunk of content for vector search.', 1, '{"test": true}'),
--     ('test-doc-1', 'Another chunk of content from the same document.', 2, '{"test": true}');

-- Display table info so the operator can visually confirm the schema was created.
SELECT
    column_name,
    data_type,
    is_nullable,
    column_default
FROM information_schema.columns
WHERE table_name = 'document_chunks'
ORDER BY ordinal_position;