Pre-cleanup commit: Current state before service layer consolidation

This commit is contained in:
Jon
2025-08-01 14:57:56 -04:00
parent 95c92946de
commit f453efb0f8
21 changed files with 2560 additions and 363 deletions

View File

@@ -1,89 +1,76 @@
-- Enable the pgvector extension. The `vector(1536)` columns and the ivfflat /
-- vector_cosine_ops index declared later in this script depend on it.
-- IF NOT EXISTS keeps the statement safe to re-run.
CREATE EXTENSION IF NOT EXISTS vector;
-- Table document_chunks: one row per chunk of a document, with vector support.
-- IF NOT EXISTS makes this a no-op when the table is already present.
CREATE TABLE IF NOT EXISTS document_chunks (
    id           UUID         PRIMARY KEY DEFAULT gen_random_uuid(),
    document_id  VARCHAR(255) NOT NULL,
    chunk_index  INTEGER      NOT NULL,
    content      TEXT         NOT NULL,
    -- OpenAI embeddings are 1536 dimensions
    embedding    vector(1536),
    metadata     JSONB        DEFAULT '{}',
    created_at   TIMESTAMP    DEFAULT CURRENT_TIMESTAMP,
    updated_at   TIMESTAMP    DEFAULT CURRENT_TIMESTAMP
);
-- Secondary indexes on document_chunks.
-- Lookup by owning document (btree is the default access method, spelled out here).
CREATE INDEX IF NOT EXISTS document_chunks_document_id_idx
    ON document_chunks
    USING btree (document_id);

-- Approximate nearest-neighbour index on the embedding column using cosine distance.
CREATE INDEX IF NOT EXISTS document_chunks_embedding_idx
    ON document_chunks
    USING ivfflat (embedding vector_cosine_ops);
-- enable_pgvector(): wrapper intended for RPC calls that installs the pgvector
-- extension when it is not already present. Takes no arguments, returns nothing.
CREATE OR REPLACE FUNCTION enable_pgvector()
    RETURNS void
    LANGUAGE plpgsql
AS $enable_pgvector$
BEGIN
    CREATE EXTENSION IF NOT EXISTS vector;
END;
$enable_pgvector$;
-- create_document_chunks_table(): wrapper intended for RPC calls that creates the
-- document_chunks table and its two indexes when they do not already exist.
-- Takes no arguments, returns nothing; every statement uses IF NOT EXISTS.
CREATE OR REPLACE FUNCTION create_document_chunks_table()
    RETURNS void
    LANGUAGE plpgsql
AS $create_document_chunks_table$
BEGIN
    -- Table definition.
    CREATE TABLE IF NOT EXISTS document_chunks (
        id           UUID         PRIMARY KEY DEFAULT gen_random_uuid(),
        document_id  VARCHAR(255) NOT NULL,
        chunk_index  INTEGER      NOT NULL,
        content      TEXT         NOT NULL,
        embedding    vector(1536),
        metadata     JSONB        DEFAULT '{}',
        created_at   TIMESTAMP    DEFAULT CURRENT_TIMESTAMP,
        updated_at   TIMESTAMP    DEFAULT CURRENT_TIMESTAMP
    );

    -- Lookup by owning document.
    CREATE INDEX IF NOT EXISTS document_chunks_document_id_idx
        ON document_chunks (document_id);

    -- Cosine-distance ivfflat index on the embedding column.
    CREATE INDEX IF NOT EXISTS document_chunks_embedding_idx
        ON document_chunks
        USING ivfflat (embedding vector_cosine_ops);
END;
$create_document_chunks_table$;
-- Create function to match documents based on vector similarity
-- Declared parameters: query_embedding vector(1536), match_threshold (default 0.7),
-- match_count (default 10).
-- NOTE(review): this span does not parse as one statement. `document_id` appears
-- twice in RETURNS TABLE (once as `UUID NOT NULL`, which is column-definition
-- syntax, not an output-column declaration), and the lines following `AS $$` are
-- column definitions terminated by `);` rather than a function body closed by `$$`.
-- It looks like a match_documents definition and a document_chunks column list
-- from two versions of the file were interleaved (presumably a diff rendering with
-- the +/- markers stripped) - TODO confirm against the real file before running.
CREATE OR REPLACE FUNCTION match_documents(
query_embedding vector(1536),
match_threshold float DEFAULT 0.7,
match_count int DEFAULT 10
)
RETURNS TABLE(
id UUID,
document_id UUID NOT NULL,
content TEXT,
metadata JSONB,
document_id VARCHAR(255),
similarity FLOAT
) AS $$
embedding VECTOR(1536),
chunk_index INTEGER,
section TEXT,
page_number INTEGER,
created_at TIMESTAMPTZ DEFAULT NOW(),
updated_at TIMESTAMPTZ DEFAULT NOW()
);
-- Table vector_similarity_searches: stores similarity-search records (query text and
-- embedding, filters, results, limit/threshold and timing columns).
-- Presumably one row per executed search - confirm against the writer.
-- Only id and created_at carry defaults; every other column is nullable.
CREATE TABLE IF NOT EXISTS vector_similarity_searches (
    id                    UUID         PRIMARY KEY DEFAULT gen_random_uuid(),
    user_id               UUID,
    query_text            TEXT,
    query_embedding       VECTOR(1536),
    search_results        JSONB,
    filters               JSONB,
    limit_count           INTEGER,
    similarity_threshold  REAL,
    processing_time_ms    INTEGER,
    created_at            TIMESTAMPTZ  DEFAULT NOW()
);
-- count_distinct_documents(): number of distinct document_id values present in
-- document_chunks, returned as INTEGER (0 when the table is empty).
CREATE OR REPLACE FUNCTION count_distinct_documents()
    RETURNS INTEGER
    LANGUAGE plpgsql
AS $count_distinct_documents$
DECLARE
    distinct_total INTEGER;
BEGIN
    -- COUNT() yields BIGINT; the assignment narrows it to the INTEGER return type.
    SELECT COUNT(DISTINCT dc.document_id)
      INTO distinct_total
      FROM document_chunks AS dc;

    RETURN distinct_total;
END;
$count_distinct_documents$;
-- average_chunk_size(): mean character length of document_chunks.content,
-- returned as INTEGER. Yields NULL when the table has no rows.
CREATE OR REPLACE FUNCTION average_chunk_size()
    RETURNS INTEGER
    LANGUAGE plpgsql
AS $average_chunk_size$
DECLARE
    mean_length INTEGER;
BEGIN
    -- AVG() yields NUMERIC; the assignment converts it to the INTEGER return type.
    SELECT AVG(LENGTH(dc.content))
      INTO mean_length
      FROM document_chunks AS dc;

    RETURN mean_length;
END;
$average_chunk_size$;
-- Create the function to get search analytics
-- Declared signature: (user_id_param UUID, days_param INTEGER) returning rows of
-- (query_text TEXT, search_count BIGINT).
-- The `vs.*` portion of the body counts rows of vector_similarity_searches per
-- query_text for the given user_id_param, restricted to created_at within the last
-- days_param days, ordered by search_count descending and capped at 20 rows.
-- NOTE(review): the body does not parse as written. The SELECT list starts with
-- document_chunks columns and a query filtered/limited by query_embedding,
-- match_threshold and match_count (none of which are parameters of this function),
-- and `LIMIT match_count;` terminates the statement before `vs.query_text,`.
-- It looks like the body of a match_documents-style query from another version of
-- the file was interleaved here (presumably a diff rendering with the +/- markers
-- stripped) - TODO confirm against the real file before running.
CREATE OR REPLACE FUNCTION get_search_analytics(user_id_param UUID, days_param INTEGER)
RETURNS TABLE(query_text TEXT, search_count BIGINT) AS $$
BEGIN
RETURN QUERY
SELECT
document_chunks.id,
document_chunks.content,
document_chunks.metadata,
document_chunks.document_id,
1 - (document_chunks.embedding <=> query_embedding) AS similarity
FROM document_chunks
WHERE 1 - (document_chunks.embedding <=> query_embedding) > match_threshold
ORDER BY document_chunks.embedding <=> query_embedding
LIMIT match_count;
vs.query_text,
COUNT(*) as search_count
FROM
vector_similarity_searches vs
WHERE
vs.user_id = user_id_param AND
vs.created_at >= NOW() - (days_param * INTERVAL '1 day')
GROUP BY
vs.query_text
ORDER BY
search_count DESC
LIMIT 20;
END;
$$ LANGUAGE plpgsql;
-- Enable Row Level Security (RLS) if needed
-- ALTER TABLE document_chunks ENABLE ROW LEVEL SECURITY;
-- Create policies for RLS (adjust as needed for your auth requirements)
-- CREATE POLICY "Users can view all document chunks" ON document_chunks FOR SELECT USING (true);
-- CREATE POLICY "Users can insert document chunks" ON document_chunks FOR INSERT WITH CHECK (true);
-- CREATE POLICY "Users can update document chunks" ON document_chunks FOR UPDATE USING (true);
-- CREATE POLICY "Users can delete document chunks" ON document_chunks FOR DELETE USING (true);
-- Privileges for the `authenticated` and `anon` roles.
-- Both roles receive every table privilege on document_chunks.
-- NOTE(review): ALL includes write privileges, and the row-level-security
-- statements in this script are commented out - confirm `anon` is meant to
-- have more than read access.
GRANT ALL PRIVILEGES ON TABLE document_chunks TO authenticated, anon;

-- Both roles may execute match_documents.
GRANT EXECUTE ON FUNCTION match_documents TO authenticated, anon;
-- get_vector_database_stats(): returns a single row of summary statistics.
--   total_chunks       - number of rows in document_chunks
--   total_documents    - number of distinct document_id values in document_chunks
--   average_similarity - mean of document_similarities.similarity_score over rows
--                        whose score is > 0; NULL when no such rows exist
-- NOTE(review): document_similarities is not created in the visible part of this
-- script - confirm it exists before calling this function.
CREATE OR REPLACE FUNCTION get_vector_database_stats()
RETURNS TABLE(total_chunks BIGINT, total_documents BIGINT, average_similarity REAL) AS $$
BEGIN
    RETURN QUERY
    SELECT
        (SELECT COUNT(*) FROM document_chunks),
        (SELECT COUNT(DISTINCT dc.document_id) FROM document_chunks AS dc),
        -- AVG() yields NUMERIC or DOUBLE PRECISION, never REAL, and RETURN QUERY
        -- requires each column to match the declared result type exactly. Without
        -- the cast the call fails with "structure of query does not match function
        -- result type".
        (SELECT AVG(ds.similarity_score)::REAL
           FROM document_similarities AS ds
          WHERE ds.similarity_score > 0);
END;
$$ LANGUAGE plpgsql;