Pre-cleanup commit: Current state before service layer consolidation

2025-08-01 14:57:56 -04:00
parent 95c92946de
commit f453efb0f8
21 changed files with 2560 additions and 363 deletions
--- a/backend/supabase_setup.sql
+++ b/backend/supabase_setup.sql
@@ -1,89 +1,76 @@
-- Enable the pgvector extension
-CREATE EXTENSION IF NOT EXISTS vector;
-
-- Create document_chunks table with vector support
+-- Create the document_chunks table (chunk text, metadata and a 1536-dimension embedding per row)
 CREATE TABLE IF NOT EXISTS document_chunks (
  id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
-  document_id VARCHAR(255) NOT NULL,
-  chunk_index INTEGER NOT NULL,
-  content TEXT NOT NULL,
-  embedding vector(1536), -- OpenAI embeddings are 1536 dimensions
-  metadata JSONB DEFAULT '{}',
-  created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
-  updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
-);
-
-- Create indexes for better performance
-CREATE INDEX IF NOT EXISTS document_chunks_document_id_idx ON document_chunks(document_id);
-CREATE INDEX IF NOT EXISTS document_chunks_embedding_idx ON document_chunks USING ivfflat (embedding vector_cosine_ops);
-
-- Create function to enable pgvector (for RPC calls)
-CREATE OR REPLACE FUNCTION enable_pgvector()
-RETURNS VOID AS $$
-BEGIN
-  CREATE EXTENSION IF NOT EXISTS vector;
-END;
-$$ LANGUAGE plpgsql;
-
-- Create function to create document_chunks table (for RPC calls)
-CREATE OR REPLACE FUNCTION create_document_chunks_table()
-RETURNS VOID AS $$
-BEGIN
-  CREATE TABLE IF NOT EXISTS document_chunks (
-    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
-    document_id VARCHAR(255) NOT NULL,
-    chunk_index INTEGER NOT NULL,
-    content TEXT NOT NULL,
-    embedding vector(1536),
-    metadata JSONB DEFAULT '{}',
-    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
-    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
-  );
-  
-  CREATE INDEX IF NOT EXISTS document_chunks_document_id_idx ON document_chunks(document_id);
-  CREATE INDEX IF NOT EXISTS document_chunks_embedding_idx ON document_chunks USING ivfflat (embedding vector_cosine_ops);
-END;
-$$ LANGUAGE plpgsql;
-
-- Create function to match documents based on vector similarity
-CREATE OR REPLACE FUNCTION match_documents(
-  query_embedding vector(1536),
-  match_threshold float DEFAULT 0.7,
-  match_count int DEFAULT 10
-)
-RETURNS TABLE(
-  id UUID,
+  document_id UUID NOT NULL, -- changed from VARCHAR(255) by this commit
  content TEXT,
  metadata JSONB,
-  document_id VARCHAR(255),
-  similarity FLOAT
-) AS $$
+  embedding VECTOR(1536), -- pgvector type; NOTE(review): this change removes CREATE EXTENSION vector above, confirm the extension is enabled elsewhere
+  chunk_index INTEGER,
+  section TEXT,
+  page_number INTEGER,
+  created_at TIMESTAMPTZ DEFAULT NOW(),
+  updated_at TIMESTAMPTZ DEFAULT NOW() -- NOTE(review): default is applied on INSERT only; nothing visible here refreshes it on UPDATE
+);
+
+-- Create the vector_similarity_searches table.
+-- get_search_analytics reads user_id, query_text and created_at from it;
+-- the remaining columns are stored as-is and not read by any function
+-- visible in this file.
+CREATE TABLE IF NOT EXISTS vector_similarity_searches (
+  id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+  user_id UUID,
+  query_text TEXT,
+  query_embedding VECTOR(1536), -- requires the pgvector extension
+  search_results JSONB,
+  filters JSONB,
+  limit_count INTEGER,
+  similarity_threshold REAL,
+  processing_time_ms INTEGER,
+  created_at TIMESTAMPTZ DEFAULT NOW()
+);
+
+-- get_search_analytics filters on user_id plus a created_at range; without
+-- this index every call scans the whole table. IF NOT EXISTS keeps the
+-- script safe to re-run.
+CREATE INDEX IF NOT EXISTS vector_similarity_searches_user_id_created_at_idx
+  ON vector_similarity_searches (user_id, created_at);
+
+-- Create the function to count distinct documents.
+-- Returns the number of different document_id values present in
+-- document_chunks (0 when the table has no rows).
+CREATE OR REPLACE FUNCTION count_distinct_documents()
+RETURNS INTEGER AS $$
+DECLARE
+  distinct_total INTEGER;
+BEGIN
+  -- COUNT() yields BIGINT; assigning it to the INTEGER local applies the
+  -- same coercion the declared return type would.
+  SELECT COUNT(DISTINCT dc.document_id)
+  INTO distinct_total
+  FROM document_chunks dc;
+
+  RETURN distinct_total;
+END;
+$$ LANGUAGE plpgsql;
+
+-- Create the function to get the average chunk size.
+-- Returns the mean LENGTH(content) over document_chunks, coerced to the
+-- declared INTEGER return type; NULL when no row has non-NULL content.
+CREATE OR REPLACE FUNCTION average_chunk_size()
+RETURNS INTEGER AS $$
+DECLARE
+  mean_length NUMERIC;
+BEGIN
+  SELECT AVG(LENGTH(dc.content))
+  INTO mean_length
+  FROM document_chunks dc;
+
+  -- The NUMERIC mean is converted to INTEGER when it is returned.
+  RETURN mean_length;
+END;
+$$ LANGUAGE plpgsql;
+
+-- Create the function to get search analytics: the 20 most frequent query_text values logged for user_id_param within the last days_param days
+CREATE OR REPLACE FUNCTION get_search_analytics(user_id_param UUID, days_param INTEGER)
+RETURNS TABLE(query_text TEXT, search_count BIGINT) AS $$
 BEGIN
  RETURN QUERY
  SELECT
-    document_chunks.id,
-    document_chunks.content,
-    document_chunks.metadata,
-    document_chunks.document_id,
-    1 - (document_chunks.embedding <=> query_embedding) AS similarity
-  FROM document_chunks
-  WHERE 1 - (document_chunks.embedding <=> query_embedding) > match_threshold
-  ORDER BY document_chunks.embedding <=> query_embedding
-  LIMIT match_count;
+    vs.query_text, -- qualified with the table alias because query_text is also an output column name
+    COUNT(*) as search_count
+  FROM
+    vector_similarity_searches vs
+  WHERE
+    vs.user_id = user_id_param AND
+    vs.created_at >= NOW() - (days_param * INTERVAL '1 day') -- only rows created in the last days_param days
+  GROUP BY
+    vs.query_text
+  ORDER BY
+    search_count DESC -- most frequent first; order among equal counts is unspecified
+  LIMIT 20; -- fixed cap on the number of rows returned
 END;
 $$ LANGUAGE plpgsql;

-- Enable Row Level Security (RLS) if needed
-- ALTER TABLE document_chunks ENABLE ROW LEVEL SECURITY;
-
-- Create policies for RLS (adjust as needed for your auth requirements)
-- CREATE POLICY "Users can view all document chunks" ON document_chunks FOR SELECT USING (true);
-- CREATE POLICY "Users can insert document chunks" ON document_chunks FOR INSERT WITH CHECK (true);
-- CREATE POLICY "Users can update document chunks" ON document_chunks FOR UPDATE USING (true);
-- CREATE POLICY "Users can delete document chunks" ON document_chunks FOR DELETE USING (true);
-
-- Grant necessary permissions
-GRANT ALL ON document_chunks TO authenticated;
-GRANT ALL ON document_chunks TO anon;
-GRANT EXECUTE ON FUNCTION match_documents TO authenticated;
-GRANT EXECUTE ON FUNCTION match_documents TO anon;
+-- Create the function to get vector database stats.
+-- Returns a single row:
+--   total_chunks       - number of rows in document_chunks
+--   total_documents    - number of distinct document_id values in document_chunks
+--   average_similarity - mean of the positive similarity_score values in
+--                        document_similarities (NULL when there are none)
+-- NOTE(review): document_similarities is not created in the visible part of
+-- this file; confirm it exists before this function is called.
+CREATE OR REPLACE FUNCTION get_vector_database_stats()
+RETURNS TABLE(total_chunks BIGINT, total_documents BIGINT, average_similarity REAL) AS $$
+BEGIN
+  RETURN QUERY
+  SELECT
+    (SELECT COUNT(*) FROM document_chunks),
+    (SELECT COUNT(DISTINCT dc.document_id) FROM document_chunks dc),
+    -- RETURN QUERY requires every column type to match the declared result
+    -- type exactly. AVG() yields NUMERIC or DOUBLE PRECISION, never REAL, so
+    -- without this cast the call fails with "structure of query does not
+    -- match function result type".
+    (SELECT CAST(AVG(ds.similarity_score) AS REAL)
+     FROM document_similarities ds
+     WHERE ds.similarity_score > 0);
+END;
+$$ LANGUAGE plpgsql;