Files
cim_summary/DATABASE_SCHEMA_DOCUMENTATION.md

19 KiB

Database Schema Documentation

Complete Database Structure for CIM Document Processor

🎯 Overview

This document provides comprehensive documentation of the database schema for the CIM Document Processor, including all tables, relationships, indexes, and data structures.


🗄️ Database Architecture

Technology Stack

  • Database: PostgreSQL (via Supabase)
  • ORM: Supabase Client (TypeScript)
  • Migrations: SQL migration files
  • Backup: Supabase automated backups

Database Features

  • JSONB Support: For flexible analysis data storage
  • UUID Primary Keys: For secure document identification
  • Row Level Security: For user data isolation
  • Full-Text Search: For document content search
  • Vector Storage: For AI embeddings and similarity search

📊 Core Tables

Documents Table

Purpose: Primary table for storing document metadata and processing results

-- Primary store for uploaded documents, extraction output, and analysis results.
CREATE TABLE documents (
  id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
  user_id TEXT NOT NULL,                    -- owner; FK to users(id) is added separately via ALTER TABLE
  original_file_name TEXT NOT NULL,
  file_path TEXT NOT NULL,                  -- storage path (Supabase storage)
  file_size INTEGER NOT NULL,               -- bytes; a later CHECK caps this at 100 MiB
  status TEXT NOT NULL DEFAULT 'uploaded',  -- NOTE(review): plain TEXT — the document_status enum defined later is never applied; confirm intent
  extracted_text TEXT,
  generated_summary TEXT,
  summary_pdf_path TEXT,
  analysis_data JSONB,                      -- flexible analysis payload; GIN-indexed below
  error_message TEXT,
  created_at TIMESTAMP DEFAULT NOW(),       -- NOTE(review): naive TIMESTAMP; TIMESTAMPTZ in UTC is usually preferred — confirm
  updated_at TIMESTAMP DEFAULT NOW()
);

Columns:

  • id - Unique document identifier (UUID)
  • user_id - User who owns the document
  • original_file_name - Original uploaded file name
  • file_path - Storage path for the document
  • file_size - File size in bytes
  • status - Processing status (uploaded, processing, completed, failed, cancelled)
  • extracted_text - Text extracted from document
  • generated_summary - AI-generated summary
  • summary_pdf_path - Path to generated PDF report
  • analysis_data - Structured analysis results (JSONB)
  • error_message - Error message if processing failed
  • created_at - Document creation timestamp
  • updated_at - Last update timestamp

Indexes:

-- Common access paths: per-user listing, status filtering, recency sort.
CREATE INDEX idx_documents_user_id ON documents(user_id);
CREATE INDEX idx_documents_status ON documents(status);
CREATE INDEX idx_documents_created_at ON documents(created_at);
-- GIN index enables containment/key-existence queries (@>, ?) on the JSONB payload.
CREATE INDEX idx_documents_analysis_data ON documents USING GIN (analysis_data);

Users Table

Purpose: User authentication and profile information

-- Auth/profile table; primary key is the external auth provider's uid.
CREATE TABLE users (
  id TEXT PRIMARY KEY,                 -- external auth uid (TEXT, not UUID)
  name TEXT,
  email TEXT UNIQUE NOT NULL,          -- UNIQUE implies a unique index on email
  created_at TIMESTAMP DEFAULT NOW(),
  updated_at TIMESTAMP DEFAULT NOW()
);

Columns:

  • id - Firebase user ID (note: the RLS policies compare this column to Supabase's auth.uid(), so the Firebase UID and Supabase auth UID must be the same value)
  • name - User display name
  • email - User email address
  • created_at - Account creation timestamp
  • updated_at - Last update timestamp

Indexes:

CREATE INDEX idx_users_email ON users(email);

Processing Jobs Table

Purpose: Background job tracking and management

-- Background-job queue; one row per job with retry accounting.
CREATE TABLE processing_jobs (
  id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
  document_id UUID REFERENCES documents(id) ON DELETE CASCADE,  -- jobs are removed with their document
  user_id TEXT NOT NULL,
  job_type TEXT NOT NULL,
  status TEXT NOT NULL DEFAULT 'pending',
  priority INTEGER DEFAULT 0,       -- higher value = scheduled first (see priority DESC index)
  attempts INTEGER DEFAULT 0,       -- retry counter
  max_attempts INTEGER DEFAULT 3,   -- give up after this many attempts
  started_at TIMESTAMP,
  completed_at TIMESTAMP,
  error_message TEXT,
  result_data JSONB,
  created_at TIMESTAMP DEFAULT NOW(),
  updated_at TIMESTAMP DEFAULT NOW()
);

Columns:

  • id - Unique job identifier
  • document_id - Associated document
  • user_id - User who initiated the job
  • job_type - Type of processing job
  • status - Job status (pending, running, completed, failed)
  • priority - Job priority (higher = more important)
  • attempts - Number of processing attempts
  • max_attempts - Maximum allowed attempts
  • started_at - Job start timestamp
  • completed_at - Job completion timestamp
  • error_message - Error message if failed
  • result_data - Job result data (JSONB)
  • created_at - Job creation timestamp
  • updated_at - Last update timestamp

Indexes:

-- Lookup paths: by document, by user, by status, and by scheduling priority.
CREATE INDEX idx_processing_jobs_document_id ON processing_jobs(document_id);
CREATE INDEX idx_processing_jobs_user_id ON processing_jobs(user_id);
CREATE INDEX idx_processing_jobs_status ON processing_jobs(status);
CREATE INDEX idx_processing_jobs_priority ON processing_jobs(priority);

🤖 AI Processing Tables

Agentic RAG Sessions Table

Purpose: Track AI processing sessions and results

-- One row per agentic-RAG run over a document: progress counters, cost, and results.
CREATE TABLE agentic_rag_sessions (
  id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
  document_id UUID REFERENCES documents(id) ON DELETE CASCADE,
  user_id TEXT NOT NULL,
  strategy TEXT NOT NULL,
  status TEXT NOT NULL DEFAULT 'pending',
  total_agents INTEGER DEFAULT 0,
  completed_agents INTEGER DEFAULT 0,
  failed_agents INTEGER DEFAULT 0,
  overall_validation_score DECIMAL(3,2),  -- presumably a 0.00–1.00 normalized score — confirm scale
  processing_time_ms INTEGER,
  api_calls_count INTEGER DEFAULT 0,
  total_cost DECIMAL(10,4),               -- currency unit not specified in schema — TODO confirm
  reasoning_steps JSONB,
  final_result JSONB,
  created_at TIMESTAMP DEFAULT NOW(),
  completed_at TIMESTAMP
);

Columns:

  • id - Unique session identifier
  • document_id - Associated document
  • user_id - User who initiated processing
  • strategy - Processing strategy used
  • status - Session status
  • total_agents - Total number of AI agents
  • completed_agents - Successfully completed agents
  • failed_agents - Failed agents
  • overall_validation_score - Quality validation score
  • processing_time_ms - Total processing time
  • api_calls_count - Number of API calls made
  • total_cost - Total cost of processing
  • reasoning_steps - AI reasoning process (JSONB)
  • final_result - Final analysis result (JSONB)
  • created_at - Session creation timestamp
  • completed_at - Session completion timestamp

Indexes:

-- Lookup paths: by document, by user, by status, and by strategy.
CREATE INDEX idx_agentic_rag_sessions_document_id ON agentic_rag_sessions(document_id);
CREATE INDEX idx_agentic_rag_sessions_user_id ON agentic_rag_sessions(user_id);
CREATE INDEX idx_agentic_rag_sessions_status ON agentic_rag_sessions(status);
CREATE INDEX idx_agentic_rag_sessions_strategy ON agentic_rag_sessions(strategy);

Agent Executions Table

Purpose: Track individual AI agent executions

-- One row per individual agent run within a session; cascades with the session.
CREATE TABLE agent_executions (
  id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
  session_id UUID REFERENCES agentic_rag_sessions(id) ON DELETE CASCADE,
  agent_name TEXT NOT NULL,
  agent_type TEXT NOT NULL,
  status TEXT NOT NULL DEFAULT 'pending',
  input_data JSONB,
  output_data JSONB,
  error_message TEXT,
  execution_time_ms INTEGER,
  api_calls INTEGER DEFAULT 0,
  cost DECIMAL(10,4),            -- same (unspecified) currency unit as sessions.total_cost
  validation_score DECIMAL(3,2), -- presumably 0.00–1.00 — confirm scale
  created_at TIMESTAMP DEFAULT NOW(),
  completed_at TIMESTAMP
);

Columns:

  • id - Unique execution identifier
  • session_id - Associated processing session
  • agent_name - Name of the AI agent
  • agent_type - Type of agent
  • status - Execution status
  • input_data - Input data for agent (JSONB)
  • output_data - Output data from agent (JSONB)
  • error_message - Error message if failed
  • execution_time_ms - Execution time in milliseconds
  • api_calls - Number of API calls made
  • cost - Cost of this execution
  • validation_score - Quality validation score
  • created_at - Execution creation timestamp
  • completed_at - Execution completion timestamp

Indexes:

-- Lookup paths: by session, by agent name, by status.
CREATE INDEX idx_agent_executions_session_id ON agent_executions(session_id);
CREATE INDEX idx_agent_executions_agent_name ON agent_executions(agent_name);
CREATE INDEX idx_agent_executions_status ON agent_executions(status);

Quality Metrics Table

Purpose: Track quality metrics for AI processing

-- Per-session quality measurements with pass/fail against a threshold.
CREATE TABLE quality_metrics (
  id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
  session_id UUID REFERENCES agentic_rag_sessions(id) ON DELETE CASCADE,
  metric_name TEXT NOT NULL,
  metric_value DECIMAL(10,4),      -- NOTE(review): a later CHECK restricts this to [0, 1] despite the wide type
  metric_type TEXT NOT NULL,
  threshold_value DECIMAL(10,4),
  passed BOOLEAN,                  -- whether metric_value cleared threshold_value
  details JSONB,
  created_at TIMESTAMP DEFAULT NOW()
);

Columns:

  • id - Unique metric identifier
  • session_id - Associated processing session
  • metric_name - Name of the quality metric
  • metric_value - Actual metric value
  • metric_type - Type of metric (accuracy, completeness, etc.)
  • threshold_value - Threshold for passing
  • passed - Whether metric passed threshold
  • details - Additional metric details (JSONB)
  • created_at - Metric creation timestamp

Indexes:

-- Lookup paths: by session, by metric name, by pass/fail.
CREATE INDEX idx_quality_metrics_session_id ON quality_metrics(session_id);
CREATE INDEX idx_quality_metrics_metric_name ON quality_metrics(metric_name);
CREATE INDEX idx_quality_metrics_passed ON quality_metrics(passed);

🔍 Vector Database Tables

Document Chunks Table

Purpose: Store document chunks with vector embeddings

-- Chunked document text with pgvector embeddings for similarity search.
CREATE TABLE document_chunks (
  id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
  document_id UUID REFERENCES documents(id) ON DELETE CASCADE,
  chunk_index INTEGER NOT NULL,   -- ordering within the document; 0- vs 1-based not specified — confirm
  content TEXT NOT NULL,
  embedding VECTOR(1536),         -- 1536 dims; matches OpenAI text-embedding models — presumably, confirm
  metadata JSONB,
  created_at TIMESTAMP DEFAULT NOW()
);

Columns:

  • id - Unique chunk identifier
  • document_id - Associated document
  • chunk_index - Sequential chunk index
  • content - Chunk text content
  • embedding - Vector embedding (1536 dimensions)
  • metadata - Chunk metadata (JSONB)
  • created_at - Chunk creation timestamp

Indexes:

CREATE INDEX idx_document_chunks_document_id ON document_chunks(document_id);
CREATE INDEX idx_document_chunks_chunk_index ON document_chunks(chunk_index);
-- ivfflat ANN index with cosine distance; uses the default lists=100.
-- NOTE(review): tune WITH (lists = N) for table size, and build/rebuild the
-- index after data load — centroids chosen on an empty table give poor recall.
CREATE INDEX idx_document_chunks_embedding ON document_chunks USING ivfflat (embedding vector_cosine_ops);

Search Analytics Table

Purpose: Track vector search usage and performance

-- Append-only log of vector-search requests for usage/performance tracking.
CREATE TABLE search_analytics (
  id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
  user_id TEXT NOT NULL,
  query_text TEXT NOT NULL,
  results_count INTEGER,
  search_time_ms INTEGER,
  success BOOLEAN,
  error_message TEXT,    -- populated only when success is false — presumably; confirm
  created_at TIMESTAMP DEFAULT NOW()
);

Columns:

  • id - Unique search identifier
  • user_id - User who performed search
  • query_text - Search query text
  • results_count - Number of results returned
  • search_time_ms - Search execution time
  • success - Whether search was successful
  • error_message - Error message if failed
  • created_at - Search timestamp

Indexes:

-- Lookup paths: by user, by time window, by success flag.
CREATE INDEX idx_search_analytics_user_id ON search_analytics(user_id);
CREATE INDEX idx_search_analytics_created_at ON search_analytics(created_at);
CREATE INDEX idx_search_analytics_success ON search_analytics(success);

📈 Analytics Tables

Performance Metrics Table

Purpose: Track system performance metrics

-- Time-series store for system performance samples.
CREATE TABLE performance_metrics (
  id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
  metric_name TEXT NOT NULL,
  metric_value DECIMAL(10,4),
  metric_unit TEXT,        -- e.g. 'ms', 'bytes' — value set not defined in schema; confirm
  tags JSONB,
  -- NOTE(review): "timestamp" is also a SQL type name; legal unquoted in
  -- Postgres but easy to misread — consider recorded_at in a future migration.
  timestamp TIMESTAMP DEFAULT NOW()
);

Columns:

  • id - Unique metric identifier
  • metric_name - Name of the performance metric
  • metric_value - Metric value
  • metric_unit - Unit of measurement
  • tags - Additional tags (JSONB)
  • timestamp - Metric timestamp

Indexes:

-- Lookup paths: by metric name and by time window.
CREATE INDEX idx_performance_metrics_name ON performance_metrics(metric_name);
CREATE INDEX idx_performance_metrics_timestamp ON performance_metrics(timestamp);

Usage Analytics Table

Purpose: Track user usage patterns

-- Append-only audit log of user actions (includes PII: IP and user agent).
CREATE TABLE usage_analytics (
  id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
  user_id TEXT NOT NULL,
  action_type TEXT NOT NULL,
  action_details JSONB,
  ip_address INET,       -- NOTE(review): PII — confirm retention/anonymization policy
  user_agent TEXT,
  created_at TIMESTAMP DEFAULT NOW()
);

Columns:

  • id - Unique analytics identifier
  • user_id - User who performed action
  • action_type - Type of action performed
  • action_details - Action details (JSONB)
  • ip_address - User IP address
  • user_agent - User agent string
  • created_at - Action timestamp

Indexes:

-- Lookup paths: by user, by action type, by time window.
CREATE INDEX idx_usage_analytics_user_id ON usage_analytics(user_id);
CREATE INDEX idx_usage_analytics_action_type ON usage_analytics(action_type);
CREATE INDEX idx_usage_analytics_created_at ON usage_analytics(created_at);

🔗 Table Relationships

Primary Relationships

erDiagram
    users ||--o{ documents : "owns"
    documents ||--o{ processing_jobs : "has"
    documents ||--o{ agentic_rag_sessions : "has"
    agentic_rag_sessions ||--o{ agent_executions : "contains"
    agentic_rag_sessions ||--o{ quality_metrics : "has"
    documents ||--o{ document_chunks : "contains"
    users ||--o{ search_analytics : "performs"
    users ||--o{ usage_analytics : "generates"

Foreign Key Constraints

-- Documents table constraints
-- documents.user_id is the only relationship NOT declared inline in its
-- CREATE TABLE, so it is the only ALTER needed here.
ALTER TABLE documents ADD CONSTRAINT fk_documents_user_id 
  FOREIGN KEY (user_id) REFERENCES users(id) ON DELETE CASCADE;

-- The remaining relationships — processing_jobs.document_id,
-- agentic_rag_sessions.document_id, agent_executions.session_id,
-- quality_metrics.session_id, and document_chunks.document_id — are already
-- declared inline via "REFERENCES ... ON DELETE CASCADE" in their CREATE TABLE
-- statements above. Re-adding them with ALTER TABLE would create a second,
-- duplicate FK constraint on each column, so they are intentionally omitted.

🔐 Row Level Security (RLS)

Documents Table RLS

-- Enable RLS
-- Once enabled, non-owner roles are denied by default until a policy grants access.
ALTER TABLE documents ENABLE ROW LEVEL SECURITY;

-- Policy: Users can only access their own documents
-- NOTE(review): auth.uid() is the Supabase auth id, while users.id is
-- documented as a Firebase uid — these must carry the same value for the
-- policies to match any rows; confirm the auth integration guarantees this.
CREATE POLICY "Users can view own documents" ON documents
  FOR SELECT USING (auth.uid()::text = user_id);

CREATE POLICY "Users can insert own documents" ON documents
  FOR INSERT WITH CHECK (auth.uid()::text = user_id);

CREATE POLICY "Users can update own documents" ON documents
  FOR UPDATE USING (auth.uid()::text = user_id);

CREATE POLICY "Users can delete own documents" ON documents
  FOR DELETE USING (auth.uid()::text = user_id);

Processing Jobs Table RLS

-- Enable RLS
-- Once enabled, non-owner roles are denied by default until a policy grants access.
ALTER TABLE processing_jobs ENABLE ROW LEVEL SECURITY;

-- Policy: Users can only access their own jobs
CREATE POLICY "Users can view own jobs" ON processing_jobs
  FOR SELECT USING (auth.uid()::text = user_id);

CREATE POLICY "Users can insert own jobs" ON processing_jobs
  FOR INSERT WITH CHECK (auth.uid()::text = user_id);

CREATE POLICY "Users can update own jobs" ON processing_jobs
  FOR UPDATE USING (auth.uid()::text = user_id);

-- Added for parity with the documents table: without a DELETE policy,
-- RLS silently blocks every user-initiated delete on this table.
CREATE POLICY "Users can delete own jobs" ON processing_jobs
  FOR DELETE USING (auth.uid()::text = user_id);

📊 Data Types and Constraints

Status Enums

-- Document status enum
-- NOTE(review): these three enum types are defined but never applied — the
-- tables above all declare their status columns as plain TEXT. Either alter
-- the columns to use these types or enforce the value sets with CHECK
-- constraints; as written nothing stops invalid status strings.
CREATE TYPE document_status AS ENUM (
  'uploaded',
  'processing',
  'completed',
  'failed',
  'cancelled'
);

-- Job status enum
CREATE TYPE job_status AS ENUM (
  'pending',
  'running',
  'completed',
  'failed',
  'cancelled'
);

-- Session status enum
CREATE TYPE session_status AS ENUM (
  'pending',
  'processing',
  'completed',
  'failed',
  'cancelled'
);

Check Constraints

-- File size constraint (104857600 bytes = 100 MiB upload cap)
ALTER TABLE documents ADD CONSTRAINT check_file_size 
  CHECK (file_size > 0 AND file_size <= 104857600);

-- Processing time constraint (NULL passes a CHECK; only non-negative values allowed)
ALTER TABLE agentic_rag_sessions ADD CONSTRAINT check_processing_time 
  CHECK (processing_time_ms >= 0);

-- Validation score constraint
-- NOTE(review): this bounds EVERY quality_metrics.metric_value to [0, 1],
-- although the column is DECIMAL(10,4) and metrics are described as generic
-- (accuracy, completeness, etc.) — confirm all metric types are truly
-- normalized scores before applying.
ALTER TABLE quality_metrics ADD CONSTRAINT check_validation_score 
  CHECK (metric_value >= 0 AND metric_value <= 1);

🔄 Migration Scripts

Initial Schema Migration

-- Migration: 001_create_initial_schema.sql
-- Bootstraps the two core tables (users, documents) plus baseline indexes.
BEGIN;

-- Create users table
CREATE TABLE users (
  id TEXT PRIMARY KEY,
  name TEXT,
  email TEXT UNIQUE NOT NULL,
  created_at TIMESTAMP DEFAULT NOW(),
  updated_at TIMESTAMP DEFAULT NOW()
);

-- Create documents table
CREATE TABLE documents (
  id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
  -- NOTE(review): no FK to users(id) in this migration — it is added
  -- elsewhere; confirm migration ordering so the constraint is not missed.
  user_id TEXT NOT NULL,
  original_file_name TEXT NOT NULL,
  file_path TEXT NOT NULL,
  file_size INTEGER NOT NULL,
  status TEXT NOT NULL DEFAULT 'uploaded',
  extracted_text TEXT,
  generated_summary TEXT,
  summary_pdf_path TEXT,
  analysis_data JSONB,
  error_message TEXT,
  created_at TIMESTAMP DEFAULT NOW(),
  updated_at TIMESTAMP DEFAULT NOW()
);

-- Create indexes
CREATE INDEX idx_documents_user_id ON documents(user_id);
CREATE INDEX idx_documents_status ON documents(status);
CREATE INDEX idx_documents_created_at ON documents(created_at);

-- Enable RLS
-- NOTE(review): RLS is enabled here but no policies are created in this
-- migration, so all non-owner access is denied until the policies land.
ALTER TABLE documents ENABLE ROW LEVEL SECURITY;

COMMIT;

Add Vector Support Migration

-- Migration: 002_add_vector_support.sql
-- Adds pgvector and the chunk table used for similarity search.
BEGIN;

-- Enable vector extension (pgvector: provides the VECTOR type and ANN index methods)
CREATE EXTENSION IF NOT EXISTS vector;

-- Create document chunks table
CREATE TABLE document_chunks (
  id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
  document_id UUID REFERENCES documents(id) ON DELETE CASCADE,
  chunk_index INTEGER NOT NULL,
  content TEXT NOT NULL,
  embedding VECTOR(1536),
  metadata JSONB,
  created_at TIMESTAMP DEFAULT NOW()
);

-- Create vector indexes
CREATE INDEX idx_document_chunks_document_id ON document_chunks(document_id);
-- NOTE(review): an ivfflat index built on an empty table picks poor centroids;
-- create or rebuild it after the table holds representative data.
CREATE INDEX idx_document_chunks_embedding ON document_chunks USING ivfflat (embedding vector_cosine_ops);

COMMIT;

📈 Performance Optimization

Query Optimization

-- Optimize document queries with composite indexes
-- NOTE(review): a composite index whose leading column is user_id also serves
-- plain user_id lookups, so the earlier single-column user_id indexes become
-- redundant once these exist — consider dropping them.
CREATE INDEX idx_documents_user_status ON documents(user_id, status);
CREATE INDEX idx_documents_user_created ON documents(user_id, created_at DESC);

-- Optimize processing job queries
CREATE INDEX idx_processing_jobs_user_status ON processing_jobs(user_id, status);
CREATE INDEX idx_processing_jobs_priority_status ON processing_jobs(priority DESC, status);

-- Optimize analytics queries
CREATE INDEX idx_usage_analytics_user_action ON usage_analytics(user_id, action_type);
CREATE INDEX idx_performance_metrics_name_time ON performance_metrics(metric_name, timestamp DESC);

Partitioning Strategy

-- Partition documents table by creation date.
-- PARTITION OF only works when the parent was created as a partitioned table;
-- an existing regular table cannot be partitioned in place. The parent must be
-- declared (or recreated and data migrated) as:
--   CREATE TABLE documents ( ... ) PARTITION BY RANGE (created_at);
-- Note: in a partitioned table the primary key must include the partition key
-- (e.g. PRIMARY KEY (id, created_at)).
CREATE TABLE documents_2024 PARTITION OF documents
  FOR VALUES FROM ('2024-01-01') TO ('2025-01-01');

CREATE TABLE documents_2025 PARTITION OF documents
  FOR VALUES FROM ('2025-01-01') TO ('2026-01-01');

-- Catch-all partition so inserts outside the defined ranges do not fail.
CREATE TABLE documents_default PARTITION OF documents DEFAULT;

🔍 Monitoring and Maintenance

Database Health Queries

-- Check table sizes (total = heap + indexes + TOAST)
SELECT
  relname AS table_name,
  pg_size_pretty(pg_total_relation_size(relid)) AS total_size
FROM pg_catalog.pg_statio_user_tables
ORDER BY pg_total_relation_size(relid) DESC;

-- Check column statistics (distinct-value estimates and physical correlation;
-- the previous version of this query was mislabeled as a table-size check)
SELECT 
  schemaname,
  tablename,
  attname,
  n_distinct,
  correlation
FROM pg_stats
WHERE tablename = 'documents';

-- Check index usage
SELECT 
  schemaname,
  tablename,
  indexname,
  idx_scan,
  idx_tup_read,
  idx_tup_fetch
FROM pg_stat_user_indexes
WHERE tablename = 'documents';

-- Check slow queries (requires the pg_stat_statements extension; the timing
-- columns were renamed total_exec_time / mean_exec_time in PostgreSQL 13+)
SELECT 
  query,
  calls,
  total_exec_time,
  mean_exec_time,
  rows
FROM pg_stat_statements
WHERE query LIKE '%documents%'
ORDER BY mean_exec_time DESC
LIMIT 10;

Maintenance Procedures

-- Vacuum and analyze tables.
-- VACUUM ANALYZE reclaims dead tuples AND refreshes planner statistics in one
-- pass, so a separate ANALYZE step for the same tables (as previously listed
-- here) is redundant and has been removed.
VACUUM ANALYZE documents;
VACUUM ANALYZE processing_jobs;
VACUUM ANALYZE agentic_rag_sessions;

This comprehensive database schema documentation provides complete information about the database structure, relationships, and optimization strategies for the CIM Document Processor.