feat: Add pre-deployment validation and deployment automation

- Add pre-deploy-check.sh script to validate .env doesn't contain secrets - Add clean-env-secrets.sh script to remove secrets from .env before deployment - Update deploy:firebase script to run validation automatically - Add sync-secrets npm script for local development - Add deploy:firebase:force for deployments that skip validation This prevents 'Secret environment variable overlaps non secret environment variable' errors by ensuring secrets defined via defineSecret() are not also in .env file. ## Completed Todos - ✅ Test financial extraction with Stax Holding Company CIM - All values correct (FY-3: $64M, FY-2: $71M, FY-1: $71M, LTM: $76M) - ✅ Implement deterministic parser fallback - Integrated into simpleDocumentProcessor - ✅ Implement few-shot examples - Added comprehensive examples for PRIMARY table identification - ✅ Fix primary table identification - Financial extraction now correctly identifies PRIMARY table (millions) vs subsidiary tables (thousands) ## Pending Todos 1. Review older commits (1-2 months ago) to see how financial extraction was working then - Check commits: 185c780 (Claude 3.7), 5b3b1bf (Document AI fixes), 0ec3d14 (multi-pass extraction) - Compare prompt simplicity - older versions may have had simpler, more effective prompts - Check if deterministic parser was being used more effectively 2. Review best practices for structured financial data extraction from PDFs/CIMs - Research: LLM prompt engineering for tabular data (few-shot examples, chain-of-thought) - Period identification strategies - Validation techniques - Hybrid approaches (deterministic + LLM) - Error handling patterns - Check academic papers and industry case studies 3. 
Determine how to reduce processing time without sacrificing accuracy - Options: 1) Use Claude Haiku 4.5 for initial extraction, Sonnet 4.5 for validation - 2) Parallel extraction of different sections - 3) Caching common patterns - 4) Streaming responses - 5) Incremental processing with early validation - 6) Reduce prompt verbosity while maintaining clarity 4. Add unit tests for financial extraction validation logic - Test: invalid value rejection, cross-period validation, numeric extraction - Period identification from various formats (years, FY-X, mixed) - Include edge cases: missing periods, projections mixed with historical, inconsistent formatting 5. Monitor production financial extraction accuracy - Track: extraction success rate, validation rejection rate, common error patterns - User feedback on extracted financial data - Set up alerts for validation failures and extraction inconsistencies 6. Optimize prompt size for financial extraction - Current prompts may be too verbose - Test shorter, more focused prompts that maintain accuracy - Consider: removing redundant instructions, using more concise examples, focusing on critical rules only 7. Add financial data visualization - Consider adding a financial data preview/validation step in the UI - Allow users to verify/correct extracted values if needed - Provides human-in-the-loop validation for critical financial data 8. Document extraction strategies - Document the different financial table formats found in CIMs - Create a reference guide for common patterns (years format, FY-X format, mixed format, etc.) - This will help with prompt engineering and parser improvements 9. Compare RAG-based extraction vs simple full-document extraction for financial accuracy - Determine which approach produces more accurate financial data and why - May need a hybrid approach 10. 
Add confidence scores to financial extraction results - Flag low-confidence extractions for manual review - Helps identify when extraction may be incorrect and needs human validation
2025-11-10 02:43:47 -05:00
parent 77df7c2101
commit 8b15732a98
3 changed files with 101 additions and 1 deletions
--- a/backend/package.json
+++ b/backend/package.json
@@ -15,7 +15,10 @@
    "db:migrate": "ts-node src/scripts/setup-database.ts",
    "db:seed": "ts-node src/models/seed.ts",
    "db:setup": "npm run db:migrate && node scripts/setup_supabase.js",
-    "deploy:firebase": "npm run build && firebase deploy --only functions",
+    "pre-deploy-check": "bash scripts/pre-deploy-check.sh",
+    "clean-env-secrets": "bash scripts/clean-env-secrets.sh",
+    "deploy:firebase": "npm run pre-deploy-check && npm run build && firebase deploy --only functions",
+    "deploy:firebase:force": "npm run build && firebase deploy --only functions",
    "deploy:cloud-run": "npm run build && gcloud run deploy cim-processor-backend --source . --region us-central1 --platform managed --allow-unauthenticated",
    "deploy:docker": "npm run build && docker build -t cim-processor-backend . && docker run -p 8080:8080 cim-processor-backend",
    "docker:build": "docker build -t cim-processor-backend .",
@@ -23,6 +26,7 @@
    "emulator": "firebase emulators:start --only functions",
    "emulator:ui": "firebase emulators:start --only functions --ui",
    "sync:config": "./scripts/sync-firebase-config.sh",
+    "sync-secrets": "ts-node src/scripts/sync-firebase-secrets-to-env.ts",
    "diagnose": "ts-node src/scripts/comprehensive-diagnostic.ts",
    "test:linkage": "ts-node src/scripts/test-linkage.ts",
    "test:postgres": "ts-node src/scripts/test-postgres-connection.ts",
--- a/backend/scripts/clean-env-secrets.sh
+++ b/backend/scripts/clean-env-secrets.sh
@@ -0,0 +1,48 @@
+#!/bin/bash
+# Remove secrets from .env file that should only be Firebase Secrets
+# This prevents conflicts during deployment
+# Works on ./.env in the current directory; a name is only processed when an uncommented NAME= line exists.
+set -e
+
+if [ ! -f .env ]; then
+  echo "No .env file found"
+  exit 0
+fi
+
+# Variable names whose assignments are deleted from .env (same names as the list in pre-deploy-check.sh)
+SECRETS=(
+  "ANTHROPIC_API_KEY"
+  "OPENAI_API_KEY"
+  "OPENROUTER_API_KEY"
+  "DATABASE_URL"
+  "SUPABASE_SERVICE_KEY"
+  "SUPABASE_ANON_KEY"
+  "EMAIL_PASS"
+)
+
+echo "🧹 Cleaning secrets from .env file..."
+# Keep a timestamped copy of .env before it is edited in place.
+BACKUP_FILE=".env.pre-clean-$(date +%Y%m%d-%H%M%S).bak"
+cp .env "$BACKUP_FILE"
+echo "📋 Backup created: $BACKUP_FILE"
+# NOTE(review): a retained backup still holds the removed secret values in plain text — confirm it is ignored by git and by deploys.
+REMOVED=0
+for secret in "${SECRETS[@]}"; do
+  if grep -q "^${secret}=" .env; then
+    # Delete every line starting with NAME= or with one or more '#' then NAME= ("# NAME=" with a space is not matched); sed's .env.tmp copy is removed right after.
+    sed -i.tmp "/^#*${secret}=/d" .env
+    rm -f .env.tmp
+    echo "  ✅ Removed ${secret}"
+    REMOVED=$((REMOVED + 1))
+  fi
+done
+# Summarize the run; the backup is deleted only when nothing was removed from .env.
+if [ $REMOVED -gt 0 ]; then
+  echo ""
+  echo "✅ Removed ${REMOVED} secret(s) from .env"
+  echo "💡 For local development, use: npm run sync-secrets"
+else
+  echo "✅ No secrets found in .env (already clean)"
+  rm "$BACKUP_FILE"
+fi
+
--- a/backend/scripts/pre-deploy-check.sh
+++ b/backend/scripts/pre-deploy-check.sh
@@ -0,0 +1,48 @@
+#!/bin/bash
+# Pre-deployment validation script
+# Checks for environment variable conflicts before deploying Firebase Functions
+# Exits 1 if ./.env has a line starting with NAME= for any NAME in SECRETS; exits 0 otherwise, including when .env is absent.
+set -e
+
+echo "🔍 Pre-deployment validation..."
+
+# List of secrets that should NOT be in .env (same names as the list in clean-env-secrets.sh)
+SECRETS=(
+  "ANTHROPIC_API_KEY"
+  "OPENAI_API_KEY"
+  "OPENROUTER_API_KEY"
+  "DATABASE_URL"
+  "SUPABASE_SERVICE_KEY"
+  "SUPABASE_ANON_KEY"
+  "EMAIL_PASS"
+)
+
+CONFLICTS=0
+
+if [ -f .env ]; then
+  echo "Checking .env file for secret conflicts..."
+  # The match is anchored at line start: commented-out (#NAME=) or indented assignments are not reported.
+  for secret in "${SECRETS[@]}"; do
+    if grep -q "^${secret}=" .env; then
+      echo "⚠️  CONFLICT: ${secret} is in .env but should only be a Firebase Secret"
+      CONFLICTS=$((CONFLICTS + 1))
+    fi
+  done
+  # Any conflict ends the script with a non-zero status, so a command chained with '&&' stops here.
+  if [ $CONFLICTS -gt 0 ]; then
+    echo ""
+    echo "❌ Found ${CONFLICTS} conflict(s). Please remove these from .env:"
+    echo ""
+    echo "For local development, use: npm run sync-secrets"
+    echo "This will temporarily add secrets to .env for local testing."
+    echo ""
+    echo "To fix now, run: npm run clean-env-secrets"
+    exit 1
+  fi
+else
+  echo "✅ No .env file found (this is fine for deployment)"
+fi
+
+echo "✅ Pre-deployment check passed!"
+exit 0
+