feat(02-03): create alertService with deduplication and email
- evaluateAndAlert() iterates ProbeResults and skips healthy probes
- Maps 'down' -> 'service_down', 'degraded' -> 'service_degraded'
- Deduplication via AlertEventModel.findRecentByService with configurable cooldown
- Creates alert_events row before sending email (suppression skips both)
- Recipient read from process.env.EMAIL_WEEKLY_RECIPIENT (never hardcoded)
- createTransporter() called inside function scope (Firebase Secret timing fix)
- Email failures caught and logged, never re-thrown
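For orientation, the model surface this service calls is not part of this commit. Below is a minimal sketch inferred purely from the call sites in alertService.ts; the field and method shapes are assumptions, not the real definition in ../models/AlertEventModel:

// Hypothetical sketch of the AlertEventModel surface assumed by alertService.
// Shapes inferred from call sites in this commit; the real model lives elsewhere.
interface AlertEventRow {
  id: string;
  service_name: string;
  alert_type: 'service_down' | 'service_degraded';
  message: string;
  created_at: string; // timestamp of the alert row, used for cooldown checks
}

interface AlertEventModelShape {
  // Most recent alert for (service, type) inside the cooldown window, else null
  findRecentByService(
    serviceName: string,
    alertType: string,
    cooldownMinutes: number
  ): Promise<AlertEventRow | null>;
  // Inserts a new alert_events row; return type is an assumption (unused by callers)
  create(input: { service_name: string; alert_type: string; message: string }): Promise<AlertEventRow>;
}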
backend/sql/check_table_sizes.sql (new file, 109 lines)
@@ -0,0 +1,109 @@
-- ============================================================
-- CHECK TABLE SIZES - Run in Supabase SQL Editor
-- ============================================================
-- Part 1: Shows all public tables with sizes (auto-discovers)
-- Part 2: Cleanup candidate counts (only for tables that exist)
-- ============================================================

-- PART 1: All public table sizes
SELECT
  c.relname AS table_name,
  pg_size_pretty(pg_total_relation_size(c.oid)) AS total_size,
  pg_size_pretty(pg_relation_size(c.oid)) AS data_size,
  pg_size_pretty(pg_total_relation_size(c.oid) - pg_relation_size(c.oid)) AS index_size,
  c.reltuples::bigint AS estimated_rows
FROM pg_class c
JOIN pg_namespace n ON n.oid = c.relnamespace
WHERE n.nspname = 'public'
  AND c.relkind = 'r'
ORDER BY pg_total_relation_size(c.oid) DESC;

-- PART 2: Cleanup candidates (safe — checks table existence before querying)
DO $$
DECLARE
  rec RECORD;
  row_count bigint;
  cleanup_count bigint;
  query text;
BEGIN
  RAISE NOTICE '--- CLEANUP CANDIDATE BREAKDOWN ---';

  -- Processing jobs
  IF EXISTS (SELECT 1 FROM pg_class c JOIN pg_namespace n ON n.oid = c.relnamespace WHERE n.nspname = 'public' AND c.relname = 'processing_jobs') THEN
    SELECT count(*), count(*) FILTER (WHERE status IN ('completed', 'failed') AND completed_at < NOW() - INTERVAL '30 days')
    INTO row_count, cleanup_count FROM processing_jobs;
    RAISE NOTICE 'processing_jobs: % total, % cleanup candidates (completed/failed > 30d)', row_count, cleanup_count;
  END IF;

  -- Vector similarity searches
  IF EXISTS (SELECT 1 FROM pg_class c JOIN pg_namespace n ON n.oid = c.relnamespace WHERE n.nspname = 'public' AND c.relname = 'vector_similarity_searches') THEN
    SELECT count(*), count(*) FILTER (WHERE created_at < NOW() - INTERVAL '90 days')
    INTO row_count, cleanup_count FROM vector_similarity_searches;
    RAISE NOTICE 'vector_similarity_searches: % total, % cleanup candidates (> 90d)', row_count, cleanup_count;
  END IF;

  -- Session events
  IF EXISTS (SELECT 1 FROM pg_class c JOIN pg_namespace n ON n.oid = c.relnamespace WHERE n.nspname = 'public' AND c.relname = 'session_events') THEN
    SELECT count(*), count(*) FILTER (WHERE created_at < NOW() - INTERVAL '30 days')
    INTO row_count, cleanup_count FROM session_events;
    RAISE NOTICE 'session_events: % total, % cleanup candidates (> 30d)', row_count, cleanup_count;
  END IF;

  -- Execution events
  IF EXISTS (SELECT 1 FROM pg_class c JOIN pg_namespace n ON n.oid = c.relnamespace WHERE n.nspname = 'public' AND c.relname = 'execution_events') THEN
    SELECT count(*), count(*) FILTER (WHERE created_at < NOW() - INTERVAL '30 days')
    INTO row_count, cleanup_count FROM execution_events;
    RAISE NOTICE 'execution_events: % total, % cleanup candidates (> 30d)', row_count, cleanup_count;
  END IF;

  -- Performance metrics
  IF EXISTS (SELECT 1 FROM pg_class c JOIN pg_namespace n ON n.oid = c.relnamespace WHERE n.nspname = 'public' AND c.relname = 'performance_metrics') THEN
    SELECT count(*), count(*) FILTER (WHERE created_at < NOW() - INTERVAL '90 days')
    INTO row_count, cleanup_count FROM performance_metrics;
    RAISE NOTICE 'performance_metrics: % total, % cleanup candidates (> 90d)', row_count, cleanup_count;
  END IF;

  -- Service health checks
  IF EXISTS (SELECT 1 FROM pg_class c JOIN pg_namespace n ON n.oid = c.relnamespace WHERE n.nspname = 'public' AND c.relname = 'service_health_checks') THEN
    SELECT count(*), count(*) FILTER (WHERE created_at < NOW() - INTERVAL '30 days')
    INTO row_count, cleanup_count FROM service_health_checks;
    RAISE NOTICE 'service_health_checks: % total, % cleanup candidates (> 30d)', row_count, cleanup_count;
  END IF;

  -- Alert events
  IF EXISTS (SELECT 1 FROM pg_class c JOIN pg_namespace n ON n.oid = c.relnamespace WHERE n.nspname = 'public' AND c.relname = 'alert_events') THEN
    SELECT count(*), count(*) FILTER (WHERE created_at < NOW() - INTERVAL '30 days')
    INTO row_count, cleanup_count FROM alert_events;
    RAISE NOTICE 'alert_events: % total, % cleanup candidates (> 30d)', row_count, cleanup_count;
  END IF;

  -- Agent executions
  IF EXISTS (SELECT 1 FROM pg_class c JOIN pg_namespace n ON n.oid = c.relnamespace WHERE n.nspname = 'public' AND c.relname = 'agent_executions') THEN
    SELECT count(*), count(*) FILTER (WHERE created_at < NOW() - INTERVAL '90 days')
    INTO row_count, cleanup_count FROM agent_executions;
    RAISE NOTICE 'agent_executions: % total, % cleanup candidates (> 90d)', row_count, cleanup_count;
  END IF;

  -- Agentic RAG sessions
  IF EXISTS (SELECT 1 FROM pg_class c JOIN pg_namespace n ON n.oid = c.relnamespace WHERE n.nspname = 'public' AND c.relname = 'agentic_rag_sessions') THEN
    SELECT count(*), count(*) FILTER (WHERE created_at < NOW() - INTERVAL '90 days')
    INTO row_count, cleanup_count FROM agentic_rag_sessions;
    RAISE NOTICE 'agentic_rag_sessions: % total, % cleanup candidates (> 90d)', row_count, cleanup_count;
  END IF;

  -- Processing quality metrics
  IF EXISTS (SELECT 1 FROM pg_class c JOIN pg_namespace n ON n.oid = c.relnamespace WHERE n.nspname = 'public' AND c.relname = 'processing_quality_metrics') THEN
    SELECT count(*), count(*) FILTER (WHERE created_at < NOW() - INTERVAL '90 days')
    INTO row_count, cleanup_count FROM processing_quality_metrics;
    RAISE NOTICE 'processing_quality_metrics: % total, % cleanup candidates (> 90d)', row_count, cleanup_count;
  END IF;

  -- Documents extracted_text
  IF EXISTS (SELECT 1 FROM pg_class c JOIN pg_namespace n ON n.oid = c.relnamespace WHERE n.nspname = 'public' AND c.relname = 'documents') THEN
    SELECT count(*), count(*) FILTER (WHERE status = 'completed' AND analysis_data IS NOT NULL AND extracted_text IS NOT NULL AND created_at < NOW() - INTERVAL '30 days')
    INTO row_count, cleanup_count FROM documents;
    RAISE NOTICE 'documents (extracted_text nullable): % total, % cleanup candidates (completed > 30d with analysis_data)', row_count, cleanup_count;
  END IF;

  RAISE NOTICE '--- END CLEANUP BREAKDOWN ---';
END $$;
backend/sql/cleanup_old_data.sql (new file, 102 lines)
@@ -0,0 +1,102 @@
-- ============================================================
-- CLEANUP OLD DATA - Run in Supabase SQL Editor
-- ============================================================
-- Removes stale data that accumulates over time without
-- impacting application functionality.
--
-- SAFE TO RUN: All deleted data is either intermediate
-- processing artifacts or analytics logs. Core document
-- data (documents, document_chunks, analysis_data) is
-- never touched by DELETE statements.
--
-- Skips tables that don't exist yet (safe for any state).
--
-- RECOMMENDATION: Run the check_table_sizes.sql query first
-- to see how much data will be affected.
-- ============================================================

DO $$
DECLARE
  deleted bigint;
BEGIN
  -- 1. Processing jobs: completed/failed older than 30 days
  IF EXISTS (SELECT 1 FROM pg_class c JOIN pg_namespace n ON n.oid = c.relnamespace WHERE n.nspname = 'public' AND c.relname = 'processing_jobs') THEN
    DELETE FROM processing_jobs WHERE status IN ('completed', 'failed') AND completed_at < NOW() - INTERVAL '30 days';
    GET DIAGNOSTICS deleted = ROW_COUNT;
    RAISE NOTICE 'processing_jobs: deleted % rows', deleted;
  END IF;

  -- 2. Execution events: older than 30 days
  IF EXISTS (SELECT 1 FROM pg_class c JOIN pg_namespace n ON n.oid = c.relnamespace WHERE n.nspname = 'public' AND c.relname = 'execution_events') THEN
    DELETE FROM execution_events WHERE created_at < NOW() - INTERVAL '30 days';
    GET DIAGNOSTICS deleted = ROW_COUNT;
    RAISE NOTICE 'execution_events: deleted % rows', deleted;
  END IF;

  -- 3. Session events: older than 30 days
  IF EXISTS (SELECT 1 FROM pg_class c JOIN pg_namespace n ON n.oid = c.relnamespace WHERE n.nspname = 'public' AND c.relname = 'session_events') THEN
    DELETE FROM session_events WHERE created_at < NOW() - INTERVAL '30 days';
    GET DIAGNOSTICS deleted = ROW_COUNT;
    RAISE NOTICE 'session_events: deleted % rows', deleted;
  END IF;

  -- 4. Performance metrics: older than 90 days
  IF EXISTS (SELECT 1 FROM pg_class c JOIN pg_namespace n ON n.oid = c.relnamespace WHERE n.nspname = 'public' AND c.relname = 'performance_metrics') THEN
    DELETE FROM performance_metrics WHERE created_at < NOW() - INTERVAL '90 days';
    GET DIAGNOSTICS deleted = ROW_COUNT;
    RAISE NOTICE 'performance_metrics: deleted % rows', deleted;
  END IF;

  -- 5. Vector similarity searches: older than 90 days
  IF EXISTS (SELECT 1 FROM pg_class c JOIN pg_namespace n ON n.oid = c.relnamespace WHERE n.nspname = 'public' AND c.relname = 'vector_similarity_searches') THEN
    DELETE FROM vector_similarity_searches WHERE created_at < NOW() - INTERVAL '90 days';
    GET DIAGNOSTICS deleted = ROW_COUNT;
    RAISE NOTICE 'vector_similarity_searches: deleted % rows', deleted;
  END IF;

  -- 6. Service health checks: older than 30 days (INFR-01)
  IF EXISTS (SELECT 1 FROM pg_class c JOIN pg_namespace n ON n.oid = c.relnamespace WHERE n.nspname = 'public' AND c.relname = 'service_health_checks') THEN
    DELETE FROM service_health_checks WHERE created_at < NOW() - INTERVAL '30 days';
    GET DIAGNOSTICS deleted = ROW_COUNT;
    RAISE NOTICE 'service_health_checks: deleted % rows', deleted;
  END IF;

  -- 7. Alert events: resolved older than 30 days (INFR-01)
  IF EXISTS (SELECT 1 FROM pg_class c JOIN pg_namespace n ON n.oid = c.relnamespace WHERE n.nspname = 'public' AND c.relname = 'alert_events') THEN
    DELETE FROM alert_events WHERE status = 'resolved' AND created_at < NOW() - INTERVAL '30 days';
    GET DIAGNOSTICS deleted = ROW_COUNT;
    RAISE NOTICE 'alert_events: deleted % rows', deleted;
  END IF;

  -- 8. Agent executions: older than 90 days
  IF EXISTS (SELECT 1 FROM pg_class c JOIN pg_namespace n ON n.oid = c.relnamespace WHERE n.nspname = 'public' AND c.relname = 'agent_executions') THEN
    DELETE FROM agent_executions WHERE created_at < NOW() - INTERVAL '90 days';
    GET DIAGNOSTICS deleted = ROW_COUNT;
    RAISE NOTICE 'agent_executions: deleted % rows', deleted;
  END IF;

  -- 9. Processing quality metrics: older than 90 days
  IF EXISTS (SELECT 1 FROM pg_class c JOIN pg_namespace n ON n.oid = c.relnamespace WHERE n.nspname = 'public' AND c.relname = 'processing_quality_metrics') THEN
    DELETE FROM processing_quality_metrics WHERE created_at < NOW() - INTERVAL '90 days';
    GET DIAGNOSTICS deleted = ROW_COUNT;
    RAISE NOTICE 'processing_quality_metrics: deleted % rows', deleted;
  END IF;

  -- 10. Agentic RAG sessions: completed older than 90 days
  IF EXISTS (SELECT 1 FROM pg_class c JOIN pg_namespace n ON n.oid = c.relnamespace WHERE n.nspname = 'public' AND c.relname = 'agentic_rag_sessions') THEN
    DELETE FROM agentic_rag_sessions WHERE status IN ('completed', 'failed') AND created_at < NOW() - INTERVAL '90 days';
    GET DIAGNOSTICS deleted = ROW_COUNT;
    RAISE NOTICE 'agentic_rag_sessions: deleted % rows', deleted;
  END IF;

  -- 11. Null out extracted_text for completed documents older than 30 days
  IF EXISTS (SELECT 1 FROM pg_class c JOIN pg_namespace n ON n.oid = c.relnamespace WHERE n.nspname = 'public' AND c.relname = 'documents') THEN
    UPDATE documents SET extracted_text = NULL
    WHERE status = 'completed' AND analysis_data IS NOT NULL AND extracted_text IS NOT NULL AND created_at < NOW() - INTERVAL '30 days';
    GET DIAGNOSTICS deleted = ROW_COUNT;
    RAISE NOTICE 'documents extracted_text nulled: % rows', deleted;
  END IF;

  RAISE NOTICE '--- CLEANUP COMPLETE ---';
END $$;
backend/sql/setup_pg_cron_cleanup.sql (new file, 145 lines)
@@ -0,0 +1,145 @@
-- ============================================================
-- ALTERNATIVE: PG_CRON AUTOMATED CLEANUP
-- ============================================================
-- NOTE: The primary cleanup runs as a Firebase scheduled
-- function (cleanupOldData in index.ts). This pg_cron
-- approach is an ALTERNATIVE if you prefer database-level
-- scheduling instead.
--
-- Supabase includes pg_cron. This script creates scheduled
-- jobs that automatically enforce retention policies.
--
-- PREREQUISITE: pg_cron extension must be enabled.
-- Go to Supabase Dashboard → Database → Extensions → enable pg_cron
--
-- SCHEDULE: Runs daily at 03:00 UTC (off-peak)
-- ============================================================

-- Enable the pg_cron extension (if not already enabled)
CREATE EXTENSION IF NOT EXISTS pg_cron;

-- Grant usage to postgres role (required on Supabase)
GRANT USAGE ON SCHEMA cron TO postgres;

-- ============================================================
-- Create the cleanup function
-- ============================================================
CREATE OR REPLACE FUNCTION public.cleanup_old_data()
RETURNS jsonb
LANGUAGE plpgsql
SECURITY DEFINER
AS $$
DECLARE
  result jsonb := '{}'::jsonb;
  deleted_count bigint;
BEGIN
  -- 1. Processing jobs: completed/failed older than 30 days
  DELETE FROM processing_jobs
  WHERE status IN ('completed', 'failed')
    AND completed_at < NOW() - INTERVAL '30 days';
  GET DIAGNOSTICS deleted_count = ROW_COUNT;
  result := result || jsonb_build_object('processing_jobs', deleted_count);

  -- 2. Execution events: older than 30 days
  DELETE FROM execution_events
  WHERE created_at < NOW() - INTERVAL '30 days';
  GET DIAGNOSTICS deleted_count = ROW_COUNT;
  result := result || jsonb_build_object('execution_events', deleted_count);

  -- 3. Session events: older than 30 days
  DELETE FROM session_events
  WHERE created_at < NOW() - INTERVAL '30 days';
  GET DIAGNOSTICS deleted_count = ROW_COUNT;
  result := result || jsonb_build_object('session_events', deleted_count);

  -- 4. Performance metrics: older than 90 days
  DELETE FROM performance_metrics
  WHERE created_at < NOW() - INTERVAL '90 days';
  GET DIAGNOSTICS deleted_count = ROW_COUNT;
  result := result || jsonb_build_object('performance_metrics', deleted_count);

  -- 5. Vector similarity searches: older than 90 days
  DELETE FROM vector_similarity_searches
  WHERE created_at < NOW() - INTERVAL '90 days';
  GET DIAGNOSTICS deleted_count = ROW_COUNT;
  result := result || jsonb_build_object('vector_similarity_searches', deleted_count);

  -- 6. Service health checks: older than 30 days (INFR-01)
  DELETE FROM service_health_checks
  WHERE created_at < NOW() - INTERVAL '30 days';
  GET DIAGNOSTICS deleted_count = ROW_COUNT;
  result := result || jsonb_build_object('service_health_checks', deleted_count);

  -- 7. Alert events: resolved older than 30 days (INFR-01)
  DELETE FROM alert_events
  WHERE status = 'resolved'
    AND created_at < NOW() - INTERVAL '30 days';
  GET DIAGNOSTICS deleted_count = ROW_COUNT;
  result := result || jsonb_build_object('alert_events', deleted_count);

  -- 8. Agent executions: older than 90 days
  DELETE FROM agent_executions
  WHERE created_at < NOW() - INTERVAL '90 days';
  GET DIAGNOSTICS deleted_count = ROW_COUNT;
  result := result || jsonb_build_object('agent_executions', deleted_count);

  -- 9. Processing quality metrics: older than 90 days
  DELETE FROM processing_quality_metrics
  WHERE created_at < NOW() - INTERVAL '90 days';
  GET DIAGNOSTICS deleted_count = ROW_COUNT;
  result := result || jsonb_build_object('processing_quality_metrics', deleted_count);

  -- 10. Agentic RAG sessions: completed older than 90 days
  DELETE FROM agentic_rag_sessions
  WHERE status IN ('completed', 'failed')
    AND created_at < NOW() - INTERVAL '90 days';
  GET DIAGNOSTICS deleted_count = ROW_COUNT;
  result := result || jsonb_build_object('agentic_rag_sessions', deleted_count);

  -- 11. Null out extracted_text for completed documents older than 30 days
  UPDATE documents
  SET extracted_text = NULL
  WHERE status = 'completed'
    AND analysis_data IS NOT NULL
    AND extracted_text IS NOT NULL
    AND created_at < NOW() - INTERVAL '30 days';
  GET DIAGNOSTICS deleted_count = ROW_COUNT;
  result := result || jsonb_build_object('documents_text_nulled', deleted_count);

  RETURN result;
END;
$$;

-- ============================================================
-- Schedule the cron job: daily at 03:00 UTC
-- ============================================================
SELECT cron.schedule(
  'daily-cleanup-old-data',             -- job name
  '0 3 * * *',                          -- cron expression: 3 AM UTC daily
  $$SELECT public.cleanup_old_data()$$
);

-- ============================================================
-- Verify the job was created
-- ============================================================
SELECT * FROM cron.job WHERE jobname = 'daily-cleanup-old-data';

-- ============================================================
-- MANAGEMENT COMMANDS (for reference)
-- ============================================================

-- View all scheduled jobs:
-- SELECT * FROM cron.job;

-- View recent job runs and results:
-- SELECT * FROM cron.job_run_details ORDER BY start_time DESC LIMIT 20;

-- Run cleanup manually (to test):
-- SELECT public.cleanup_old_data();

-- Unschedule the job:
-- SELECT cron.unschedule('daily-cleanup-old-data');

-- Change schedule to weekly (Sundays at 3 AM):
-- SELECT cron.unschedule('daily-cleanup-old-data');
-- SELECT cron.schedule('weekly-cleanup-old-data', '0 3 * * 0', $$SELECT public.cleanup_old_data()$$);
backend/src/index.ts
@@ -334,4 +334,113 @@ export const processDocumentJobs = onSchedule({
    // Re-throw to trigger retry mechanism (up to retryCount times)
    throw error;
  }
});

// Scheduled function to clean up old database records
// Runs daily at 3 AM UTC to enforce retention policies
export const cleanupOldData = onSchedule({
  schedule: 'every day 03:00',
  timeZone: 'UTC',
  timeoutSeconds: 300, // 5 minutes max
  memory: '512MiB',
  retryCount: 1,
  secrets: [
    databaseUrl,
    supabaseServiceKey,
    supabaseAnonKey,
  ],
}, async (event) => {
  logger.info('Database cleanup scheduled function triggered', {
    timestamp: new Date().toISOString(),
    scheduleTime: event.scheduleTime,
  });

  try {
    const { getPostgresPool } = await import('./config/supabase');
    const pool = getPostgresPool();

    const results: Record<string, number> = {};

    // Helper: run cleanup query only if the table exists
    const safeCleanup = async (table: string, query: string): Promise<number> => {
      const exists = await pool.query(
        `SELECT 1 FROM pg_class c JOIN pg_namespace n ON n.oid = c.relnamespace WHERE n.nspname = 'public' AND c.relname = $1`,
        [table]
      );
      if (exists.rowCount === 0) return 0;
      const result = await pool.query(query);
      return result.rowCount ?? 0;
    };

    // 1. Processing jobs: completed/failed older than 30 days
    results.processing_jobs = await safeCleanup('processing_jobs',
      `DELETE FROM processing_jobs WHERE status IN ('completed', 'failed') AND completed_at < NOW() - INTERVAL '30 days'`
    );

    // 2. Execution events: older than 30 days
    results.execution_events = await safeCleanup('execution_events',
      `DELETE FROM execution_events WHERE created_at < NOW() - INTERVAL '30 days'`
    );

    // 3. Session events: older than 30 days
    results.session_events = await safeCleanup('session_events',
      `DELETE FROM session_events WHERE created_at < NOW() - INTERVAL '30 days'`
    );

    // 4. Performance metrics: older than 90 days
    results.performance_metrics = await safeCleanup('performance_metrics',
      `DELETE FROM performance_metrics WHERE created_at < NOW() - INTERVAL '90 days'`
    );

    // 5. Vector similarity searches: older than 90 days
    results.vector_similarity_searches = await safeCleanup('vector_similarity_searches',
      `DELETE FROM vector_similarity_searches WHERE created_at < NOW() - INTERVAL '90 days'`
    );

    // 6. Service health checks: older than 30 days (INFR-01)
    results.service_health_checks = await safeCleanup('service_health_checks',
      `DELETE FROM service_health_checks WHERE created_at < NOW() - INTERVAL '30 days'`
    );

    // 7. Alert events: resolved older than 30 days (INFR-01)
    results.alert_events = await safeCleanup('alert_events',
      `DELETE FROM alert_events WHERE status = 'resolved' AND created_at < NOW() - INTERVAL '30 days'`
    );

    // 8. Agent executions: older than 90 days
    results.agent_executions = await safeCleanup('agent_executions',
      `DELETE FROM agent_executions WHERE created_at < NOW() - INTERVAL '90 days'`
    );

    // 9. Processing quality metrics: older than 90 days
    results.processing_quality_metrics = await safeCleanup('processing_quality_metrics',
      `DELETE FROM processing_quality_metrics WHERE created_at < NOW() - INTERVAL '90 days'`
    );

    // 10. Agentic RAG sessions: completed older than 90 days
    results.agentic_rag_sessions = await safeCleanup('agentic_rag_sessions',
      `DELETE FROM agentic_rag_sessions WHERE status IN ('completed', 'failed') AND created_at < NOW() - INTERVAL '90 days'`
    );

    // 11. Null out extracted_text for completed documents older than 30 days
    results.documents_text_nulled = await safeCleanup('documents',
      `UPDATE documents SET extracted_text = NULL WHERE status = 'completed' AND analysis_data IS NOT NULL AND extracted_text IS NOT NULL AND created_at < NOW() - INTERVAL '30 days'`
    );

    const totalDeleted = Object.values(results).reduce((sum, count) => sum + count, 0);

    logger.info('Database cleanup completed', {
      totalDeleted,
      details: results,
      timestamp: new Date().toISOString(),
    });
  } catch (error) {
    const errorMessage = error instanceof Error ? error.message : String(error);
    logger.error('Database cleanup failed', {
      error: errorMessage,
      stack: error instanceof Error ? error.stack : undefined,
      timestamp: new Date().toISOString(),
    });
    throw error;
  }
});
backend/src/services/alertService.ts (new file, 146 lines)
@@ -0,0 +1,146 @@
import nodemailer from 'nodemailer';
import { AlertEventModel } from '../models/AlertEventModel';
import { logger } from '../utils/logger';
import { ProbeResult } from './healthProbeService';

// =============================================================================
// Constants
// =============================================================================

const ALERT_COOLDOWN_MINUTES = parseInt(process.env['ALERT_COOLDOWN_MINUTES'] ?? '60', 10);

// =============================================================================
// Private helpers
// =============================================================================

/**
 * Create a nodemailer transporter using SMTP config from process.env.
 * Created INSIDE function scope — NOT at module level — because Firebase Secrets
 * are not available at module load time (PITFALL A).
 */
function createTransporter(): nodemailer.Transporter {
  return nodemailer.createTransport({
    host: process.env['EMAIL_HOST'] ?? 'smtp.gmail.com',
    port: parseInt(process.env['EMAIL_PORT'] ?? '587', 10),
    secure: process.env['EMAIL_SECURE'] === 'true',
    auth: {
      user: process.env['EMAIL_USER'],
      pass: process.env['EMAIL_PASS'],
    },
  });
}

/**
 * Send an alert email to the configured recipient.
 * Reads recipient from process.env.EMAIL_WEEKLY_RECIPIENT — NEVER hardcoded (ALRT-04).
 * Email failures are caught and logged; they do NOT throw (must not break probe pipeline).
 */
async function sendAlertEmail(
  serviceName: string,
  alertType: string,
  message: string
): Promise<void> {
  const recipient = process.env['EMAIL_WEEKLY_RECIPIENT'];

  if (!recipient) {
    logger.warn('alertService.sendAlertEmail: no EMAIL_WEEKLY_RECIPIENT configured — skipping email', {
      serviceName,
      alertType,
    });
    return;
  }

  try {
    const transporter = createTransporter();
    const subject = `[CIM Summary] Alert: ${serviceName} \u2014 ${alertType}`;
    const text = `Service: ${serviceName}\nAlert Type: ${alertType}\n\nDetails:\n${message}`;
    const html = `
      <h2>CIM Summary Alert</h2>
      <p><strong>Service:</strong> ${serviceName}</p>
      <p><strong>Alert Type:</strong> ${alertType}</p>
      <h3>Details</h3>
      <pre>${message}</pre>
    `;

    await transporter.sendMail({
      from: process.env['EMAIL_FROM'] ?? process.env['EMAIL_USER'],
      to: recipient,
      subject,
      text,
      html,
    });

    logger.info('alertService.sendAlertEmail: alert email sent', {
      serviceName,
      alertType,
      recipient,
    });
  } catch (err) {
    logger.error('alertService.sendAlertEmail: failed to send alert email', {
      error: err instanceof Error ? err.message : String(err),
      serviceName,
      alertType,
      recipient,
    });
    // Do NOT re-throw — email failure must not break the probe pipeline
  }
}

// =============================================================================
// Exported service
// =============================================================================

/**
 * Evaluate probe results and send alerts for degraded or down services.
 * Implements deduplication: if an alert of the same type was sent within the
 * cooldown window, suppresses both row creation and email (prevents alert storms).
 *
 * For each failing probe:
 * 1. Map status to alert_type ('down' -> 'service_down', 'degraded' -> 'service_degraded')
 * 2. Check AlertEventModel.findRecentByService — if within cooldown, suppress
 * 3. Otherwise: create alert_events row, then send email
 */
async function evaluateAndAlert(probeResults: ProbeResult[]): Promise<void> {
  for (const probe of probeResults) {
    if (probe.status !== 'degraded' && probe.status !== 'down') {
      continue; // Healthy probes — no action needed
    }

    const alertType: 'service_down' | 'service_degraded' =
      probe.status === 'down' ? 'service_down' : 'service_degraded';

    // Deduplication check — suppress if already alerted within cooldown window
    const recentAlert = await AlertEventModel.findRecentByService(
      probe.service_name,
      alertType,
      ALERT_COOLDOWN_MINUTES
    );

    if (recentAlert !== null) {
      logger.info('alertService.evaluateAndAlert: suppress — alert within cooldown window', {
        serviceName: probe.service_name,
        alertType,
        cooldownMinutes: ALERT_COOLDOWN_MINUTES,
        lastAlertId: recentAlert.id,
        lastAlertAt: recentAlert.created_at,
      });
      continue;
    }

    // No recent alert — create the alert_events row first
    const message =
      probe.error_message ??
      `Service ${probe.service_name} reported status: ${probe.status}`;

    await AlertEventModel.create({
      service_name: probe.service_name,
      alert_type: alertType,
      message,
    });

    // Then send the email notification
    await sendAlertEmail(probe.service_name, alertType, message);
  }
}

export const alertService = { evaluateAndAlert };
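A usage sketch for orientation (the caller below is hypothetical and not part of this commit; only ProbeResult and alertService come from the code above, and the probe-producing function is an assumption):

// Hypothetical caller: a scheduled health check feeding probe results into the
// alert service. evaluateAndAlert never throws on email failure, so a single
// await is safe here; dedup/cooldown is handled inside the service.
import { alertService } from './services/alertService';
import { ProbeResult } from './services/healthProbeService';

async function runHealthCheckCycle(probes: () => Promise<ProbeResult[]>): Promise<void> {
  const results = await probes(); // e.g. run all configured service probes
  await alertService.evaluateAndAlert(results);
}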