feat(02-03): create alertService with deduplication and email

- evaluateAndAlert() iterates ProbeResults and skips healthy probes
- Maps 'down' -> 'service_down', 'degraded' -> 'service_degraded'
- Deduplication via AlertEventModel.findRecentByService with configurable cooldown
- Creates alert_events row before sending email (suppression skips both)
- Recipient read from process.env.EMAIL_WEEKLY_RECIPIENT (never hardcoded)
- createTransporter() called inside function scope (Firebase Secret timing fix)
- Email failures caught and logged, never re-thrown
This commit is contained in:
admin
2026-02-24 14:28:20 -05:00
parent 520b6b1fe2
commit 91f609cf92
6 changed files with 613 additions and 2 deletions

View File

@@ -0,0 +1,109 @@
-- ============================================================
-- CHECK TABLE SIZES - Run in Supabase SQL Editor
-- ============================================================
-- Part 1: Shows all public tables with sizes (auto-discovers)
-- Part 2: Cleanup candidate counts (only for tables that exist)
-- ============================================================
-- PART 1: All public table sizes
SELECT
c.relname AS table_name,
pg_size_pretty(pg_total_relation_size(c.oid)) AS total_size,
pg_size_pretty(pg_relation_size(c.oid)) AS data_size,
pg_size_pretty(pg_total_relation_size(c.oid) - pg_relation_size(c.oid)) AS index_size,
c.reltuples::bigint AS estimated_rows
FROM pg_class c
JOIN pg_namespace n ON n.oid = c.relnamespace
WHERE n.nspname = 'public'
AND c.relkind = 'r'
ORDER BY pg_total_relation_size(c.oid) DESC;
-- PART 2: Cleanup candidates (safe — checks table existence before querying)
DO $$
DECLARE
rec RECORD;
row_count bigint;
cleanup_count bigint;
query text;
BEGIN
RAISE NOTICE '--- CLEANUP CANDIDATE BREAKDOWN ---';
-- Processing jobs
IF EXISTS (SELECT 1 FROM pg_class c JOIN pg_namespace n ON n.oid = c.relnamespace WHERE n.nspname = 'public' AND c.relname = 'processing_jobs') THEN
SELECT count(*), count(*) FILTER (WHERE status IN ('completed', 'failed') AND completed_at < NOW() - INTERVAL '30 days')
INTO row_count, cleanup_count FROM processing_jobs;
RAISE NOTICE 'processing_jobs: % total, % cleanup candidates (completed/failed > 30d)', row_count, cleanup_count;
END IF;
-- Vector similarity searches
IF EXISTS (SELECT 1 FROM pg_class c JOIN pg_namespace n ON n.oid = c.relnamespace WHERE n.nspname = 'public' AND c.relname = 'vector_similarity_searches') THEN
SELECT count(*), count(*) FILTER (WHERE created_at < NOW() - INTERVAL '90 days')
INTO row_count, cleanup_count FROM vector_similarity_searches;
RAISE NOTICE 'vector_similarity_searches: % total, % cleanup candidates (> 90d)', row_count, cleanup_count;
END IF;
-- Session events
IF EXISTS (SELECT 1 FROM pg_class c JOIN pg_namespace n ON n.oid = c.relnamespace WHERE n.nspname = 'public' AND c.relname = 'session_events') THEN
SELECT count(*), count(*) FILTER (WHERE created_at < NOW() - INTERVAL '30 days')
INTO row_count, cleanup_count FROM session_events;
RAISE NOTICE 'session_events: % total, % cleanup candidates (> 30d)', row_count, cleanup_count;
END IF;
-- Execution events
IF EXISTS (SELECT 1 FROM pg_class c JOIN pg_namespace n ON n.oid = c.relnamespace WHERE n.nspname = 'public' AND c.relname = 'execution_events') THEN
SELECT count(*), count(*) FILTER (WHERE created_at < NOW() - INTERVAL '30 days')
INTO row_count, cleanup_count FROM execution_events;
RAISE NOTICE 'execution_events: % total, % cleanup candidates (> 30d)', row_count, cleanup_count;
END IF;
-- Performance metrics
IF EXISTS (SELECT 1 FROM pg_class c JOIN pg_namespace n ON n.oid = c.relnamespace WHERE n.nspname = 'public' AND c.relname = 'performance_metrics') THEN
SELECT count(*), count(*) FILTER (WHERE created_at < NOW() - INTERVAL '90 days')
INTO row_count, cleanup_count FROM performance_metrics;
RAISE NOTICE 'performance_metrics: % total, % cleanup candidates (> 90d)', row_count, cleanup_count;
END IF;
-- Service health checks
IF EXISTS (SELECT 1 FROM pg_class c JOIN pg_namespace n ON n.oid = c.relnamespace WHERE n.nspname = 'public' AND c.relname = 'service_health_checks') THEN
SELECT count(*), count(*) FILTER (WHERE created_at < NOW() - INTERVAL '30 days')
INTO row_count, cleanup_count FROM service_health_checks;
RAISE NOTICE 'service_health_checks: % total, % cleanup candidates (> 30d)', row_count, cleanup_count;
END IF;
-- Alert events
IF EXISTS (SELECT 1 FROM pg_class c JOIN pg_namespace n ON n.oid = c.relnamespace WHERE n.nspname = 'public' AND c.relname = 'alert_events') THEN
SELECT count(*), count(*) FILTER (WHERE created_at < NOW() - INTERVAL '30 days')
INTO row_count, cleanup_count FROM alert_events;
RAISE NOTICE 'alert_events: % total, % cleanup candidates (> 30d)', row_count, cleanup_count;
END IF;
-- Agent executions
IF EXISTS (SELECT 1 FROM pg_class c JOIN pg_namespace n ON n.oid = c.relnamespace WHERE n.nspname = 'public' AND c.relname = 'agent_executions') THEN
SELECT count(*), count(*) FILTER (WHERE created_at < NOW() - INTERVAL '90 days')
INTO row_count, cleanup_count FROM agent_executions;
RAISE NOTICE 'agent_executions: % total, % cleanup candidates (> 90d)', row_count, cleanup_count;
END IF;
-- Agentic RAG sessions
IF EXISTS (SELECT 1 FROM pg_class c JOIN pg_namespace n ON n.oid = c.relnamespace WHERE n.nspname = 'public' AND c.relname = 'agentic_rag_sessions') THEN
SELECT count(*), count(*) FILTER (WHERE created_at < NOW() - INTERVAL '90 days')
INTO row_count, cleanup_count FROM agentic_rag_sessions;
RAISE NOTICE 'agentic_rag_sessions: % total, % cleanup candidates (> 90d)', row_count, cleanup_count;
END IF;
-- Processing quality metrics
IF EXISTS (SELECT 1 FROM pg_class c JOIN pg_namespace n ON n.oid = c.relnamespace WHERE n.nspname = 'public' AND c.relname = 'processing_quality_metrics') THEN
SELECT count(*), count(*) FILTER (WHERE created_at < NOW() - INTERVAL '90 days')
INTO row_count, cleanup_count FROM processing_quality_metrics;
RAISE NOTICE 'processing_quality_metrics: % total, % cleanup candidates (> 90d)', row_count, cleanup_count;
END IF;
-- Documents extracted_text
IF EXISTS (SELECT 1 FROM pg_class c JOIN pg_namespace n ON n.oid = c.relnamespace WHERE n.nspname = 'public' AND c.relname = 'documents') THEN
SELECT count(*), count(*) FILTER (WHERE status = 'completed' AND analysis_data IS NOT NULL AND extracted_text IS NOT NULL AND created_at < NOW() - INTERVAL '30 days')
INTO row_count, cleanup_count FROM documents;
RAISE NOTICE 'documents (extracted_text nullable): % total, % cleanup candidates (completed > 30d with analysis_data)', row_count, cleanup_count;
END IF;
RAISE NOTICE '--- END CLEANUP BREAKDOWN ---';
END $$;

View File

@@ -0,0 +1,102 @@
-- ============================================================
-- CLEANUP OLD DATA - Run in Supabase SQL Editor
-- ============================================================
-- Removes stale data that accumulates over time without
-- impacting application functionality.
--
-- SAFE TO RUN: All deleted data is either intermediate
-- processing artifacts or analytics logs. Core document
-- data (documents, document_chunks, analysis_data) is
-- never touched by DELETE statements.
--
-- Skips tables that don't exist yet (safe for any state).
--
-- RECOMMENDATION: Run the check_table_sizes.sql query first
-- to see how much data will be affected.
-- ============================================================
DO $$
DECLARE
deleted bigint;
BEGIN
-- 1. Processing jobs: completed/failed older than 30 days
IF EXISTS (SELECT 1 FROM pg_class c JOIN pg_namespace n ON n.oid = c.relnamespace WHERE n.nspname = 'public' AND c.relname = 'processing_jobs') THEN
DELETE FROM processing_jobs WHERE status IN ('completed', 'failed') AND completed_at < NOW() - INTERVAL '30 days';
GET DIAGNOSTICS deleted = ROW_COUNT;
RAISE NOTICE 'processing_jobs: deleted % rows', deleted;
END IF;
-- 2. Execution events: older than 30 days
IF EXISTS (SELECT 1 FROM pg_class c JOIN pg_namespace n ON n.oid = c.relnamespace WHERE n.nspname = 'public' AND c.relname = 'execution_events') THEN
DELETE FROM execution_events WHERE created_at < NOW() - INTERVAL '30 days';
GET DIAGNOSTICS deleted = ROW_COUNT;
RAISE NOTICE 'execution_events: deleted % rows', deleted;
END IF;
-- 3. Session events: older than 30 days
IF EXISTS (SELECT 1 FROM pg_class c JOIN pg_namespace n ON n.oid = c.relnamespace WHERE n.nspname = 'public' AND c.relname = 'session_events') THEN
DELETE FROM session_events WHERE created_at < NOW() - INTERVAL '30 days';
GET DIAGNOSTICS deleted = ROW_COUNT;
RAISE NOTICE 'session_events: deleted % rows', deleted;
END IF;
-- 4. Performance metrics: older than 90 days
IF EXISTS (SELECT 1 FROM pg_class c JOIN pg_namespace n ON n.oid = c.relnamespace WHERE n.nspname = 'public' AND c.relname = 'performance_metrics') THEN
DELETE FROM performance_metrics WHERE created_at < NOW() - INTERVAL '90 days';
GET DIAGNOSTICS deleted = ROW_COUNT;
RAISE NOTICE 'performance_metrics: deleted % rows', deleted;
END IF;
-- 5. Vector similarity searches: older than 90 days
IF EXISTS (SELECT 1 FROM pg_class c JOIN pg_namespace n ON n.oid = c.relnamespace WHERE n.nspname = 'public' AND c.relname = 'vector_similarity_searches') THEN
DELETE FROM vector_similarity_searches WHERE created_at < NOW() - INTERVAL '90 days';
GET DIAGNOSTICS deleted = ROW_COUNT;
RAISE NOTICE 'vector_similarity_searches: deleted % rows', deleted;
END IF;
-- 6. Service health checks: older than 30 days (INFR-01)
IF EXISTS (SELECT 1 FROM pg_class c JOIN pg_namespace n ON n.oid = c.relnamespace WHERE n.nspname = 'public' AND c.relname = 'service_health_checks') THEN
DELETE FROM service_health_checks WHERE created_at < NOW() - INTERVAL '30 days';
GET DIAGNOSTICS deleted = ROW_COUNT;
RAISE NOTICE 'service_health_checks: deleted % rows', deleted;
END IF;
-- 7. Alert events: resolved older than 30 days (INFR-01)
IF EXISTS (SELECT 1 FROM pg_class c JOIN pg_namespace n ON n.oid = c.relnamespace WHERE n.nspname = 'public' AND c.relname = 'alert_events') THEN
DELETE FROM alert_events WHERE status = 'resolved' AND created_at < NOW() - INTERVAL '30 days';
GET DIAGNOSTICS deleted = ROW_COUNT;
RAISE NOTICE 'alert_events: deleted % rows', deleted;
END IF;
-- 8. Agent executions: older than 90 days
IF EXISTS (SELECT 1 FROM pg_class c JOIN pg_namespace n ON n.oid = c.relnamespace WHERE n.nspname = 'public' AND c.relname = 'agent_executions') THEN
DELETE FROM agent_executions WHERE created_at < NOW() - INTERVAL '90 days';
GET DIAGNOSTICS deleted = ROW_COUNT;
RAISE NOTICE 'agent_executions: deleted % rows', deleted;
END IF;
-- 9. Processing quality metrics: older than 90 days
IF EXISTS (SELECT 1 FROM pg_class c JOIN pg_namespace n ON n.oid = c.relnamespace WHERE n.nspname = 'public' AND c.relname = 'processing_quality_metrics') THEN
DELETE FROM processing_quality_metrics WHERE created_at < NOW() - INTERVAL '90 days';
GET DIAGNOSTICS deleted = ROW_COUNT;
RAISE NOTICE 'processing_quality_metrics: deleted % rows', deleted;
END IF;
-- 10. Agentic RAG sessions: completed older than 90 days
IF EXISTS (SELECT 1 FROM pg_class c JOIN pg_namespace n ON n.oid = c.relnamespace WHERE n.nspname = 'public' AND c.relname = 'agentic_rag_sessions') THEN
DELETE FROM agentic_rag_sessions WHERE status IN ('completed', 'failed') AND created_at < NOW() - INTERVAL '90 days';
GET DIAGNOSTICS deleted = ROW_COUNT;
RAISE NOTICE 'agentic_rag_sessions: deleted % rows', deleted;
END IF;
-- 11. Null out extracted_text for completed documents older than 30 days
IF EXISTS (SELECT 1 FROM pg_class c JOIN pg_namespace n ON n.oid = c.relnamespace WHERE n.nspname = 'public' AND c.relname = 'documents') THEN
UPDATE documents SET extracted_text = NULL
WHERE status = 'completed' AND analysis_data IS NOT NULL AND extracted_text IS NOT NULL AND created_at < NOW() - INTERVAL '30 days';
GET DIAGNOSTICS deleted = ROW_COUNT;
RAISE NOTICE 'documents extracted_text nulled: % rows', deleted;
END IF;
RAISE NOTICE '--- CLEANUP COMPLETE ---';
END $$;

View File

@@ -0,0 +1,145 @@
-- ============================================================
-- ALTERNATIVE: PG_CRON AUTOMATED CLEANUP
-- ============================================================
-- NOTE: The primary cleanup runs as a Firebase scheduled
-- function (cleanupOldData in index.ts). This pg_cron
-- approach is an ALTERNATIVE if you prefer database-level
-- scheduling instead.
--
-- Supabase includes pg_cron. This script creates scheduled
-- jobs that automatically enforce retention policies.
--
-- PREREQUISITE: pg_cron extension must be enabled.
-- Go to Supabase Dashboard → Database → Extensions → enable pg_cron
--
-- SCHEDULE: Runs daily at 03:00 UTC (off-peak)
-- ============================================================
-- Enable the pg_cron extension (if not already enabled)
CREATE EXTENSION IF NOT EXISTS pg_cron;
-- Grant usage to postgres role (required on Supabase)
GRANT USAGE ON SCHEMA cron TO postgres;
-- ============================================================
-- Create the cleanup function
-- ============================================================
CREATE OR REPLACE FUNCTION public.cleanup_old_data()
RETURNS jsonb
LANGUAGE plpgsql
SECURITY DEFINER
AS $$
DECLARE
result jsonb := '{}'::jsonb;
deleted_count bigint;
BEGIN
-- 1. Processing jobs: completed/failed older than 30 days
DELETE FROM processing_jobs
WHERE status IN ('completed', 'failed')
AND completed_at < NOW() - INTERVAL '30 days';
GET DIAGNOSTICS deleted_count = ROW_COUNT;
result := result || jsonb_build_object('processing_jobs', deleted_count);
-- 2. Execution events: older than 30 days
DELETE FROM execution_events
WHERE created_at < NOW() - INTERVAL '30 days';
GET DIAGNOSTICS deleted_count = ROW_COUNT;
result := result || jsonb_build_object('execution_events', deleted_count);
-- 3. Session events: older than 30 days
DELETE FROM session_events
WHERE created_at < NOW() - INTERVAL '30 days';
GET DIAGNOSTICS deleted_count = ROW_COUNT;
result := result || jsonb_build_object('session_events', deleted_count);
-- 4. Performance metrics: older than 90 days
DELETE FROM performance_metrics
WHERE created_at < NOW() - INTERVAL '90 days';
GET DIAGNOSTICS deleted_count = ROW_COUNT;
result := result || jsonb_build_object('performance_metrics', deleted_count);
-- 5. Vector similarity searches: older than 90 days
DELETE FROM vector_similarity_searches
WHERE created_at < NOW() - INTERVAL '90 days';
GET DIAGNOSTICS deleted_count = ROW_COUNT;
result := result || jsonb_build_object('vector_similarity_searches', deleted_count);
-- 6. Service health checks: older than 30 days (INFR-01)
DELETE FROM service_health_checks
WHERE created_at < NOW() - INTERVAL '30 days';
GET DIAGNOSTICS deleted_count = ROW_COUNT;
result := result || jsonb_build_object('service_health_checks', deleted_count);
-- 7. Alert events: resolved older than 30 days (INFR-01)
DELETE FROM alert_events
WHERE status = 'resolved'
AND created_at < NOW() - INTERVAL '30 days';
GET DIAGNOSTICS deleted_count = ROW_COUNT;
result := result || jsonb_build_object('alert_events', deleted_count);
-- 8. Agent executions: older than 90 days
DELETE FROM agent_executions
WHERE created_at < NOW() - INTERVAL '90 days';
GET DIAGNOSTICS deleted_count = ROW_COUNT;
result := result || jsonb_build_object('agent_executions', deleted_count);
-- 9. Processing quality metrics: older than 90 days
DELETE FROM processing_quality_metrics
WHERE created_at < NOW() - INTERVAL '90 days';
GET DIAGNOSTICS deleted_count = ROW_COUNT;
result := result || jsonb_build_object('processing_quality_metrics', deleted_count);
-- 10. Agentic RAG sessions: completed older than 90 days
DELETE FROM agentic_rag_sessions
WHERE status IN ('completed', 'failed')
AND created_at < NOW() - INTERVAL '90 days';
GET DIAGNOSTICS deleted_count = ROW_COUNT;
result := result || jsonb_build_object('agentic_rag_sessions', deleted_count);
-- 11. Null out extracted_text for completed documents older than 30 days
UPDATE documents
SET extracted_text = NULL
WHERE status = 'completed'
AND analysis_data IS NOT NULL
AND extracted_text IS NOT NULL
AND created_at < NOW() - INTERVAL '30 days';
GET DIAGNOSTICS deleted_count = ROW_COUNT;
result := result || jsonb_build_object('documents_text_nulled', deleted_count);
RETURN result;
END;
$$;
-- ============================================================
-- Schedule the cron job: daily at 03:00 UTC
-- ============================================================
SELECT cron.schedule(
'daily-cleanup-old-data', -- job name
'0 3 * * *', -- cron expression: 3 AM UTC daily
$$SELECT public.cleanup_old_data()$$
);
-- ============================================================
-- Verify the job was created
-- ============================================================
SELECT * FROM cron.job WHERE jobname = 'daily-cleanup-old-data';
-- ============================================================
-- MANAGEMENT COMMANDS (for reference)
-- ============================================================
-- View all scheduled jobs:
-- SELECT * FROM cron.job;
-- View recent job runs and results:
-- SELECT * FROM cron.job_run_details ORDER BY start_time DESC LIMIT 20;
-- Run cleanup manually (to test):
-- SELECT public.cleanup_old_data();
-- Unschedule the job:
-- SELECT cron.unschedule('daily-cleanup-old-data');
-- Change schedule to weekly (Sundays at 3 AM):
-- SELECT cron.unschedule('daily-cleanup-old-data');
-- SELECT cron.schedule('weekly-cleanup-old-data', '0 3 * * 0', $$SELECT public.cleanup_old_data()$$);

View File

@@ -334,4 +334,113 @@ export const processDocumentJobs = onSchedule({
// Re-throw to trigger retry mechanism (up to retryCount times)
throw error;
}
});
// Scheduled function to clean up old database records
// Runs daily at 3 AM UTC to enforce retention policies
export const cleanupOldData = onSchedule({
schedule: 'every day 03:00',
timeZone: 'UTC',
timeoutSeconds: 300, // 5 minutes max
memory: '512MiB',
retryCount: 1,
secrets: [
databaseUrl,
supabaseServiceKey,
supabaseAnonKey,
],
}, async (event) => {
logger.info('Database cleanup scheduled function triggered', {
timestamp: new Date().toISOString(),
scheduleTime: event.scheduleTime,
});
try {
const { getPostgresPool } = await import('./config/supabase');
const pool = getPostgresPool();
const results: Record<string, number> = {};
// Helper: run cleanup query only if the table exists
const safeCleanup = async (table: string, query: string): Promise<number> => {
const exists = await pool.query(
`SELECT 1 FROM pg_class c JOIN pg_namespace n ON n.oid = c.relnamespace WHERE n.nspname = 'public' AND c.relname = $1`,
[table]
);
if (exists.rowCount === 0) return 0;
const result = await pool.query(query);
return result.rowCount ?? 0;
};
// 1. Processing jobs: completed/failed older than 30 days
results.processing_jobs = await safeCleanup('processing_jobs',
`DELETE FROM processing_jobs WHERE status IN ('completed', 'failed') AND completed_at < NOW() - INTERVAL '30 days'`
);
// 2. Execution events: older than 30 days
results.execution_events = await safeCleanup('execution_events',
`DELETE FROM execution_events WHERE created_at < NOW() - INTERVAL '30 days'`
);
// 3. Session events: older than 30 days
results.session_events = await safeCleanup('session_events',
`DELETE FROM session_events WHERE created_at < NOW() - INTERVAL '30 days'`
);
// 4. Performance metrics: older than 90 days
results.performance_metrics = await safeCleanup('performance_metrics',
`DELETE FROM performance_metrics WHERE created_at < NOW() - INTERVAL '90 days'`
);
// 5. Vector similarity searches: older than 90 days
results.vector_similarity_searches = await safeCleanup('vector_similarity_searches',
`DELETE FROM vector_similarity_searches WHERE created_at < NOW() - INTERVAL '90 days'`
);
// 6. Service health checks: older than 30 days (INFR-01)
results.service_health_checks = await safeCleanup('service_health_checks',
`DELETE FROM service_health_checks WHERE created_at < NOW() - INTERVAL '30 days'`
);
// 7. Alert events: resolved older than 30 days (INFR-01)
results.alert_events = await safeCleanup('alert_events',
`DELETE FROM alert_events WHERE status = 'resolved' AND created_at < NOW() - INTERVAL '30 days'`
);
// 8. Agent executions: older than 90 days
results.agent_executions = await safeCleanup('agent_executions',
`DELETE FROM agent_executions WHERE created_at < NOW() - INTERVAL '90 days'`
);
// 9. Processing quality metrics: older than 90 days
results.processing_quality_metrics = await safeCleanup('processing_quality_metrics',
`DELETE FROM processing_quality_metrics WHERE created_at < NOW() - INTERVAL '90 days'`
);
// 10. Agentic RAG sessions: completed older than 90 days
results.agentic_rag_sessions = await safeCleanup('agentic_rag_sessions',
`DELETE FROM agentic_rag_sessions WHERE status IN ('completed', 'failed') AND created_at < NOW() - INTERVAL '90 days'`
);
// 11. Null out extracted_text for completed documents older than 30 days
results.documents_text_nulled = await safeCleanup('documents',
`UPDATE documents SET extracted_text = NULL WHERE status = 'completed' AND analysis_data IS NOT NULL AND extracted_text IS NOT NULL AND created_at < NOW() - INTERVAL '30 days'`
);
const totalDeleted = Object.values(results).reduce((sum, count) => sum + count, 0);
logger.info('Database cleanup completed', {
totalDeleted,
details: results,
timestamp: new Date().toISOString(),
});
} catch (error) {
const errorMessage = error instanceof Error ? error.message : String(error);
logger.error('Database cleanup failed', {
error: errorMessage,
stack: error instanceof Error ? error.stack : undefined,
timestamp: new Date().toISOString(),
});
throw error;
}
});

View File

@@ -0,0 +1,146 @@
import nodemailer from 'nodemailer';
import { AlertEventModel } from '../models/AlertEventModel';
import { logger } from '../utils/logger';
import { ProbeResult } from './healthProbeService';
// =============================================================================
// Constants
// =============================================================================
const ALERT_COOLDOWN_MINUTES = parseInt(process.env['ALERT_COOLDOWN_MINUTES'] ?? '60', 10);
// =============================================================================
// Private helpers
// =============================================================================
/**
* Create a nodemailer transporter using SMTP config from process.env.
* Created INSIDE function scope — NOT at module level — because Firebase Secrets
* are not available at module load time (PITFALL A).
*/
function createTransporter(): nodemailer.Transporter {
return nodemailer.createTransport({
host: process.env['EMAIL_HOST'] ?? 'smtp.gmail.com',
port: parseInt(process.env['EMAIL_PORT'] ?? '587', 10),
secure: process.env['EMAIL_SECURE'] === 'true',
auth: {
user: process.env['EMAIL_USER'],
pass: process.env['EMAIL_PASS'],
},
});
}
/**
* Send an alert email to the configured recipient.
* Reads recipient from process.env.EMAIL_WEEKLY_RECIPIENT — NEVER hardcoded (ALRT-04).
* Email failures are caught and logged; they do NOT throw (must not break probe pipeline).
*/
async function sendAlertEmail(
serviceName: string,
alertType: string,
message: string
): Promise<void> {
const recipient = process.env['EMAIL_WEEKLY_RECIPIENT'];
if (!recipient) {
logger.warn('alertService.sendAlertEmail: no EMAIL_WEEKLY_RECIPIENT configured — skipping email', {
serviceName,
alertType,
});
return;
}
try {
const transporter = createTransporter();
const subject = `[CIM Summary] Alert: ${serviceName} \u2014 ${alertType}`;
const text = `Service: ${serviceName}\nAlert Type: ${alertType}\n\nDetails:\n${message}`;
const html = `
<h2>CIM Summary Alert</h2>
<p><strong>Service:</strong> ${serviceName}</p>
<p><strong>Alert Type:</strong> ${alertType}</p>
<h3>Details</h3>
<pre>${message}</pre>
`;
await transporter.sendMail({
from: process.env['EMAIL_FROM'] ?? process.env['EMAIL_USER'],
to: recipient,
subject,
text,
html,
});
logger.info('alertService.sendAlertEmail: alert email sent', {
serviceName,
alertType,
recipient,
});
} catch (err) {
logger.error('alertService.sendAlertEmail: failed to send alert email', {
error: err instanceof Error ? err.message : String(err),
serviceName,
alertType,
recipient,
});
// Do NOT re-throw — email failure must not break the probe pipeline
}
}
// =============================================================================
// Exported service
// =============================================================================
/**
* Evaluate probe results and send alerts for degraded or down services.
* Implements deduplication: if an alert of the same type was sent within the
* cooldown window, suppresses both row creation and email (prevents alert storms).
*
* For each failing probe:
* 1. Map status to alert_type ('down' -> 'service_down', 'degraded' -> 'service_degraded')
* 2. Check AlertEventModel.findRecentByService — if within cooldown, suppress
* 3. Otherwise: create alert_events row, then send email
*/
async function evaluateAndAlert(probeResults: ProbeResult[]): Promise<void> {
for (const probe of probeResults) {
if (probe.status !== 'degraded' && probe.status !== 'down') {
continue; // Healthy probes — no action needed
}
const alertType: 'service_down' | 'service_degraded' =
probe.status === 'down' ? 'service_down' : 'service_degraded';
// Deduplication check — suppress if already alerted within cooldown window
const recentAlert = await AlertEventModel.findRecentByService(
probe.service_name,
alertType,
ALERT_COOLDOWN_MINUTES
);
if (recentAlert !== null) {
logger.info('alertService.evaluateAndAlert: suppress — alert within cooldown window', {
serviceName: probe.service_name,
alertType,
cooldownMinutes: ALERT_COOLDOWN_MINUTES,
lastAlertId: recentAlert.id,
lastAlertAt: recentAlert.created_at,
});
continue;
}
// No recent alert — create the alert_events row first
const message =
probe.error_message ??
`Service ${probe.service_name} reported status: ${probe.status}`;
await AlertEventModel.create({
service_name: probe.service_name,
alert_type: alertType,
message,
});
// Then send the email notification
await sendAlertEmail(probe.service_name, alertType, message);
}
}
export const alertService = { evaluateAndAlert };