Initial commit: sales analysis template

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
Jonathan Pressnell
2026-02-06 09:16:34 -05:00
commit cf0b596449
38 changed files with 8001 additions and 0 deletions

214
config_validator.py Normal file
View File

@@ -0,0 +1,214 @@
"""
Configuration validation utility.

Validates configuration settings against data to catch errors early.

Usage:
    from config_validator import validate_config

    # Validate configuration
    errors, warnings = validate_config(df)
    if errors:
        print("Configuration errors found:", errors)
"""
import pandas as pd
from pathlib import Path
from config import (
DATA_FILE, REVENUE_COLUMN, DATE_COLUMN, DATE_FALLBACK_COLUMNS,
CUSTOMER_COLUMN, ITEM_COLUMN, QUANTITY_COLUMN,
MIN_YEAR, MAX_DATE, ANALYSIS_YEARS,
LTM_ENABLED, LTM_START, LTM_END, LTM_START_YEAR, LTM_END_YEAR,
EXCLUSION_FILTERS, get_data_path
)
def validate_config(df=None):
    """
    Validate configuration against data.

    Runs ten checks that compare the values imported from ``config`` with
    the contents of ``df``: required columns, date coverage, fallback date
    columns, numeric revenue, date range, analysis years, LTM settings,
    exclusion filters, optional columns, and existence of the data file.
    Problems are collected rather than raised.

    Args:
        df: Optional DataFrame to validate against. If None, the file
            returned by ``get_data_path()`` is loaded with
            ``data_loader.load_sales_data``.

    Returns:
        tuple: (errors list, warnings list). Both are lists of
        human-readable message strings; either may be empty.

    Example:
        errors, warnings = validate_config(df)
        if errors:
            for error in errors:
                print(f"ERROR: {error}")
        if warnings:
            for warning in warnings:
                print(f"WARNING: {warning}")
    """
    errors = []
    warnings = []

    # Load data if not provided
    if df is None:
        try:
            # Imported lazily so the loader is only needed when no frame is given.
            from data_loader import load_sales_data
            data_path = get_data_path()
            if not data_path.exists():
                errors.append(f"Data file not found: {data_path}")
                return errors, warnings
            df = load_sales_data(data_path)
        except Exception as e:
            # Boundary handler: any loader failure is reported, not raised.
            errors.append(f"Could not load data for validation: {e}")
            return errors, warnings

    total_rows = len(df)
    if total_rows == 0:
        # Percentage-based checks are undefined for an empty frame; report it
        # once instead of dividing by zero below.
        errors.append("Data contains no rows; data-dependent checks cannot be evaluated")

    # 1. Validate required columns exist
    required_columns = [REVENUE_COLUMN, DATE_COLUMN]
    for col in required_columns:
        if col not in df.columns:
            errors.append(f"Required column '{col}' not found in data. Available columns: {list(df.columns)[:10]}")

    # Parse the date column once so that checks 2 and 5 count real dates and
    # can rely on Timestamp attributes even if the column holds raw strings.
    date_values = None
    if DATE_COLUMN in df.columns:
        date_values = df[DATE_COLUMN]
        if not pd.api.types.is_datetime64_any_dtype(date_values):
            try:
                date_values = pd.to_datetime(date_values, errors='coerce')
            except (TypeError, ValueError):
                errors.append(f"Date column '{DATE_COLUMN}' cannot be converted to dates")
                date_values = None

    # 2. Validate date column has valid dates
    if date_values is not None and total_rows > 0:
        date_coverage = date_values.notna().sum() / total_rows * 100
        if date_coverage < 50:
            errors.append(f"Date coverage is very low ({date_coverage:.1f}%). Check date column configuration.")
        elif date_coverage < 90:
            warnings.append(f"Date coverage is {date_coverage:.1f}%. Consider adding fallback date columns.")

    # 3. Validate fallback date columns
    if DATE_FALLBACK_COLUMNS:
        missing_fallbacks = [col for col in DATE_FALLBACK_COLUMNS if col not in df.columns]
        if missing_fallbacks:
            warnings.append(f"Fallback date columns not found: {missing_fallbacks}")

    # 4. Validate revenue column is numeric
    if REVENUE_COLUMN in df.columns and total_rows > 0:
        try:
            # Count non-null values of the *coerced* series: values that are
            # not numeric become NaN and are therefore reported as invalid.
            numeric_revenue = pd.to_numeric(df[REVENUE_COLUMN], errors='coerce')
        except (TypeError, ValueError):
            errors.append(f"Revenue column '{REVENUE_COLUMN}' cannot be converted to numeric")
        else:
            valid_revenue = int(numeric_revenue.notna().sum())
            if valid_revenue == 0:
                errors.append(f"Revenue column '{REVENUE_COLUMN}' has no valid numeric values")
            elif valid_revenue < total_rows * 0.9:
                warnings.append(f"Revenue column has {total_rows - valid_revenue} invalid values")

    # 5. Validate date range
    if date_values is not None and date_values.notna().any():
        min_date_in_data = date_values.min()
        max_date_in_data = date_values.max()
        if MIN_YEAR and min_date_in_data.year > MIN_YEAR:
            warnings.append(f"MIN_YEAR ({MIN_YEAR}) is earlier than earliest data ({min_date_in_data.year})")
        if MAX_DATE and max_date_in_data > MAX_DATE:
            warnings.append(f"MAX_DATE ({MAX_DATE.date()}) is earlier than latest data ({max_date_in_data.date()})")

    # 6. Validate analysis years
    if 'Year' in df.columns:
        available_years = sorted(df['Year'].unique())
        missing_years = [year for year in ANALYSIS_YEARS if year not in available_years]
        if missing_years:
            warnings.append(f"ANALYSIS_YEARS includes years not in data: {missing_years}")

    # 7. Validate LTM configuration
    if LTM_ENABLED:
        if LTM_START is None or LTM_END is None:
            errors.append("LTM_ENABLED is True but LTM_START or LTM_END is None")
        else:
            if LTM_START > LTM_END:
                errors.append(f"LTM_START ({LTM_START}) is after LTM_END ({LTM_END})")
            if 'YearMonth' in df.columns:
                available_periods = df['YearMonth'].unique()
                if LTM_START not in available_periods:
                    warnings.append(f"LTM_START ({LTM_START}) not found in data")
                if LTM_END not in available_periods:
                    warnings.append(f"LTM_END ({LTM_END}) not found in data")

    # 8. Validate exclusion filters
    if EXCLUSION_FILTERS.get('enabled', False):
        exclude_col = EXCLUSION_FILTERS.get('exclude_by_column')
        if exclude_col:
            if exclude_col not in df.columns:
                errors.append(f"Exclusion filter column '{exclude_col}' not found in data")
            else:
                exclude_values = EXCLUSION_FILTERS.get('exclude_values', [])
                if exclude_values:
                    available_values = df[exclude_col].unique()
                    invalid_values = [v for v in exclude_values if v not in available_values]
                    if invalid_values:
                        warnings.append(f"Exclusion filter values not found in data: {invalid_values}")

    # 9. Validate optional columns (warnings only)
    optional_columns = {
        'Customer': CUSTOMER_COLUMN,
        'Item': ITEM_COLUMN,
        'Quantity': QUANTITY_COLUMN
    }
    for col_type, col_name in optional_columns.items():
        if col_name and col_name not in df.columns:
            warnings.append(f"Optional {col_type} column '{col_name}' not found. Some analyses may not work.")

    # 10. Validate data file exists (also checked when a frame was passed in,
    # since later analyses may still read from the configured path)
    data_path = get_data_path()
    if not data_path.exists():
        errors.append(f"Data file not found: {data_path}")

    return errors, warnings
def print_validation_report(errors, warnings):
    """
    Write a human-readable validation summary to standard output.

    The report is framed by 60-character rules and contains a numbered
    list of errors followed by a numbered list of warnings, or a
    confirmation line for each list that is empty.

    Args:
        errors: List of error messages
        warnings: List of warning messages

    Returns:
        bool: True when ``errors`` is empty, False otherwise.
    """
    rule = "="*60

    print("\n" + rule)
    print("Configuration Validation Report")
    print(rule)

    # Errors section
    if not errors:
        print("\n✅ No configuration errors found")
    else:
        print(f"\n❌ ERRORS ({len(errors)}):")
        for i, error in enumerate(errors, start=1):
            print(f" {i}. {error}")

    # Warnings section
    if not warnings:
        print("\n✅ No warnings")
    else:
        print(f"\n⚠️ WARNINGS ({len(warnings)}):")
        for i, warning in enumerate(warnings, start=1):
            print(f" {i}. {warning}")

    print("\n" + rule)

    # The report counts as passing only when nothing was recorded as an error.
    return not errors
def validate_and_report(df=None):
    """
    Run the configuration checks and print the resulting report.

    Args:
        df: Optional DataFrame to validate against; forwarded unchanged
            to validate_config.

    Returns:
        bool: True if no errors, False otherwise
    """
    # validate_config yields (errors, warnings), which is exactly the
    # positional argument order print_validation_report expects.
    return print_validation_report(*validate_config(df))
# ============================================================================
# STANDALONE VALIDATION SCRIPT
# ============================================================================
if __name__ == "__main__":
    # Run configuration validation as a standalone script. The process exit
    # status is 0 when no errors were found and 1 otherwise.
    print("Validating configuration...")
    is_valid = validate_and_report()
    # SystemExit is raised directly instead of calling exit(): that helper is
    # injected by the site module for interactive use and is not available
    # when the interpreter runs without site (e.g. `python -S`).
    if is_valid:
        print("\n✅ Configuration is valid!")
        raise SystemExit(0)
    else:
        print("\n❌ Configuration has errors. Please fix them before running analyses.")
        raise SystemExit(1)