Initial commit: sales analysis template

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-02-06 09:16:34 -05:00
commit cf0b596449
38 changed files with 8001 additions and 0 deletions
--- a/config_validator.py
+++ b/config_validator.py
@@ -0,0 +1,214 @@
+"""
+Configuration validation utility
+Validates configuration settings against data to catch errors early
+
+Usage:
+    from config_validator import validate_config
+    
+    # Validate configuration
+    errors, warnings = validate_config(df)
+    if errors:
+        print("Configuration errors found:", errors)
+"""
+import pandas as pd
+from pathlib import Path
+from config import (
+    DATA_FILE, REVENUE_COLUMN, DATE_COLUMN, DATE_FALLBACK_COLUMNS,
+    CUSTOMER_COLUMN, ITEM_COLUMN, QUANTITY_COLUMN,
+    MIN_YEAR, MAX_DATE, ANALYSIS_YEARS,
+    LTM_ENABLED, LTM_START, LTM_END, LTM_START_YEAR, LTM_END_YEAR,
+    EXCLUSION_FILTERS, get_data_path
+)
+
+def validate_config(df=None):
+    """
+    Validate configuration settings against the sales data.
+
+    Runs a series of independent checks (required columns, date coverage,
+    fallback date columns, revenue values, date range, analysis years, LTM
+    settings, exclusion filters, optional columns, data file presence) and
+    collects every problem found instead of stopping at the first one.
+
+    Args:
+        df: Optional DataFrame to validate against. If None, the data is
+            loaded with data_loader.load_sales_data(get_data_path()).
+
+    Returns:
+        tuple: (errors list, warnings list). Each entry is a message string.
+        If the data cannot be loaded, the load error is returned immediately
+        and no further checks are run.
+
+    Example:
+        errors, warnings = validate_config(df)
+        if errors:
+            for error in errors:
+                print(f"ERROR: {error}")
+        if warnings:
+            for warning in warnings:
+                print(f"WARNING: {warning}")
+    """
+    errors = []
+    warnings = []
+
+    # Load data if not provided
+    if df is None:
+        try:
+            from data_loader import load_sales_data
+            data_path = get_data_path()
+            if not data_path.exists():
+                errors.append(f"Data file not found: {data_path}")
+                return errors, warnings
+            df = load_sales_data(data_path)
+        except Exception as e:
+            # The validator reports problems rather than crashing, so any
+            # load failure is turned into an error message.
+            errors.append(f"Could not load data for validation: {e}")
+            return errors, warnings
+
+    row_count = len(df)
+
+    # 1. Validate required columns exist
+    for col in (REVENUE_COLUMN, DATE_COLUMN):
+        if col not in df.columns:
+            errors.append(f"Required column '{col}' not found in data. Available columns: {list(df.columns)[:10]}")
+
+    # 2. Validate date column has valid dates
+    if DATE_COLUMN in df.columns:
+        # An empty frame counts as 0% coverage instead of dividing by zero.
+        if row_count:
+            date_coverage = df[DATE_COLUMN].notna().sum() / row_count * 100
+        else:
+            date_coverage = 0.0
+        if date_coverage < 50:
+            errors.append(f"Date coverage is very low ({date_coverage:.1f}%). Check date column configuration.")
+        elif date_coverage < 90:
+            warnings.append(f"Date coverage is {date_coverage:.1f}%. Consider adding fallback date columns.")
+
+    # 3. Validate fallback date columns
+    if DATE_FALLBACK_COLUMNS:
+        missing_fallbacks = [col for col in DATE_FALLBACK_COLUMNS if col not in df.columns]
+        if missing_fallbacks:
+            warnings.append(f"Fallback date columns not found: {missing_fallbacks}")
+
+    # 4. Validate revenue column is numeric
+    if REVENUE_COLUMN in df.columns:
+        try:
+            # Count the values that survive numeric coercion. Counting notna()
+            # on the raw column would treat non-numeric text as valid revenue.
+            numeric_revenue = pd.to_numeric(df[REVENUE_COLUMN], errors='coerce')
+        except Exception:
+            errors.append(f"Revenue column '{REVENUE_COLUMN}' cannot be converted to numeric")
+        else:
+            valid_revenue = numeric_revenue.notna().sum()
+            if valid_revenue == 0:
+                errors.append(f"Revenue column '{REVENUE_COLUMN}' has no valid numeric values")
+            elif valid_revenue < row_count * 0.9:
+                warnings.append(f"Revenue column has {row_count - valid_revenue} invalid values")
+
+    # 5. Validate date range
+    if DATE_COLUMN in df.columns and df[DATE_COLUMN].notna().any():
+        try:
+            min_date_in_data = df[DATE_COLUMN].min()
+            max_date_in_data = df[DATE_COLUMN].max()
+
+            if MIN_YEAR and min_date_in_data.year > MIN_YEAR:
+                warnings.append(f"MIN_YEAR ({MIN_YEAR}) is earlier than earliest data ({min_date_in_data.year})")
+
+            if MAX_DATE and max_date_in_data > MAX_DATE:
+                warnings.append(f"MAX_DATE ({MAX_DATE.date()}) is earlier than latest data ({max_date_in_data.date()})")
+        except (AttributeError, TypeError) as e:
+            # Values that are not datetime-like (e.g. unparsed strings) have
+            # no .year and cannot be compared with MAX_DATE. Report that as a
+            # warning so the remaining checks still run.
+            warnings.append(f"Could not check date range for '{DATE_COLUMN}': {e}")
+
+    # 6. Validate analysis years
+    if 'Year' in df.columns:
+        # Only membership is needed, so a set avoids sorting the values
+        # (sorting raises on mixed types).
+        available_years = set(df['Year'].unique())
+        missing_years = [year for year in ANALYSIS_YEARS if year not in available_years]
+        if missing_years:
+            warnings.append(f"ANALYSIS_YEARS includes years not in data: {missing_years}")
+
+    # 7. Validate LTM configuration
+    if LTM_ENABLED:
+        if LTM_START is None or LTM_END is None:
+            errors.append("LTM_ENABLED is True but LTM_START or LTM_END is None")
+        else:
+            if LTM_START > LTM_END:
+                errors.append(f"LTM_START ({LTM_START}) is after LTM_END ({LTM_END})")
+
+            if 'YearMonth' in df.columns:
+                available_periods = df['YearMonth'].unique()
+                if LTM_START not in available_periods:
+                    warnings.append(f"LTM_START ({LTM_START}) not found in data")
+                if LTM_END not in available_periods:
+                    warnings.append(f"LTM_END ({LTM_END}) not found in data")
+
+    # 8. Validate exclusion filters
+    if EXCLUSION_FILTERS.get('enabled', False):
+        exclude_col = EXCLUSION_FILTERS.get('exclude_by_column')
+        if exclude_col:
+            if exclude_col not in df.columns:
+                errors.append(f"Exclusion filter column '{exclude_col}' not found in data")
+            else:
+                exclude_values = EXCLUSION_FILTERS.get('exclude_values', [])
+                if exclude_values:
+                    available_values = df[exclude_col].unique()
+                    invalid_values = [v for v in exclude_values if v not in available_values]
+                    if invalid_values:
+                        warnings.append(f"Exclusion filter values not found in data: {invalid_values}")
+
+    # 9. Validate optional columns (warnings only)
+    optional_columns = {
+        'Customer': CUSTOMER_COLUMN,
+        'Item': ITEM_COLUMN,
+        'Quantity': QUANTITY_COLUMN
+    }
+
+    for col_type, col_name in optional_columns.items():
+        if col_name and col_name not in df.columns:
+            warnings.append(f"Optional {col_type} column '{col_name}' not found. Some analyses may not work.")
+
+    # 10. Validate data file exists (also checked when df was supplied by the caller)
+    data_path = get_data_path()
+    if not data_path.exists():
+        errors.append(f"Data file not found: {data_path}")
+
+    return errors, warnings
+
+def print_validation_report(errors, warnings):
+    """
+    Print a formatted validation report to stdout.
+
+    Args:
+        errors: List of error messages
+        warnings: List of warning messages
+
+    Returns:
+        bool: False when errors is non-empty, True otherwise
+    """
+    divider = "=" * 60
+
+    print("\n" + divider)
+    print("Configuration Validation Report")
+    print(divider)
+
+    # Errors section: numbered list, or an all-clear line when there are none
+    if not errors:
+        print("\n✅ No configuration errors found")
+    else:
+        print(f"\n❌ ERRORS ({len(errors)}):")
+        for number, message in enumerate(errors, start=1):
+            print(f"  {number}. {message}")
+
+    # Warnings section, same layout as the errors section
+    if not warnings:
+        print("\n✅ No warnings")
+    else:
+        print(f"\n⚠️  WARNINGS ({len(warnings)}):")
+        for number, message in enumerate(warnings, start=1):
+            print(f"  {number}. {message}")
+
+    print("\n" + divider)
+
+    return not errors
+
+def validate_and_report(df=None):
+    """
+    Run validate_config and print the resulting report.
+
+    Args:
+        df: Optional DataFrame to validate against (passed to validate_config)
+
+    Returns:
+        bool: True if no errors, False otherwise
+    """
+    found_errors, found_warnings = validate_config(df)
+    is_valid = print_validation_report(found_errors, found_warnings)
+    return is_valid
+
+# ============================================================================
+# STANDALONE VALIDATION SCRIPT
+# ============================================================================
+
+if __name__ == "__main__":
+    # Run configuration validation and report the outcome through the exit
+    # code: 0 when the configuration is valid, 1 when errors were found.
+    # sys.exit is used because the bare exit() helper is added by the site
+    # module for interactive use and is not guaranteed to exist in every
+    # interpreter setup.
+    import sys
+
+    print("Validating configuration...")
+    is_valid = validate_and_report()
+
+    if is_valid:
+        print("\n✅ Configuration is valid!")
+        sys.exit(0)
+    else:
+        print("\n❌ Configuration has errors. Please fix them before running analyses.")
+        sys.exit(1)