# sales-data-analysis/config_validator.py

"""
Configuration validation utility
Validates configuration settings against data to catch errors early

Usage:
    from config_validator import validate_config

    # Validate configuration
    errors, warnings = validate_config(df)
    if errors:
        print("Configuration errors found:", errors)
"""
import pandas as pd
from pathlib import Path
from config import (
    DATA_FILE, REVENUE_COLUMN, DATE_COLUMN, DATE_FALLBACK_COLUMNS,
    CUSTOMER_COLUMN, ITEM_COLUMN, QUANTITY_COLUMN,
    MIN_YEAR, MAX_DATE, ANALYSIS_YEARS,
    LTM_ENABLED, LTM_START, LTM_END, LTM_START_YEAR, LTM_END_YEAR,
    EXCLUSION_FILTERS, get_data_path
)

def validate_config(df=None):
    """
    Validate configuration against data

    Runs a sequence of independent checks that compare the settings imported
    from ``config`` with the contents of ``df``. Problems are collected into
    two lists instead of being raised, so a single call reports everything
    that was found.

    Args:
        df: Optional DataFrame to validate against. If None, attempts to load data.

    Returns:
        tuple: (errors list, warnings list)

    Example:
        errors, warnings = validate_config(df)
        if errors:
            for error in errors:
                print(f"ERROR: {error}")
        if warnings:
            for warning in warnings:
                print(f"WARNING: {warning}")
    """
    errors = []
    warnings = []

    # Load data if not provided
    if df is None:
        try:
            from data_loader import load_sales_data
            data_path = get_data_path()
            if not data_path.exists():
                errors.append(f"Data file not found: {data_path}")
                return errors, warnings
            df = load_sales_data(data_path)
        except Exception as e:
            # Boundary with the loader: any failure becomes a reported error
            # because none of the checks below can run without data.
            errors.append(f"Could not load data for validation: {e}")
            return errors, warnings

    total_rows = len(df)

    # 1. Validate required columns exist
    required_columns = [REVENUE_COLUMN, DATE_COLUMN]
    for col in required_columns:
        if col not in df.columns:
            errors.append(f"Required column '{col}' not found in data. Available columns: {list(df.columns)[:10]}")

    # 2. Validate date column has valid dates
    if DATE_COLUMN in df.columns:
        # An empty frame has no valid dates; report 0% coverage instead of
        # dividing by zero (which would produce NaN and skip both branches).
        if total_rows:
            date_coverage = df[DATE_COLUMN].notna().sum() / total_rows * 100
        else:
            date_coverage = 0.0
        if date_coverage < 50:
            errors.append(f"Date coverage is very low ({date_coverage:.1f}%). Check date column configuration.")
        elif date_coverage < 90:
            warnings.append(f"Date coverage is {date_coverage:.1f}%. Consider adding fallback date columns.")

    # 3. Validate fallback date columns
    if DATE_FALLBACK_COLUMNS:
        missing_fallbacks = [col for col in DATE_FALLBACK_COLUMNS if col not in df.columns]
        if missing_fallbacks:
            warnings.append(f"Fallback date columns not found: {missing_fallbacks}")

    # 4. Validate revenue column is numeric
    if REVENUE_COLUMN in df.columns:
        try:
            numeric_revenue = pd.to_numeric(df[REVENUE_COLUMN], errors='coerce')
        except (TypeError, ValueError):
            errors.append(f"Revenue column '{REVENUE_COLUMN}' cannot be converted to numeric")
        else:
            # Count on the coerced series: values that could not be parsed
            # became NaN there, whereas the raw column would still count
            # non-numeric strings as present.
            valid_revenue = int(numeric_revenue.notna().sum())
            if valid_revenue == 0:
                errors.append(f"Revenue column '{REVENUE_COLUMN}' has no valid numeric values")
            elif valid_revenue < total_rows * 0.9:
                warnings.append(f"Revenue column has {total_rows - valid_revenue} invalid values")

    # 5. Validate date range
    if DATE_COLUMN in df.columns and df[DATE_COLUMN].notna().any():
        min_date_in_data = df[DATE_COLUMN].min()
        max_date_in_data = df[DATE_COLUMN].max()

        if MIN_YEAR and min_date_in_data.year > MIN_YEAR:
            warnings.append(f"MIN_YEAR ({MIN_YEAR}) is earlier than earliest data ({min_date_in_data.year})")

        if MAX_DATE and max_date_in_data > MAX_DATE:
            warnings.append(f"MAX_DATE ({MAX_DATE.date()}) is earlier than latest data ({max_date_in_data.date()})")

    # 6. Validate analysis years
    if 'Year' in df.columns:
        # A set gives O(1) membership tests; ordering is irrelevant here.
        available_years = set(df['Year'].unique())
        missing_years = [year for year in (ANALYSIS_YEARS or []) if year not in available_years]
        if missing_years:
            warnings.append(f"ANALYSIS_YEARS includes years not in data: {missing_years}")

    # 7. Validate LTM configuration
    if LTM_ENABLED:
        if LTM_START is None or LTM_END is None:
            errors.append("LTM_ENABLED is True but LTM_START or LTM_END is None")
        else:
            if LTM_START > LTM_END:
                errors.append(f"LTM_START ({LTM_START}) is after LTM_END ({LTM_END})")

            if 'YearMonth' in df.columns:
                # Use a set so membership is a plain hash/equality lookup
                # rather than NumPy's element-wise ``in`` on an array.
                available_periods = set(df['YearMonth'].unique())
                if LTM_START not in available_periods:
                    warnings.append(f"LTM_START ({LTM_START}) not found in data")
                if LTM_END not in available_periods:
                    warnings.append(f"LTM_END ({LTM_END}) not found in data")

    # 8. Validate exclusion filters
    filters = EXCLUSION_FILTERS or {}  # tolerate an unset (None) setting
    if filters.get('enabled', False):
        exclude_col = filters.get('exclude_by_column')
        if exclude_col:
            if exclude_col not in df.columns:
                errors.append(f"Exclusion filter column '{exclude_col}' not found in data")
            else:
                exclude_values = filters.get('exclude_values', [])
                if exclude_values:
                    available_values = set(df[exclude_col].unique())
                    invalid_values = [v for v in exclude_values if v not in available_values]
                    if invalid_values:
                        warnings.append(f"Exclusion filter values not found in data: {invalid_values}")

    # 9. Validate optional columns (warnings only)
    optional_columns = {
        'Customer': CUSTOMER_COLUMN,
        'Item': ITEM_COLUMN,
        'Quantity': QUANTITY_COLUMN
    }

    for col_type, col_name in optional_columns.items():
        if col_name and col_name not in df.columns:
            warnings.append(f"Optional {col_type} column '{col_name}' not found. Some analyses may not work.")

    # 10. Validate data file exists
    # Checked even when a DataFrame was passed in by the caller.
    data_path = get_data_path()
    if not data_path.exists():
        errors.append(f"Data file not found: {data_path}")

    return errors, warnings

def print_validation_report(errors, warnings):
    """
    Write a human-readable validation summary to standard output

    The report is framed by 60-character ``=`` rules and contains one
    section for errors followed by one for warnings. A non-empty section
    lists its messages numbered from 1; an empty one prints a single
    confirmation line instead.

    Args:
        errors: List of error messages
        warnings: List of warning messages

    Returns:
        bool: True when ``errors`` is empty, False otherwise
    """
    divider = "=" * 60

    print("\n" + divider)
    print("Configuration Validation Report")
    print(divider)

    # (messages, heading shown when non-empty, line shown when empty)
    sections = (
        (errors, "❌ ERRORS", "✅ No configuration errors found"),
        (warnings, "⚠️  WARNINGS", "✅ No warnings"),
    )
    for messages, heading, all_clear in sections:
        if not messages:
            print("\n" + all_clear)
            continue
        print(f"\n{heading} ({len(messages)}):")
        for position, message in enumerate(messages, start=1):
            print(f"  {position}. {message}")

    print("\n" + divider)

    # Only errors affect the verdict; warnings never fail the report.
    return not errors

def validate_and_report(df=None):
    """
    Run the configuration checks and print the resulting report

    Args:
        df: Optional DataFrame to validate against; passed straight through
            to ``validate_config``

    Returns:
        bool: True if no errors, False otherwise
    """
    # validate_config returns (errors, warnings), which is exactly the
    # positional argument order print_validation_report expects.
    return print_validation_report(*validate_config(df))

# ============================================================================
# STANDALONE VALIDATION SCRIPT
# ============================================================================

if __name__ == "__main__":
    # Run configuration validation as a standalone script. The process exit
    # status is 0 when no errors were reported and 1 otherwise.
    print("Validating configuration...")
    is_valid = validate_and_report()

    if is_valid:
        print("\n✅ Configuration is valid!")
    else:
        print("\n❌ Configuration has errors. Please fix them before running analyses.")

    # Raise SystemExit directly instead of calling the bare `exit()` helper:
    # that name is injected by the `site` module for interactive sessions and
    # is not available when the interpreter runs without `site` (python -S).
    raise SystemExit(0 if is_valid else 1)