"""
Configuration validation utility

Validates configuration settings against data to catch errors early.

Usage:
    from config_validator import validate_config

    # Validate configuration
    errors, warnings = validate_config(df)
    if errors:
        print("Configuration errors found:", errors)
"""

import sys
from pathlib import Path
from typing import List, Optional, Tuple

import pandas as pd

from config import (
    DATA_FILE,
    REVENUE_COLUMN,
    DATE_COLUMN,
    DATE_FALLBACK_COLUMNS,
    CUSTOMER_COLUMN,
    ITEM_COLUMN,
    QUANTITY_COLUMN,
    MIN_YEAR,
    MAX_DATE,
    ANALYSIS_YEARS,
    LTM_ENABLED,
    LTM_START,
    LTM_END,
    LTM_START_YEAR,
    LTM_END_YEAR,
    EXCLUSION_FILTERS,
    get_data_path,
)


def validate_config(
    df: Optional[pd.DataFrame] = None,
) -> Tuple[List[str], List[str]]:
    """
    Validate configuration against data

    Args:
        df: Optional DataFrame to validate against. If None, the data is
            loaded with ``data_loader.load_sales_data(get_data_path())``.

    Returns:
        tuple: (errors list, warnings list). Both are lists of
        human-readable message strings; an empty errors list means no
        blocking problem was found.

    Example:
        errors, warnings = validate_config(df)
        if errors:
            for error in errors:
                print(f"ERROR: {error}")
        if warnings:
            for warning in warnings:
                print(f"WARNING: {warning}")
    """
    errors: List[str] = []
    warnings: List[str] = []

    # Load data if not provided. Any failure here is reported as a single
    # error and validation stops, since every later check needs the data.
    if df is None:
        try:
            from data_loader import load_sales_data

            data_path = get_data_path()
            if not data_path.exists():
                errors.append(f"Data file not found: {data_path}")
                return errors, warnings
            df = load_sales_data(data_path)
        except Exception as e:
            errors.append(f"Could not load data for validation: {e}")
            return errors, warnings

    # Row-dependent checks (coverage ratios, numeric counts) are meaningless
    # on an empty frame and would divide by zero, so report that explicitly.
    row_count = len(df)
    if row_count == 0:
        errors.append(
            "Data contains no rows; column contents cannot be validated."
        )

    # 1. Validate required columns exist
    required_columns = [REVENUE_COLUMN, DATE_COLUMN]
    for col in required_columns:
        if col not in df.columns:
            errors.append(
                f"Required column '{col}' not found in data. "
                f"Available columns: {list(df.columns)[:10]}"
            )

    # 2. Validate date column has valid dates
    if row_count and DATE_COLUMN in df.columns:
        date_coverage = df[DATE_COLUMN].notna().sum() / row_count * 100
        if date_coverage < 50:
            errors.append(
                f"Date coverage is very low ({date_coverage:.1f}%). "
                "Check date column configuration."
            )
        elif date_coverage < 90:
            warnings.append(
                f"Date coverage is {date_coverage:.1f}%. "
                "Consider adding fallback date columns."
            )

    # 3. Validate fallback date columns
    if DATE_FALLBACK_COLUMNS:
        missing_fallbacks = [
            col for col in DATE_FALLBACK_COLUMNS if col not in df.columns
        ]
        if missing_fallbacks:
            warnings.append(
                f"Fallback date columns not found: {missing_fallbacks}"
            )

    # 4. Validate revenue column is numeric
    if row_count and REVENUE_COLUMN in df.columns:
        try:
            # Keep the coerced result: non-numeric entries become NaN, so
            # counting non-null values below counts the numeric ones only.
            numeric_revenue = pd.to_numeric(
                df[REVENUE_COLUMN], errors='coerce'
            )
        except (TypeError, ValueError):
            errors.append(
                f"Revenue column '{REVENUE_COLUMN}' cannot be converted "
                "to numeric"
            )
        else:
            valid_revenue = numeric_revenue.notna().sum()
            if valid_revenue == 0:
                errors.append(
                    f"Revenue column '{REVENUE_COLUMN}' has no valid "
                    "numeric values"
                )
            elif valid_revenue < row_count * 0.9:
                warnings.append(
                    f"Revenue column has {row_count - valid_revenue} "
                    "invalid values"
                )

    # 5. Validate date range
    # NOTE(review): assumes the date column holds datetime-like values
    # (``.year`` / ``.date()`` are used below) - confirm against the loader.
    if DATE_COLUMN in df.columns and df[DATE_COLUMN].notna().any():
        min_date_in_data = df[DATE_COLUMN].min()
        max_date_in_data = df[DATE_COLUMN].max()

        if MIN_YEAR and min_date_in_data.year > MIN_YEAR:
            warnings.append(
                f"MIN_YEAR ({MIN_YEAR}) is earlier than earliest data "
                f"({min_date_in_data.year})"
            )

        if MAX_DATE and max_date_in_data > MAX_DATE:
            warnings.append(
                f"MAX_DATE ({MAX_DATE.date()}) is earlier than latest data "
                f"({max_date_in_data.date()})"
            )

    # 6. Validate analysis years
    if 'Year' in df.columns:
        available_years = sorted(df['Year'].unique())
        missing_years = [
            year for year in ANALYSIS_YEARS if year not in available_years
        ]
        if missing_years:
            warnings.append(
                f"ANALYSIS_YEARS includes years not in data: {missing_years}"
            )

    # 7. Validate LTM configuration
    if LTM_ENABLED:
        if LTM_START is None or LTM_END is None:
            errors.append(
                "LTM_ENABLED is True but LTM_START or LTM_END is None"
            )
        else:
            if LTM_START > LTM_END:
                errors.append(
                    f"LTM_START ({LTM_START}) is after LTM_END ({LTM_END})"
                )

            if 'YearMonth' in df.columns:
                # Membership is tested against the array returned by
                # ``unique()`` so the column type's own comparison applies.
                available_periods = df['YearMonth'].unique()
                if LTM_START not in available_periods:
                    warnings.append(
                        f"LTM_START ({LTM_START}) not found in data"
                    )
                if LTM_END not in available_periods:
                    warnings.append(f"LTM_END ({LTM_END}) not found in data")

    # 8. Validate exclusion filters
    if EXCLUSION_FILTERS.get('enabled', False):
        exclude_col = EXCLUSION_FILTERS.get('exclude_by_column')
        if exclude_col:
            if exclude_col not in df.columns:
                errors.append(
                    f"Exclusion filter column '{exclude_col}' not found "
                    "in data"
                )
            else:
                exclude_values = EXCLUSION_FILTERS.get('exclude_values', [])
                if exclude_values:
                    available_values = df[exclude_col].unique()
                    invalid_values = [
                        v for v in exclude_values
                        if v not in available_values
                    ]
                    if invalid_values:
                        warnings.append(
                            "Exclusion filter values not found in data: "
                            f"{invalid_values}"
                        )

    # 9. Validate optional columns (warnings only)
    optional_columns = {
        'Customer': CUSTOMER_COLUMN,
        'Item': ITEM_COLUMN,
        'Quantity': QUANTITY_COLUMN,
    }
    for col_type, col_name in optional_columns.items():
        if col_name and col_name not in df.columns:
            warnings.append(
                f"Optional {col_type} column '{col_name}' not found. "
                "Some analyses may not work."
            )

    # 10. Validate data file exists (checked even when df was passed in)
    data_path = get_data_path()
    if not data_path.exists():
        errors.append(f"Data file not found: {data_path}")

    return errors, warnings


def print_validation_report(errors: List[str], warnings: List[str]) -> bool:
    """
    Print a formatted validation report

    Args:
        errors: List of error messages
        warnings: List of warning messages

    Returns:
        bool: True if there are no errors, False otherwise. Warnings do not
        affect the result.
    """
    print("\n" + "="*60)
    print("Configuration Validation Report")
    print("="*60)

    if errors:
        print(f"\n❌ ERRORS ({len(errors)}):")
        for i, error in enumerate(errors, 1):
            print(f" {i}. {error}")
    else:
        print("\n✅ No configuration errors found")

    if warnings:
        print(f"\n⚠️ WARNINGS ({len(warnings)}):")
        for i, warning in enumerate(warnings, 1):
            print(f" {i}. {warning}")
    else:
        print("\n✅ No warnings")

    print("\n" + "="*60)

    return not errors


def validate_and_report(df: Optional[pd.DataFrame] = None) -> bool:
    """
    Validate configuration and print report

    Args:
        df: Optional DataFrame to validate against

    Returns:
        bool: True if no errors, False otherwise
    """
    errors, warnings = validate_config(df)
    return print_validation_report(errors, warnings)


# ============================================================================
# STANDALONE VALIDATION SCRIPT
# ============================================================================

def main() -> int:
    """Run configuration validation and return a process exit code.

    Returns:
        int: 0 when validation reports no errors, 1 otherwise.
    """
    print("Validating configuration...")
    is_valid = validate_and_report()

    if is_valid:
        print("\n✅ Configuration is valid!")
        return 0

    print("\n❌ Configuration has errors. Please fix them before running analyses.")
    return 1


if __name__ == "__main__":
    # sys.exit is used instead of the site-provided exit(), which is not
    # guaranteed to exist (e.g. under ``python -S``).
    sys.exit(main())