215 lines
7.7 KiB
Python
215 lines
7.7 KiB
Python
"""
Configuration validation utility

Validates configuration settings against data to catch errors early.

Usage:
    from config_validator import validate_config

    # Validate configuration
    errors, warnings = validate_config(df)
    if errors:
        print("Configuration errors found:", errors)
"""
|
|
import pandas as pd
|
|
from pathlib import Path
|
|
from config import (
|
|
DATA_FILE, REVENUE_COLUMN, DATE_COLUMN, DATE_FALLBACK_COLUMNS,
|
|
CUSTOMER_COLUMN, ITEM_COLUMN, QUANTITY_COLUMN,
|
|
MIN_YEAR, MAX_DATE, ANALYSIS_YEARS,
|
|
LTM_ENABLED, LTM_START, LTM_END, LTM_START_YEAR, LTM_END_YEAR,
|
|
EXCLUSION_FILTERS, get_data_path
|
|
)
|
|
|
|
def validate_config(df=None):
    """
    Validate configuration against data.

    Runs ten independent checks of the configured column names, date
    settings, LTM window and exclusion filters against the DataFrame.
    Each problem found is appended as a message string to either the
    errors list or the warnings list; nothing is raised for a failed check.

    Args:
        df: Optional DataFrame to validate against. If None, the file at
            get_data_path() is loaded with data_loader.load_sales_data.

    Returns:
        tuple: (errors list, warnings list). If the data has to be loaded
        and that fails (file missing or loader raised), the function
        returns immediately with that single error and no further checks.

    Example:
        errors, warnings = validate_config(df)
        if errors:
            for error in errors:
                print(f"ERROR: {error}")
        if warnings:
            for warning in warnings:
                print(f"WARNING: {warning}")
    """
    errors = []
    warnings = []

    # Load data if not provided
    if df is None:
        try:
            # Imported inside the try so that an import failure is reported
            # as a load error instead of propagating to the caller.
            from data_loader import load_sales_data
            data_path = get_data_path()
            if not data_path.exists():
                errors.append(f"Data file not found: {data_path}")
                return errors, warnings
            df = load_sales_data(data_path)
        except Exception as e:
            errors.append(f"Could not load data for validation: {e}")
            return errors, warnings

    total_rows = len(df)

    # 1. Validate required columns exist
    required_columns = [REVENUE_COLUMN, DATE_COLUMN]
    for col in required_columns:
        if col not in df.columns:
            errors.append(f"Required column '{col}' not found in data. Available columns: {list(df.columns)[:10]}")

    # 2. Validate date column has valid dates
    if DATE_COLUMN in df.columns:
        # An empty frame has no dates at all: treat it as 0% coverage rather
        # than dividing by zero (which yields NaN and skips both thresholds).
        date_coverage = (
            df[DATE_COLUMN].notna().sum() / total_rows * 100 if total_rows else 0.0
        )
        if date_coverage < 50:
            errors.append(f"Date coverage is very low ({date_coverage:.1f}%). Check date column configuration.")
        elif date_coverage < 90:
            warnings.append(f"Date coverage is {date_coverage:.1f}%. Consider adding fallback date columns.")

    # 3. Validate fallback date columns
    if DATE_FALLBACK_COLUMNS:
        missing_fallbacks = [col for col in DATE_FALLBACK_COLUMNS if col not in df.columns]
        if missing_fallbacks:
            warnings.append(f"Fallback date columns not found: {missing_fallbacks}")

    # 4. Validate revenue column is numeric
    if REVENUE_COLUMN in df.columns:
        try:
            # errors='coerce' turns unparseable entries into NaN in the
            # RETURNED series; the original column is left untouched, so the
            # validity count below must use the coerced result.
            numeric_revenue = pd.to_numeric(df[REVENUE_COLUMN], errors='coerce')
        except Exception:
            # Best-effort: any conversion failure is reported, not raised.
            errors.append(f"Revenue column '{REVENUE_COLUMN}' cannot be converted to numeric")
        else:
            valid_revenue = int(numeric_revenue.notna().sum())
            if valid_revenue == 0:
                errors.append(f"Revenue column '{REVENUE_COLUMN}' has no valid numeric values")
            elif valid_revenue < total_rows * 0.9:
                warnings.append(f"Revenue column has {total_rows - valid_revenue} invalid values")

    # 5. Validate date range
    # NOTE(review): uses .year / .date() on the column's min/max and on
    # MAX_DATE, so this presumably expects datetime-like values - confirm
    # against the loader and config.
    if DATE_COLUMN in df.columns and df[DATE_COLUMN].notna().any():
        min_date_in_data = df[DATE_COLUMN].min()
        max_date_in_data = df[DATE_COLUMN].max()

        if MIN_YEAR and min_date_in_data.year > MIN_YEAR:
            warnings.append(f"MIN_YEAR ({MIN_YEAR}) is earlier than earliest data ({min_date_in_data.year})")

        if MAX_DATE and max_date_in_data > MAX_DATE:
            warnings.append(f"MAX_DATE ({MAX_DATE.date()}) is earlier than latest data ({max_date_in_data.date()})")

    # 6. Validate analysis years
    if 'Year' in df.columns:
        # Only membership is needed, so a set is used; unlike sorting, it
        # does not depend on the year values being mutually comparable.
        available_years = set(df['Year'].unique())
        missing_years = [year for year in (ANALYSIS_YEARS or []) if year not in available_years]
        if missing_years:
            warnings.append(f"ANALYSIS_YEARS includes years not in data: {missing_years}")

    # 7. Validate LTM configuration
    if LTM_ENABLED:
        if LTM_START is None or LTM_END is None:
            errors.append("LTM_ENABLED is True but LTM_START or LTM_END is None")
        else:
            if LTM_START > LTM_END:
                errors.append(f"LTM_START ({LTM_START}) is after LTM_END ({LTM_END})")

            if 'YearMonth' in df.columns:
                available_periods = df['YearMonth'].unique()
                if LTM_START not in available_periods:
                    warnings.append(f"LTM_START ({LTM_START}) not found in data")
                if LTM_END not in available_periods:
                    warnings.append(f"LTM_END ({LTM_END}) not found in data")

    # 8. Validate exclusion filters
    if EXCLUSION_FILTERS.get('enabled', False):
        exclude_col = EXCLUSION_FILTERS.get('exclude_by_column')
        if exclude_col:
            if exclude_col not in df.columns:
                errors.append(f"Exclusion filter column '{exclude_col}' not found in data")
            else:
                exclude_values = EXCLUSION_FILTERS.get('exclude_values', [])
                if exclude_values:
                    available_values = df[exclude_col].unique()
                    invalid_values = [v for v in exclude_values if v not in available_values]
                    if invalid_values:
                        warnings.append(f"Exclusion filter values not found in data: {invalid_values}")

    # 9. Validate optional columns (warnings only)
    optional_columns = {
        'Customer': CUSTOMER_COLUMN,
        'Item': ITEM_COLUMN,
        'Quantity': QUANTITY_COLUMN,
    }

    for col_type, col_name in optional_columns.items():
        # An unset (falsy) column name means the option is not configured.
        if col_name and col_name not in df.columns:
            warnings.append(f"Optional {col_type} column '{col_name}' not found. Some analyses may not work.")

    # 10. Validate data file exists
    # Checked even when a DataFrame was passed in by the caller.
    data_path = get_data_path()
    if not data_path.exists():
        errors.append(f"Data file not found: {data_path}")

    return errors, warnings
|
|
|
|
def print_validation_report(errors, warnings):
    """
    Print a formatted validation report

    The report is a banner, a numbered list of errors (or a "none found"
    line), a numbered list of warnings (or a "none" line), and a closing
    divider, all written to standard output.

    Args:
        errors: List of error messages
        warnings: List of warning messages

    Returns:
        bool: False if errors is non-empty, True otherwise
    """
    divider = "=" * 60

    def _print_numbered(messages):
        # Numbering starts at 1 to match the counts shown in the headers.
        for position, message in enumerate(messages, 1):
            print(f" {position}. {message}")

    print("\n" + divider)
    print("Configuration Validation Report")
    print(divider)

    if not errors:
        print("\n✅ No configuration errors found")
    else:
        print(f"\n❌ ERRORS ({len(errors)}):")
        _print_numbered(errors)

    if not warnings:
        print("\n✅ No warnings")
    else:
        print(f"\n⚠️ WARNINGS ({len(warnings)}):")
        _print_numbered(warnings)

    print("\n" + divider)

    # Warnings alone never make the report fail.
    return not errors
|
|
|
|
def validate_and_report(df=None):
    """
    Run validate_config and print its findings as a report

    Args:
        df: Optional DataFrame to validate against; passed straight
            through to validate_config

    Returns:
        bool: True if no errors, False otherwise
    """
    found_errors, found_warnings = validate_config(df)
    report_ok = print_validation_report(found_errors, found_warnings)
    return report_ok
|
|
|
|
# ============================================================================
# STANDALONE VALIDATION SCRIPT
# ============================================================================

if __name__ == "__main__":
    # Run configuration validation and report the outcome through the
    # process exit status: 0 when valid, 1 when errors were found.
    print("Validating configuration...")
    is_valid = validate_and_report()

    if is_valid:
        print("\n✅ Configuration is valid!")
    else:
        print("\n❌ Configuration has errors. Please fix them before running analyses.")

    # SystemExit is raised directly instead of calling the builtin exit():
    # exit() is injected by the site module for interactive use and is not
    # available when Python runs without site (e.g. `python -S`).
    raise SystemExit(0 if is_valid else 1)
|