Initial commit: sales analysis template
Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
214
config_validator.py
Normal file
214
config_validator.py
Normal file
@@ -0,0 +1,214 @@
|
||||
"""
|
||||
Configuration validation utility
Validates configuration settings against data to catch errors early

Usage:
    from config_validator import validate_config

    # Validate configuration
    errors, warnings = validate_config(df)
    if errors:
        print("Configuration errors found:", errors)
|
||||
"""
|
||||
import pandas as pd
|
||||
from pathlib import Path
|
||||
from config import (
|
||||
DATA_FILE, REVENUE_COLUMN, DATE_COLUMN, DATE_FALLBACK_COLUMNS,
|
||||
CUSTOMER_COLUMN, ITEM_COLUMN, QUANTITY_COLUMN,
|
||||
MIN_YEAR, MAX_DATE, ANALYSIS_YEARS,
|
||||
LTM_ENABLED, LTM_START, LTM_END, LTM_START_YEAR, LTM_END_YEAR,
|
||||
EXCLUSION_FILTERS, get_data_path
|
||||
)
|
||||
|
||||
def validate_config(df=None):
    """
    Validate configuration against data

    Runs a series of independent checks comparing the settings imported from
    ``config`` with the contents of ``df`` and collects every problem found
    instead of stopping at the first one.

    Checks performed, in order:
        1. Required columns (revenue, date) exist
        2. Date column coverage (share of non-null values)
        3. Fallback date columns exist
        4. Revenue column contains numeric values
        5. MIN_YEAR / MAX_DATE versus the date range present in the data
        6. ANALYSIS_YEARS versus the 'Year' column
        7. LTM settings are complete, ordered, and present in 'YearMonth'
        8. Exclusion filter column and values exist
        9. Optional columns (customer, item, quantity) exist
        10. Data file exists on disk

    Args:
        df: Optional DataFrame to validate against. If None, attempts to load data.

    Returns:
        tuple: (errors list, warnings list). If the data cannot be loaded,
        the function returns early with a single error and no further checks.

    Example:
        errors, warnings = validate_config(df)
        if errors:
            for error in errors:
                print(f"ERROR: {error}")
        if warnings:
            for warning in warnings:
                print(f"WARNING: {warning}")
    """
    errors = []
    warnings = []

    # Load data if not provided
    if df is None:
        try:
            # Imported lazily so the loader is only needed when no DataFrame
            # is supplied by the caller.
            from data_loader import load_sales_data
            data_path = get_data_path()
            if not data_path.exists():
                errors.append(f"Data file not found: {data_path}")
                return errors, warnings
            df = load_sales_data(data_path)
        except Exception as e:
            # Best-effort boundary: any loader failure becomes a reported error.
            errors.append(f"Could not load data for validation: {e}")
            return errors, warnings

    total_rows = len(df)

    # 1. Validate required columns exist
    required_columns = [REVENUE_COLUMN, DATE_COLUMN]
    for col in required_columns:
        if col not in df.columns:
            errors.append(f"Required column '{col}' not found in data. Available columns: {list(df.columns)[:10]}")

    # 2. Validate date column has valid dates
    if DATE_COLUMN in df.columns:
        # Guard against an empty frame: 0 / 0 would yield NaN and silently
        # skip both threshold comparisons below.
        if total_rows:
            date_coverage = df[DATE_COLUMN].notna().sum() / total_rows * 100
        else:
            date_coverage = 0.0
        if date_coverage < 50:
            errors.append(f"Date coverage is very low ({date_coverage:.1f}%). Check date column configuration.")
        elif date_coverage < 90:
            warnings.append(f"Date coverage is {date_coverage:.1f}%. Consider adding fallback date columns.")

    # 3. Validate fallback date columns
    if DATE_FALLBACK_COLUMNS:
        missing_fallbacks = [col for col in DATE_FALLBACK_COLUMNS if col not in df.columns]
        if missing_fallbacks:
            warnings.append(f"Fallback date columns not found: {missing_fallbacks}")

    # 4. Validate revenue column is numeric
    if REVENUE_COLUMN in df.columns:
        try:
            numeric_revenue = pd.to_numeric(df[REVENUE_COLUMN], errors='coerce')
        except Exception:
            errors.append(f"Revenue column '{REVENUE_COLUMN}' cannot be converted to numeric")
        else:
            # Count values that survived coercion; counting non-nulls on the
            # raw column would treat non-numeric text as valid revenue.
            valid_revenue = int(numeric_revenue.notna().sum())
            if valid_revenue == 0:
                errors.append(f"Revenue column '{REVENUE_COLUMN}' has no valid numeric values")
            elif valid_revenue < total_rows * 0.9:
                warnings.append(f"Revenue column has {total_rows - valid_revenue} invalid values")

    # 5. Validate date range
    if DATE_COLUMN in df.columns:
        dates = df[DATE_COLUMN]
        if not pd.api.types.is_datetime64_any_dtype(dates):
            # The comparisons below need real timestamps (.year / .date());
            # coerce so a non-datetime column cannot raise here.
            try:
                dates = pd.to_datetime(dates, errors='coerce')
            except (TypeError, ValueError):
                warnings.append(f"Date column '{DATE_COLUMN}' could not be parsed as dates; date range checks skipped")
                dates = None

        if dates is not None and dates.notna().any():
            min_date_in_data = dates.min()
            max_date_in_data = dates.max()

            if MIN_YEAR and min_date_in_data.year > MIN_YEAR:
                warnings.append(f"MIN_YEAR ({MIN_YEAR}) is earlier than earliest data ({min_date_in_data.year})")

            if MAX_DATE and max_date_in_data > MAX_DATE:
                warnings.append(f"MAX_DATE ({MAX_DATE.date()}) is earlier than latest data ({max_date_in_data.date()})")

    # 6. Validate analysis years
    if 'Year' in df.columns:
        # A set gives O(1) membership and avoids sorting mixed/NaN values.
        available_years = set(df['Year'].dropna().unique())
        missing_years = [year for year in (ANALYSIS_YEARS or []) if year not in available_years]
        if missing_years:
            warnings.append(f"ANALYSIS_YEARS includes years not in data: {missing_years}")

    # 7. Validate LTM configuration
    if LTM_ENABLED:
        if LTM_START is None or LTM_END is None:
            errors.append("LTM_ENABLED is True but LTM_START or LTM_END is None")
        else:
            if LTM_START > LTM_END:
                errors.append(f"LTM_START ({LTM_START}) is after LTM_END ({LTM_END})")

            if 'YearMonth' in df.columns:
                available_periods = set(df['YearMonth'].dropna().unique())
                if LTM_START not in available_periods:
                    warnings.append(f"LTM_START ({LTM_START}) not found in data")
                if LTM_END not in available_periods:
                    warnings.append(f"LTM_END ({LTM_END}) not found in data")

    # 8. Validate exclusion filters
    exclusion_filters = EXCLUSION_FILTERS or {}
    if exclusion_filters.get('enabled', False):
        exclude_col = exclusion_filters.get('exclude_by_column')
        if exclude_col:
            if exclude_col not in df.columns:
                errors.append(f"Exclusion filter column '{exclude_col}' not found in data")
            else:
                exclude_values = exclusion_filters.get('exclude_values', [])
                if exclude_values:
                    available_values = set(df[exclude_col].dropna().unique())
                    invalid_values = [v for v in exclude_values if v not in available_values]
                    if invalid_values:
                        warnings.append(f"Exclusion filter values not found in data: {invalid_values}")

    # 9. Validate optional columns (warnings only)
    optional_columns = {
        'Customer': CUSTOMER_COLUMN,
        'Item': ITEM_COLUMN,
        'Quantity': QUANTITY_COLUMN,
    }

    for col_type, col_name in optional_columns.items():
        if col_name and col_name not in df.columns:
            warnings.append(f"Optional {col_type} column '{col_name}' not found. Some analyses may not work.")

    # 10. Validate data file exists
    # Checked even when a DataFrame was passed in, since later analyses may
    # still read from the configured path.
    data_path = get_data_path()
    if not data_path.exists():
        errors.append(f"Data file not found: {data_path}")

    return errors, warnings
|
||||
|
||||
def print_validation_report(errors, warnings):
    """
    Print a formatted validation report

    Writes a banner-framed report to stdout: a numbered list of errors (or a
    "no errors" line), followed by a numbered list of warnings (or a
    "no warnings" line).

    Args:
        errors: List of error messages
        warnings: List of warning messages

    Returns:
        bool: True when ``errors`` is empty, False otherwise. Warnings do not
        affect the result.
    """
    banner = "=" * 60

    print("\n" + banner)
    print("Configuration Validation Report")
    print(banner)

    # Errors section
    if not errors:
        print("\n✅ No configuration errors found")
    else:
        print(f"\n❌ ERRORS ({len(errors)}):")
        for number, message in enumerate(errors, start=1):
            print(f" {number}. {message}")

    # Warnings section
    if not warnings:
        print("\n✅ No warnings")
    else:
        print(f"\n⚠️ WARNINGS ({len(warnings)}):")
        for number, message in enumerate(warnings, start=1):
            print(f" {number}. {message}")

    print("\n" + banner)

    # Only errors decide validity.
    return not errors
|
||||
|
||||
def validate_and_report(df=None):
    """
    Validate configuration and print report

    Convenience wrapper: runs ``validate_config`` and hands its results to
    ``print_validation_report``.

    Args:
        df: Optional DataFrame to validate against

    Returns:
        bool: True if no errors, False otherwise
    """
    found_errors, found_warnings = validate_config(df)
    passed = print_validation_report(found_errors, found_warnings)
    return passed
|
||||
|
||||
# ============================================================================
|
||||
# STANDALONE VALIDATION SCRIPT
|
||||
# ============================================================================
|
||||
|
||||
if __name__ == "__main__":
    # Run configuration validation as a standalone script. The process exit
    # status is 0 when the configuration has no errors and 1 otherwise.
    print("Validating configuration...")
    is_valid = validate_and_report()

    # Raise SystemExit directly rather than calling the exit() helper, which
    # is injected by the site module and is not guaranteed to exist
    # (e.g. under `python -S` or in frozen builds).
    if is_valid:
        print("\n✅ Configuration is valid!")
        raise SystemExit(0)
    else:
        print("\n❌ Configuration has errors. Please fix them before running analyses.")
        raise SystemExit(1)
|
||||
Reference in New Issue
Block a user