""" Generic data loading utility with flexible date handling Handles various date column formats and fallback logic This loader is designed to work with different CSV structures by: 1. Trying primary date column first 2. Falling back to alternative date columns if needed 3. Ensuring 100% date coverage """ import pandas as pd import numpy as np from pathlib import Path from config import ( REVENUE_COLUMN, DATE_COLUMN, DATE_FALLBACK_COLUMNS, get_data_path ) def load_sales_data(filepath=None): """ Load sales data with flexible date handling This function provides intelligent data loading with fallback logic: 1. Loads the CSV file 2. Converts revenue column to numeric 3. Attempts to parse dates using primary date column 4. Falls back to alternative date columns if needed (100% coverage) 5. Creates Year and YearMonth columns for analysis CRITICAL: Always use this function instead of pd.read_csv() directly. This ensures proper date parsing with fallback logic. Args: filepath: Path to the CSV file (defaults to config.DATA_FILE). Can be str, Path, or None (uses config.get_data_path()) Returns: pd.DataFrame: DataFrame with properly parsed dates and revenue. Includes 'Year' and 'YearMonth' columns. Raises: FileNotFoundError: If data file doesn't exist. Error message includes file path and suggests checking config.py ValueError: If required columns (REVENUE_COLUMN) are missing. Error message lists available columns and suggests updating config.py Example: >>> from data_loader import load_sales_data >>> from config import get_data_path >>> df = load_sales_data(get_data_path()) >>> print(f"Loaded {len(df):,} rows with {df['Year'].notna().sum():,} with dates") See Also: - .cursor/rules/data_loading.md for detailed patterns - config.py for column name configuration """ # Get data file path if filepath is None: filepath = get_data_path() else: filepath = Path(filepath) # Check if file exists if not filepath.exists(): raise FileNotFoundError( f"Data file not found: {filepath}\n" f"Please update config.py with the correct DATA_FILE path." ) # Load CSV print(f"Loading data from: {filepath}") df = pd.read_csv(filepath, low_memory=False) print(f"Loaded {len(df):,} rows") # Validate required columns if REVENUE_COLUMN not in df.columns: raise ValueError( f"Required column '{REVENUE_COLUMN}' not found in data.\n" f"Available columns: {list(df.columns)}\n" f"Please update config.py REVENUE_COLUMN to match your data." ) # Convert revenue column to numeric df[REVENUE_COLUMN] = pd.to_numeric(df[REVENUE_COLUMN], errors='coerce') # Count missing revenue values missing_revenue = df[REVENUE_COLUMN].isna().sum() if missing_revenue > 0: print(f"Warning: {missing_revenue:,} rows have missing/invalid revenue values") # Create working date column df['WorkingDate'] = pd.NaT # Try primary date column first if DATE_COLUMN in df.columns: print(f"Attempting to parse {DATE_COLUMN}...") df['Date_Parsed'] = pd.to_datetime(df[DATE_COLUMN], errors='coerce', format='mixed') parsed_count = df['Date_Parsed'].notna().sum() df.loc[df['Date_Parsed'].notna(), 'WorkingDate'] = df.loc[df['Date_Parsed'].notna(), 'Date_Parsed'] print(f" Parsed {parsed_count:,} dates from {DATE_COLUMN}") else: print(f"Warning: Primary date column '{DATE_COLUMN}' not found") # Use fallback date columns if DATE_FALLBACK_COLUMNS: for fallback_col in DATE_FALLBACK_COLUMNS: if fallback_col in df.columns: missing_dates = df['WorkingDate'].isna() if missing_dates.sum() > 0: print(f"Using fallback column: {fallback_col}...") fallback_parsed = pd.to_datetime( df.loc[missing_dates, fallback_col], errors='coerce', format='mixed' ) newly_parsed = missing_dates & fallback_parsed.notna() if newly_parsed.sum() > 0: df.loc[newly_parsed, 'WorkingDate'] = fallback_parsed[newly_parsed] print(f" Parsed {newly_parsed.sum():,} additional dates from {fallback_col}") # Final fallback: try to construct from Year column if available if 'Year' in df.columns and df['WorkingDate'].isna().sum() > 0: missing_dates = df['WorkingDate'].isna() year_values = pd.to_numeric(df.loc[missing_dates, 'Year'], errors='coerce') valid_years = missing_dates & year_values.notna() if valid_years.sum() > 0: print(f"Using Year column for remaining {valid_years.sum():,} rows...") df.loc[valid_years, 'WorkingDate'] = pd.to_datetime( df.loc[valid_years, 'Year'].astype(int).astype(str) + '-01-01', errors='coerce' ) # Set WorkingDate as the primary date column df[DATE_COLUMN] = df['WorkingDate'] # Clean up temporary columns df = df.drop(columns=['Date_Parsed', 'WorkingDate'], errors='ignore') # Extract Year from date column df['Year'] = df[DATE_COLUMN].dt.year # Fill missing Year from Year column if it exists and date is missing if 'Year' in df.columns: year_orig = pd.to_numeric(df['Year'], errors='coerce') missing_year = df['Year'].isna() if missing_year.sum() > 0 and 'Year' in df.columns: year_fallback = pd.to_numeric(df.loc[missing_year, 'Year'], errors='coerce') df.loc[missing_year & year_fallback.notna(), 'Year'] = year_fallback[missing_year & year_fallback.notna()] # Create YearMonth for monthly analysis if DATE_COLUMN in df.columns: df['YearMonth'] = df[DATE_COLUMN].dt.to_period('M') # Report date coverage total_rows = len(df) date_coverage = df[DATE_COLUMN].notna().sum() coverage_pct = (date_coverage / total_rows * 100) if total_rows > 0 else 0 print(f"Date coverage: {date_coverage:,} / {total_rows:,} rows ({coverage_pct:.1f}%)") if coverage_pct < 100: print(f"Warning: {total_rows - date_coverage:,} rows have missing dates") # Report date range if df[DATE_COLUMN].notna().any(): min_date = df[DATE_COLUMN].min() max_date = df[DATE_COLUMN].max() print(f"Date range: {min_date.strftime('%Y-%m-%d')} to {max_date.strftime('%Y-%m-%d')}") return df def validate_data_structure(df: pd.DataFrame) -> tuple[bool, str]: """ Validate that loaded data has expected structure. Checks for required columns, data quality, and basic validity. Returns actionable error messages if validation fails. Args: df: DataFrame to validate (should be result of load_sales_data()) Returns: tuple[bool, str]: (is_valid, error_message) - is_valid: True if data structure is valid, False otherwise - error_message: "OK" if valid, otherwise descriptive error message Example: >>> df = load_sales_data(get_data_path()) >>> is_valid, msg = validate_data_structure(df) >>> if not is_valid: ... print(f"ERROR: {msg}") See Also: - load_sales_data() - Load data before validating - config_validator.py - Comprehensive configuration validation """ from config import REVENUE_COLUMN, DATE_COLUMN errors = [] # Check required columns if REVENUE_COLUMN not in df.columns: errors.append(f"Missing required column: {REVENUE_COLUMN}") if DATE_COLUMN not in df.columns: errors.append(f"Missing required column: {DATE_COLUMN}") # Check data quality if len(df) == 0: errors.append("DataFrame is empty") if REVENUE_COLUMN in df.columns: if df[REVENUE_COLUMN].isna().all(): errors.append(f"All {REVENUE_COLUMN} values are NaN") if df[REVENUE_COLUMN].notna().sum() == 0: errors.append(f"No valid {REVENUE_COLUMN} values") if DATE_COLUMN in df.columns: if df[DATE_COLUMN].isna().all(): errors.append(f"All {DATE_COLUMN} values are NaN") if errors: return False, "; ".join(errors) return True, "OK"