Initial commit: sales analysis template
Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
224
data_loader.py
Normal file
224
data_loader.py
Normal file
@@ -0,0 +1,224 @@
|
||||
"""
|
||||
Generic data loading utility with flexible date handling
|
||||
Handles various date column formats and fallback logic
|
||||
|
||||
This loader is designed to work with different CSV structures by:
|
||||
1. Trying primary date column first
|
||||
2. Falling back to alternative date columns if needed
|
||||
3. Ensuring 100% date coverage
|
||||
"""
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from pathlib import Path
|
||||
from config import (
|
||||
REVENUE_COLUMN, DATE_COLUMN, DATE_FALLBACK_COLUMNS,
|
||||
get_data_path
|
||||
)
|
||||
|
||||
def load_sales_data(filepath=None):
    """
    Load sales data with flexible date handling.

    Loading pipeline:
        1. Loads the CSV file
        2. Converts the revenue column to numeric
        3. Parses dates using the primary date column
        4. Falls back to alternative date columns for rows still missing dates
        5. Constructs Jan-1 dates from a raw 'Year' column as a last resort
        6. Creates 'Year' and 'YearMonth' columns for analysis

    CRITICAL: Always use this function instead of pd.read_csv() directly.
    This ensures proper date parsing with fallback logic.

    Args:
        filepath: Path to the CSV file (defaults to config.DATA_FILE).
            Can be str, Path, or None (uses config.get_data_path())

    Returns:
        pd.DataFrame: DataFrame with properly parsed dates and revenue.
            Includes 'Year' and 'YearMonth' columns.

    Raises:
        FileNotFoundError: If data file doesn't exist.
            Error message includes file path and suggests checking config.py
        ValueError: If required columns (REVENUE_COLUMN) are missing.
            Error message lists available columns and suggests updating config.py

    Example:
        >>> from data_loader import load_sales_data
        >>> from config import get_data_path
        >>> df = load_sales_data(get_data_path())
        >>> print(f"Loaded {len(df):,} rows with {df['Year'].notna().sum():,} with dates")

    See Also:
        - .cursor/rules/data_loading.md for detailed patterns
        - config.py for column name configuration
    """
    # Resolve the data file path (config default vs. caller-supplied)
    if filepath is None:
        filepath = get_data_path()
    else:
        filepath = Path(filepath)

    if not filepath.exists():
        raise FileNotFoundError(
            f"Data file not found: {filepath}\n"
            f"Please update config.py with the correct DATA_FILE path."
        )

    # low_memory=False avoids chunked mixed-dtype inference on wide CSVs
    print(f"Loading data from: {filepath}")
    df = pd.read_csv(filepath, low_memory=False)
    print(f"Loaded {len(df):,} rows")

    # Validate required columns with an actionable message
    if REVENUE_COLUMN not in df.columns:
        raise ValueError(
            f"Required column '{REVENUE_COLUMN}' not found in data.\n"
            f"Available columns: {list(df.columns)}\n"
            f"Please update config.py REVENUE_COLUMN to match your data."
        )

    # Coerce revenue to numeric; unparseable values become NaN
    df[REVENUE_COLUMN] = pd.to_numeric(df[REVENUE_COLUMN], errors='coerce')

    missing_revenue = df[REVENUE_COLUMN].isna().sum()
    if missing_revenue > 0:
        print(f"Warning: {missing_revenue:,} rows have missing/invalid revenue values")

    # WorkingDate accumulates the best available date for each row
    df['WorkingDate'] = pd.NaT

    # Step 1: primary date column
    if DATE_COLUMN in df.columns:
        print(f"Attempting to parse {DATE_COLUMN}...")
        # Local Series instead of a throwaway 'Date_Parsed' column
        primary_parsed = pd.to_datetime(df[DATE_COLUMN], errors='coerce', format='mixed')
        parsed_mask = primary_parsed.notna()
        df.loc[parsed_mask, 'WorkingDate'] = primary_parsed[parsed_mask]
        print(f" Parsed {parsed_mask.sum():,} dates from {DATE_COLUMN}")
    else:
        print(f"Warning: Primary date column '{DATE_COLUMN}' not found")

    # Step 2: fallback columns fill only rows still missing a date
    if DATE_FALLBACK_COLUMNS:
        for fallback_col in DATE_FALLBACK_COLUMNS:
            if fallback_col in df.columns:
                missing_dates = df['WorkingDate'].isna()
                if missing_dates.sum() > 0:
                    print(f"Using fallback column: {fallback_col}...")
                    fallback_parsed = pd.to_datetime(
                        df.loc[missing_dates, fallback_col],
                        errors='coerce',
                        format='mixed'
                    )
                    # Boolean alignment: rows absent from fallback_parsed count as False
                    newly_parsed = missing_dates & fallback_parsed.notna()
                    if newly_parsed.sum() > 0:
                        df.loc[newly_parsed, 'WorkingDate'] = fallback_parsed[newly_parsed]
                        print(f" Parsed {newly_parsed.sum():,} additional dates from {fallback_col}")

    # Step 3: last resort — build Jan-1 dates from a raw 'Year' column
    if 'Year' in df.columns and df['WorkingDate'].isna().sum() > 0:
        missing_dates = df['WorkingDate'].isna()
        year_values = pd.to_numeric(df.loc[missing_dates, 'Year'], errors='coerce')
        valid_years = missing_dates & year_values.notna()
        if valid_years.sum() > 0:
            print(f"Using Year column for remaining {valid_years.sum():,} rows...")
            df.loc[valid_years, 'WorkingDate'] = pd.to_datetime(
                df.loc[valid_years, 'Year'].astype(int).astype(str) + '-01-01',
                errors='coerce'
            )

    # BUG FIX: snapshot the raw 'Year' column BEFORE it is overwritten below.
    # The previous code tried to backfill 'Year' AFTER replacing it with
    # dt.year, so its "fallback" only ever read the NaNs it was trying to fill.
    original_year = (
        pd.to_numeric(df['Year'], errors='coerce') if 'Year' in df.columns else None
    )

    # Promote WorkingDate to the canonical date column, then drop the temp
    df[DATE_COLUMN] = df['WorkingDate']
    df = df.drop(columns=['WorkingDate'], errors='ignore')

    # Derive Year from the parsed dates (NaN where the date is missing)
    df['Year'] = df[DATE_COLUMN].dt.year

    # Backfill Year from the preserved raw column where dates were missing
    if original_year is not None:
        fillable = df['Year'].isna() & original_year.notna()
        if fillable.sum() > 0:
            df.loc[fillable, 'Year'] = original_year[fillable]

    # Monthly period column for time-series aggregation
    df['YearMonth'] = df[DATE_COLUMN].dt.to_period('M')

    # Report date coverage
    total_rows = len(df)
    date_coverage = df[DATE_COLUMN].notna().sum()
    coverage_pct = (date_coverage / total_rows * 100) if total_rows > 0 else 0
    print(f"Date coverage: {date_coverage:,} / {total_rows:,} rows ({coverage_pct:.1f}%)")

    if coverage_pct < 100:
        print(f"Warning: {total_rows - date_coverage:,} rows have missing dates")

    # Report date range when at least one date parsed
    if df[DATE_COLUMN].notna().any():
        min_date = df[DATE_COLUMN].min()
        max_date = df[DATE_COLUMN].max()
        print(f"Date range: {min_date.strftime('%Y-%m-%d')} to {max_date.strftime('%Y-%m-%d')}")

    return df
|
||||
|
||||
def validate_data_structure(df: pd.DataFrame) -> tuple[bool, str]:
    """
    Validate that loaded data has expected structure.

    Checks for required columns, data quality, and basic validity.
    Returns actionable error messages if validation fails.

    Args:
        df: DataFrame to validate (should be result of load_sales_data())

    Returns:
        tuple[bool, str]: (is_valid, error_message)
            - is_valid: True if data structure is valid, False otherwise
            - error_message: "OK" if valid, otherwise all failures joined
              with "; "

    Example:
        >>> df = load_sales_data(get_data_path())
        >>> is_valid, msg = validate_data_structure(df)
        >>> if not is_valid:
        ...     print(f"ERROR: {msg}")

    See Also:
        - load_sales_data() - Load data before validating
        - config_validator.py - Comprehensive configuration validation
    """
    # REVENUE_COLUMN / DATE_COLUMN come from the module-level config import;
    # the previous function-local re-import was redundant.
    errors = []

    # Check required columns
    if REVENUE_COLUMN not in df.columns:
        errors.append(f"Missing required column: {REVENUE_COLUMN}")

    if DATE_COLUMN not in df.columns:
        errors.append(f"Missing required column: {DATE_COLUMN}")

    # Check data quality
    if len(df) == 0:
        errors.append("DataFrame is empty")

    # FIX: the previous version tested the same condition twice
    # (isna().all() and notna().sum() == 0), emitting duplicate errors
    # for an all-NaN revenue column.
    if REVENUE_COLUMN in df.columns and df[REVENUE_COLUMN].isna().all():
        errors.append(f"All {REVENUE_COLUMN} values are NaN")

    if DATE_COLUMN in df.columns and df[DATE_COLUMN].isna().all():
        errors.append(f"All {DATE_COLUMN} values are NaN")

    if errors:
        return False, "; ".join(errors)

    return True, "OK"
|
||||
Reference in New Issue
Block a user