Initial commit: sales analysis template

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
Jonathan Pressnell
2026-02-06 09:16:34 -05:00
commit cf0b596449
38 changed files with 8001 additions and 0 deletions

224
data_loader.py Normal file
View File

@@ -0,0 +1,224 @@
"""
Generic data loading utility with flexible date handling
Handles various date column formats and fallback logic
This loader is designed to work with different CSV structures by:
1. Trying primary date column first
2. Falling back to alternative date columns if needed
3. Ensuring 100% date coverage
"""
import pandas as pd
import numpy as np
from pathlib import Path
from config import (
REVENUE_COLUMN, DATE_COLUMN, DATE_FALLBACK_COLUMNS,
get_data_path
)
def load_sales_data(filepath=None):
    """
    Load sales data with flexible date handling.

    Provides intelligent data loading with fallback logic:
    1. Loads the CSV file
    2. Converts the revenue column to numeric (invalid values -> NaN)
    3. Attempts to parse dates using the primary date column
    4. Falls back to alternative date columns for rows still missing dates
    5. As a last resort, constructs Jan-1 dates from a 'Year' column
    6. Creates 'Year' and 'YearMonth' columns for analysis

    CRITICAL: Always use this function instead of pd.read_csv() directly.
    This ensures proper date parsing with fallback logic.

    Args:
        filepath: Path to the CSV file (defaults to config.get_data_path()).
            Can be str, Path, or None.

    Returns:
        pd.DataFrame: DataFrame with properly parsed dates and revenue.
            Includes 'Year' and 'YearMonth' columns. The primary date column
            (config.DATE_COLUMN) is replaced with the best-effort parsed dates.

    Raises:
        FileNotFoundError: If the data file doesn't exist. Message includes
            the file path and suggests checking config.py.
        ValueError: If the required revenue column (REVENUE_COLUMN) is
            missing. Message lists available columns.

    Example:
        >>> from data_loader import load_sales_data
        >>> from config import get_data_path
        >>> df = load_sales_data(get_data_path())
        >>> print(f"Loaded {len(df):,} rows with {df['Year'].notna().sum():,} with dates")

    See Also:
        - .cursor/rules/data_loading.md for detailed patterns
        - config.py for column name configuration
    """
    # Resolve the data file path (default comes from config)
    if filepath is None:
        filepath = get_data_path()
    else:
        filepath = Path(filepath)

    if not filepath.exists():
        raise FileNotFoundError(
            f"Data file not found: {filepath}\n"
            f"Please update config.py with the correct DATA_FILE path."
        )

    print(f"Loading data from: {filepath}")
    # low_memory=False avoids mixed-dtype chunk inference warnings on wide CSVs
    df = pd.read_csv(filepath, low_memory=False)
    print(f"Loaded {len(df):,} rows")

    # Validate required columns
    if REVENUE_COLUMN not in df.columns:
        raise ValueError(
            f"Required column '{REVENUE_COLUMN}' not found in data.\n"
            f"Available columns: {list(df.columns)}\n"
            f"Please update config.py REVENUE_COLUMN to match your data."
        )

    # Convert revenue to numeric; unparseable values become NaN
    df[REVENUE_COLUMN] = pd.to_numeric(df[REVENUE_COLUMN], errors='coerce')
    missing_revenue = df[REVENUE_COLUMN].isna().sum()
    if missing_revenue > 0:
        print(f"Warning: {missing_revenue:,} rows have missing/invalid revenue values")

    # BUGFIX: snapshot the original 'Year' column BEFORE it is overwritten
    # below with dt.year, so it can still serve as a fallback for rows whose
    # date could not be parsed. (Previously the backfill read the already
    # overwritten column, making it a no-op and losing the original data.)
    original_year = (
        pd.to_numeric(df['Year'], errors='coerce')
        if 'Year' in df.columns else None
    )

    # Working date column accumulates the best parse across all sources
    df['WorkingDate'] = pd.NaT

    # 1) Try primary date column first
    if DATE_COLUMN in df.columns:
        print(f"Attempting to parse {DATE_COLUMN}...")
        # format='mixed' (pandas >= 2.0) parses each element independently
        df['Date_Parsed'] = pd.to_datetime(df[DATE_COLUMN], errors='coerce', format='mixed')
        parsed_count = df['Date_Parsed'].notna().sum()
        df.loc[df['Date_Parsed'].notna(), 'WorkingDate'] = df.loc[df['Date_Parsed'].notna(), 'Date_Parsed']
        print(f"  Parsed {parsed_count:,} dates from {DATE_COLUMN}")
    else:
        print(f"Warning: Primary date column '{DATE_COLUMN}' not found")

    # 2) Fallback date columns, applied only to rows still missing a date
    if DATE_FALLBACK_COLUMNS:
        for fallback_col in DATE_FALLBACK_COLUMNS:
            if fallback_col in df.columns:
                missing_dates = df['WorkingDate'].isna()
                if missing_dates.sum() > 0:
                    print(f"Using fallback column: {fallback_col}...")
                    fallback_parsed = pd.to_datetime(
                        df.loc[missing_dates, fallback_col],
                        errors='coerce',
                        format='mixed'
                    )
                    newly_parsed = missing_dates & fallback_parsed.notna()
                    if newly_parsed.sum() > 0:
                        df.loc[newly_parsed, 'WorkingDate'] = fallback_parsed[newly_parsed]
                        print(f"  Parsed {newly_parsed.sum():,} additional dates from {fallback_col}")

    # 3) Final fallback: construct Jan-1 dates from the Year column
    if original_year is not None and df['WorkingDate'].isna().sum() > 0:
        valid_years = df['WorkingDate'].isna() & original_year.notna()
        if valid_years.sum() > 0:
            print(f"Using Year column for remaining {valid_years.sum():,} rows...")
            df.loc[valid_years, 'WorkingDate'] = pd.to_datetime(
                original_year[valid_years].astype(int).astype(str) + '-01-01',
                errors='coerce'
            )

    # Promote the accumulated dates to the primary date column, then clean up
    df[DATE_COLUMN] = df['WorkingDate']
    df = df.drop(columns=['Date_Parsed', 'WorkingDate'], errors='ignore')

    # Derive Year from the parsed dates; backfill from the ORIGINAL Year
    # column for any rows whose date could not be parsed at all
    df['Year'] = df[DATE_COLUMN].dt.year
    if original_year is not None:
        fill_year = df['Year'].isna() & original_year.notna()
        if fill_year.any():
            df.loc[fill_year, 'Year'] = original_year[fill_year]

    # YearMonth period column for monthly aggregation
    df['YearMonth'] = df[DATE_COLUMN].dt.to_period('M')

    # Report date coverage
    total_rows = len(df)
    date_coverage = df[DATE_COLUMN].notna().sum()
    coverage_pct = (date_coverage / total_rows * 100) if total_rows > 0 else 0
    print(f"Date coverage: {date_coverage:,} / {total_rows:,} rows ({coverage_pct:.1f}%)")
    if coverage_pct < 100:
        print(f"Warning: {total_rows - date_coverage:,} rows have missing dates")

    # Report date range (only meaningful if at least one date parsed)
    if df[DATE_COLUMN].notna().any():
        min_date = df[DATE_COLUMN].min()
        max_date = df[DATE_COLUMN].max()
        print(f"Date range: {min_date.strftime('%Y-%m-%d')} to {max_date.strftime('%Y-%m-%d')}")

    return df
def validate_data_structure(df: pd.DataFrame) -> tuple[bool, str]:
    """
    Validate that loaded data has expected structure.

    Checks for required columns, data quality, and basic validity.
    Returns actionable error messages if validation fails.

    Args:
        df: DataFrame to validate (should be result of load_sales_data())

    Returns:
        tuple[bool, str]: (is_valid, error_message)
            - is_valid: True if data structure is valid, False otherwise
            - error_message: "OK" if valid, otherwise a "; "-joined list of
              all problems found (the function collects every issue rather
              than stopping at the first one)

    Example:
        >>> df = load_sales_data(get_data_path())
        >>> is_valid, msg = validate_data_structure(df)
        >>> if not is_valid:
        ...     print(f"ERROR: {msg}")

    See Also:
        - load_sales_data() - Load data before validating
        - config_validator.py - Comprehensive configuration validation
    """
    # REVENUE_COLUMN / DATE_COLUMN come from the module-level config import
    errors = []

    # Check required columns
    if REVENUE_COLUMN not in df.columns:
        errors.append(f"Missing required column: {REVENUE_COLUMN}")
    if DATE_COLUMN not in df.columns:
        errors.append(f"Missing required column: {DATE_COLUMN}")

    # Check data quality
    if len(df) == 0:
        errors.append("DataFrame is empty")
    # BUGFIX: isna().all() and notna().sum() == 0 are the same condition,
    # so the original reported the all-NaN revenue case twice; report once.
    if REVENUE_COLUMN in df.columns and df[REVENUE_COLUMN].isna().all():
        errors.append(f"All {REVENUE_COLUMN} values are NaN")
    if DATE_COLUMN in df.columns and df[DATE_COLUMN].isna().all():
        errors.append(f"All {DATE_COLUMN} values are NaN")

    if errors:
        return False, "; ".join(errors)
    return True, "OK"