Files
sales-data-analysis/data_loader.py
Jonathan Pressnell cf0b596449 Initial commit: sales analysis template
Co-authored-by: Cursor <cursoragent@cursor.com>
2026-02-06 09:16:34 -05:00

225 lines
8.5 KiB
Python

"""
Generic data loading utility with flexible date handling
Handles various date column formats and fallback logic
This loader is designed to work with different CSV structures by:
1. Trying primary date column first
2. Falling back to alternative date columns if needed
3. Ensuring 100% date coverage
"""
import pandas as pd
import numpy as np
from pathlib import Path
from config import (
REVENUE_COLUMN, DATE_COLUMN, DATE_FALLBACK_COLUMNS,
get_data_path
)
def load_sales_data(filepath=None):
    """
    Load sales data with flexible date handling.

    Loading pipeline:
      1. Load the CSV file.
      2. Convert the revenue column to numeric (invalid values -> NaN).
      3. Parse dates from the primary date column (DATE_COLUMN).
      4. Fall back to DATE_FALLBACK_COLUMNS for rows still missing a date.
      5. As a last resort, build a Jan-1 date from a raw 'Year' column.
      6. Derive 'Year' and 'YearMonth' columns for analysis.

    CRITICAL: Always use this function instead of pd.read_csv() directly.
    This ensures proper date parsing with fallback logic.

    Args:
        filepath: Path to the CSV file (defaults to config.DATA_FILE).
            Can be str, Path, or None (uses config.get_data_path())

    Returns:
        pd.DataFrame: DataFrame with properly parsed dates and revenue.
            Includes 'Year' and 'YearMonth' columns.

    Raises:
        FileNotFoundError: If data file doesn't exist.
            Error message includes file path and suggests checking config.py
        ValueError: If required columns (REVENUE_COLUMN) are missing.
            Error message lists available columns and suggests updating config.py

    Example:
        >>> from data_loader import load_sales_data
        >>> from config import get_data_path
        >>> df = load_sales_data(get_data_path())
        >>> print(f"Loaded {len(df):,} rows with {df['Year'].notna().sum():,} with dates")

    See Also:
        - .cursor/rules/data_loading.md for detailed patterns
        - config.py for column name configuration
    """
    # Resolve the data file path (config default vs. caller-supplied).
    filepath = get_data_path() if filepath is None else Path(filepath)
    if not filepath.exists():
        raise FileNotFoundError(
            f"Data file not found: {filepath}\n"
            f"Please update config.py with the correct DATA_FILE path."
        )

    print(f"Loading data from: {filepath}")
    df = pd.read_csv(filepath, low_memory=False)
    print(f"Loaded {len(df):,} rows")

    # Validate required columns before doing any work.
    if REVENUE_COLUMN not in df.columns:
        raise ValueError(
            f"Required column '{REVENUE_COLUMN}' not found in data.\n"
            f"Available columns: {list(df.columns)}\n"
            f"Please update config.py REVENUE_COLUMN to match your data."
        )

    # Revenue -> numeric; unparseable values become NaN rather than raising.
    df[REVENUE_COLUMN] = pd.to_numeric(df[REVENUE_COLUMN], errors='coerce')
    missing_revenue = df[REVENUE_COLUMN].isna().sum()
    if missing_revenue > 0:
        print(f"Warning: {missing_revenue:,} rows have missing/invalid revenue values")

    # BUGFIX: snapshot any raw 'Year' column BEFORE it gets overwritten by
    # df[DATE_COLUMN].dt.year below. The previous implementation clobbered
    # 'Year' first and then tried to back-fill missing years from the very
    # column it had just overwritten, making that fallback a no-op.
    raw_year = (
        pd.to_numeric(df['Year'], errors='coerce') if 'Year' in df.columns else None
    )

    # Working date column accumulates the best available date per row.
    df['WorkingDate'] = pd.NaT

    # Pass 1: primary date column.
    if DATE_COLUMN in df.columns:
        print(f"Attempting to parse {DATE_COLUMN}...")
        parsed = pd.to_datetime(df[DATE_COLUMN], errors='coerce', format='mixed')
        ok = parsed.notna()
        df.loc[ok, 'WorkingDate'] = parsed[ok]
        print(f" Parsed {ok.sum():,} dates from {DATE_COLUMN}")
    else:
        print(f"Warning: Primary date column '{DATE_COLUMN}' not found")

    # Pass 2: fallback date columns, applied only to rows still missing a date.
    for fallback_col in (DATE_FALLBACK_COLUMNS or []):
        if fallback_col not in df.columns:
            continue
        missing_dates = df['WorkingDate'].isna()
        if not missing_dates.any():
            break  # full coverage already achieved; remaining fallbacks unused
        print(f"Using fallback column: {fallback_col}...")
        fallback_parsed = pd.to_datetime(
            df.loc[missing_dates, fallback_col],
            errors='coerce',
            format='mixed'
        )
        # Index-aligned assignment: only rows where the fallback parsed.
        new_idx = fallback_parsed.index[fallback_parsed.notna()]
        if len(new_idx) > 0:
            df.loc[new_idx, 'WorkingDate'] = fallback_parsed.loc[new_idx]
            print(f" Parsed {len(new_idx):,} additional dates from {fallback_col}")

    # Pass 3: final fallback — construct Jan-1 dates from the raw Year values.
    if raw_year is not None and df['WorkingDate'].isna().any():
        valid_years = df['WorkingDate'].isna() & raw_year.notna()
        if valid_years.any():
            print(f"Using Year column for remaining {valid_years.sum():,} rows...")
            df.loc[valid_years, 'WorkingDate'] = pd.to_datetime(
                raw_year[valid_years].astype(int).astype(str) + '-01-01',
                errors='coerce'
            )

    # Promote WorkingDate to the canonical date column; drop temporaries.
    df[DATE_COLUMN] = df['WorkingDate']
    df = df.drop(columns=['Date_Parsed', 'WorkingDate'], errors='ignore')

    # Derive Year from parsed dates, then back-fill from the raw snapshot
    # for rows whose date could not be parsed at all.
    df['Year'] = df[DATE_COLUMN].dt.year
    if raw_year is not None:
        fill = df['Year'].isna() & raw_year.notna()
        df.loc[fill, 'Year'] = raw_year[fill]

    # Monthly period column for time-series grouping (DATE_COLUMN always
    # exists at this point — it was assigned from WorkingDate above).
    df['YearMonth'] = df[DATE_COLUMN].dt.to_period('M')

    # Coverage reporting.
    total_rows = len(df)
    date_coverage = df[DATE_COLUMN].notna().sum()
    coverage_pct = (date_coverage / total_rows * 100) if total_rows > 0 else 0
    print(f"Date coverage: {date_coverage:,} / {total_rows:,} rows ({coverage_pct:.1f}%)")
    if coverage_pct < 100:
        print(f"Warning: {total_rows - date_coverage:,} rows have missing dates")

    # Report date range when at least one date parsed.
    if df[DATE_COLUMN].notna().any():
        min_date = df[DATE_COLUMN].min()
        max_date = df[DATE_COLUMN].max()
        print(f"Date range: {min_date.strftime('%Y-%m-%d')} to {max_date.strftime('%Y-%m-%d')}")

    return df
def validate_data_structure(df: pd.DataFrame) -> tuple[bool, str]:
    """
    Validate that loaded data has expected structure.

    Checks that required columns exist, that the frame is non-empty, and
    that the revenue/date columns contain at least one usable value.
    Returns actionable error messages if validation fails.

    Args:
        df: DataFrame to validate (should be result of load_sales_data())

    Returns:
        tuple[bool, str]: (is_valid, error_message)
            - is_valid: True if data structure is valid, False otherwise
            - error_message: "OK" if valid, otherwise "; "-joined errors

    Example:
        >>> df = load_sales_data(get_data_path())
        >>> is_valid, msg = validate_data_structure(df)
        >>> if not is_valid:
        ...     print(f"ERROR: {msg}")

    See Also:
        - load_sales_data() - Load data before validating
        - config_validator.py - Comprehensive configuration validation
    """
    # REVENUE_COLUMN / DATE_COLUMN come from the module-level config import;
    # the redundant function-local import was removed.
    errors = []

    # Required columns.
    if REVENUE_COLUMN not in df.columns:
        errors.append(f"Missing required column: {REVENUE_COLUMN}")
    if DATE_COLUMN not in df.columns:
        errors.append(f"Missing required column: {DATE_COLUMN}")

    # Basic data quality.
    if len(df) == 0:
        errors.append("DataFrame is empty")

    # BUGFIX: the original ran two equivalent revenue checks
    # (isna().all() and notna().sum() == 0), reporting the same
    # problem twice; a single check suffices.
    if REVENUE_COLUMN in df.columns and df[REVENUE_COLUMN].isna().all():
        errors.append(f"All {REVENUE_COLUMN} values are NaN")
    if DATE_COLUMN in df.columns and df[DATE_COLUMN].isna().all():
        errors.append(f"All {DATE_COLUMN} values are NaN")

    if errors:
        return False, "; ".join(errors)
    return True, "OK"