Initial commit: sales analysis template
Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
224
data_loader.py
Normal file
224
data_loader.py
Normal file
@@ -0,0 +1,224 @@
|
||||
"""
|
||||
Generic data loading utility with flexible date handling
|
||||
Handles various date column formats and fallback logic
|
||||
|
||||
This loader is designed to work with different CSV structures by:
|
||||
1. Trying primary date column first
|
||||
2. Falling back to alternative date columns if needed
|
||||
3. Ensuring 100% date coverage
|
||||
"""
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from pathlib import Path
|
||||
from config import (
|
||||
REVENUE_COLUMN, DATE_COLUMN, DATE_FALLBACK_COLUMNS,
|
||||
get_data_path
|
||||
)
|
||||
|
||||
def load_sales_data(filepath=None):
    """
    Load sales data with flexible date handling.

    Loading pipeline:
        1. Loads the CSV file
        2. Converts the revenue column to numeric
        3. Parses dates using the primary date column
        4. Falls back to alternative date columns for rows still missing dates
        5. Constructs Jan-1 dates from a raw 'Year' column as a last resort
        6. Creates 'Year' and 'YearMonth' columns for analysis

    CRITICAL: Always use this function instead of pd.read_csv() directly.
    This ensures proper date parsing with fallback logic.

    Args:
        filepath: Path to the CSV file (defaults to config.DATA_FILE).
            Can be str, Path, or None (uses config.get_data_path())

    Returns:
        pd.DataFrame: DataFrame with properly parsed dates and revenue.
            Includes 'Year' and 'YearMonth' columns.

    Raises:
        FileNotFoundError: If data file doesn't exist.
            Error message includes file path and suggests checking config.py
        ValueError: If required columns (REVENUE_COLUMN) are missing.
            Error message lists available columns and suggests updating config.py

    Example:
        >>> from data_loader import load_sales_data
        >>> from config import get_data_path
        >>> df = load_sales_data(get_data_path())
        >>> print(f"Loaded {len(df):,} rows with {df['Year'].notna().sum():,} with dates")

    See Also:
        - .cursor/rules/data_loading.md for detailed patterns
        - config.py for column name configuration
    """
    # Resolve the data file path (config default vs. caller-supplied)
    if filepath is None:
        filepath = get_data_path()
    else:
        filepath = Path(filepath)

    if not filepath.exists():
        raise FileNotFoundError(
            f"Data file not found: {filepath}\n"
            f"Please update config.py with the correct DATA_FILE path."
        )

    # low_memory=False avoids chunked mixed-dtype inference on wide CSVs
    print(f"Loading data from: {filepath}")
    df = pd.read_csv(filepath, low_memory=False)
    print(f"Loaded {len(df):,} rows")

    # Validate required columns with an actionable message
    if REVENUE_COLUMN not in df.columns:
        raise ValueError(
            f"Required column '{REVENUE_COLUMN}' not found in data.\n"
            f"Available columns: {list(df.columns)}\n"
            f"Please update config.py REVENUE_COLUMN to match your data."
        )

    # Coerce revenue to numeric; unparseable values become NaN
    df[REVENUE_COLUMN] = pd.to_numeric(df[REVENUE_COLUMN], errors='coerce')

    missing_revenue = df[REVENUE_COLUMN].isna().sum()
    if missing_revenue > 0:
        print(f"Warning: {missing_revenue:,} rows have missing/invalid revenue values")

    # WorkingDate accumulates the best available date for each row
    df['WorkingDate'] = pd.NaT

    # Step 1: primary date column
    if DATE_COLUMN in df.columns:
        print(f"Attempting to parse {DATE_COLUMN}...")
        # Local Series instead of a throwaway 'Date_Parsed' column
        primary_parsed = pd.to_datetime(df[DATE_COLUMN], errors='coerce', format='mixed')
        parsed_mask = primary_parsed.notna()
        df.loc[parsed_mask, 'WorkingDate'] = primary_parsed[parsed_mask]
        print(f" Parsed {parsed_mask.sum():,} dates from {DATE_COLUMN}")
    else:
        print(f"Warning: Primary date column '{DATE_COLUMN}' not found")

    # Step 2: fallback columns fill only rows still missing a date
    if DATE_FALLBACK_COLUMNS:
        for fallback_col in DATE_FALLBACK_COLUMNS:
            if fallback_col in df.columns:
                missing_dates = df['WorkingDate'].isna()
                if missing_dates.sum() > 0:
                    print(f"Using fallback column: {fallback_col}...")
                    fallback_parsed = pd.to_datetime(
                        df.loc[missing_dates, fallback_col],
                        errors='coerce',
                        format='mixed'
                    )
                    # Boolean alignment: rows absent from fallback_parsed count as False
                    newly_parsed = missing_dates & fallback_parsed.notna()
                    if newly_parsed.sum() > 0:
                        df.loc[newly_parsed, 'WorkingDate'] = fallback_parsed[newly_parsed]
                        print(f" Parsed {newly_parsed.sum():,} additional dates from {fallback_col}")

    # Step 3: last resort — build Jan-1 dates from a raw 'Year' column
    if 'Year' in df.columns and df['WorkingDate'].isna().sum() > 0:
        missing_dates = df['WorkingDate'].isna()
        year_values = pd.to_numeric(df.loc[missing_dates, 'Year'], errors='coerce')
        valid_years = missing_dates & year_values.notna()
        if valid_years.sum() > 0:
            print(f"Using Year column for remaining {valid_years.sum():,} rows...")
            df.loc[valid_years, 'WorkingDate'] = pd.to_datetime(
                df.loc[valid_years, 'Year'].astype(int).astype(str) + '-01-01',
                errors='coerce'
            )

    # BUG FIX: snapshot the raw 'Year' column BEFORE it is overwritten below.
    # The previous code tried to backfill 'Year' AFTER replacing it with
    # dt.year, so its "fallback" only ever read the NaNs it was trying to fill.
    original_year = (
        pd.to_numeric(df['Year'], errors='coerce') if 'Year' in df.columns else None
    )

    # Promote WorkingDate to the canonical date column, then drop the temp
    df[DATE_COLUMN] = df['WorkingDate']
    df = df.drop(columns=['WorkingDate'], errors='ignore')

    # Derive Year from the parsed dates (NaN where the date is missing)
    df['Year'] = df[DATE_COLUMN].dt.year

    # Backfill Year from the preserved raw column where dates were missing
    if original_year is not None:
        fillable = df['Year'].isna() & original_year.notna()
        if fillable.sum() > 0:
            df.loc[fillable, 'Year'] = original_year[fillable]

    # Monthly period column for time-series aggregation
    df['YearMonth'] = df[DATE_COLUMN].dt.to_period('M')

    # Report date coverage
    total_rows = len(df)
    date_coverage = df[DATE_COLUMN].notna().sum()
    coverage_pct = (date_coverage / total_rows * 100) if total_rows > 0 else 0
    print(f"Date coverage: {date_coverage:,} / {total_rows:,} rows ({coverage_pct:.1f}%)")

    if coverage_pct < 100:
        print(f"Warning: {total_rows - date_coverage:,} rows have missing dates")

    # Report date range when at least one date parsed
    if df[DATE_COLUMN].notna().any():
        min_date = df[DATE_COLUMN].min()
        max_date = df[DATE_COLUMN].max()
        print(f"Date range: {min_date.strftime('%Y-%m-%d')} to {max_date.strftime('%Y-%m-%d')}")

    return df
|
||||
|
||||
def validate_data_structure(df: pd.DataFrame) -> tuple[bool, str]:
    """
    Validate that loaded data has expected structure.

    Checks for required columns, data quality, and basic validity.
    Returns actionable error messages if validation fails.

    Args:
        df: DataFrame to validate (should be result of load_sales_data())

    Returns:
        tuple[bool, str]: (is_valid, error_message)
            - is_valid: True if data structure is valid, False otherwise
            - error_message: "OK" if valid, otherwise all failures joined
              with "; "

    Example:
        >>> df = load_sales_data(get_data_path())
        >>> is_valid, msg = validate_data_structure(df)
        >>> if not is_valid:
        ...     print(f"ERROR: {msg}")

    See Also:
        - load_sales_data() - Load data before validating
        - config_validator.py - Comprehensive configuration validation
    """
    # REVENUE_COLUMN / DATE_COLUMN come from the module-level config import;
    # the previous function-local re-import was redundant.
    errors = []

    # Check required columns
    if REVENUE_COLUMN not in df.columns:
        errors.append(f"Missing required column: {REVENUE_COLUMN}")

    if DATE_COLUMN not in df.columns:
        errors.append(f"Missing required column: {DATE_COLUMN}")

    # Check data quality
    if len(df) == 0:
        errors.append("DataFrame is empty")

    # FIX: the previous version tested the same condition twice
    # (isna().all() and notna().sum() == 0), emitting duplicate errors
    # for an all-NaN revenue column.
    if REVENUE_COLUMN in df.columns and df[REVENUE_COLUMN].isna().all():
        errors.append(f"All {REVENUE_COLUMN} values are NaN")

    if DATE_COLUMN in df.columns and df[DATE_COLUMN].isna().all():
        errors.append(f"All {DATE_COLUMN} values are NaN")

    if errors:
        return False, "; ".join(errors)

    return True, "OK"
|
||||
Reference in New Issue
Block a user