# 225 lines, 8.5 KiB, Python
"""
|
|
Generic data loading utility with flexible date handling
|
|
Handles various date column formats and fallback logic
|
|
|
|
This loader is designed to work with different CSV structures by:
|
|
1. Trying primary date column first
|
|
2. Falling back to alternative date columns if needed
|
|
3. Ensuring 100% date coverage
|
|
"""
|
|
import pandas as pd
|
|
import numpy as np
|
|
from pathlib import Path
|
|
from config import (
|
|
REVENUE_COLUMN, DATE_COLUMN, DATE_FALLBACK_COLUMNS,
|
|
get_data_path
|
|
)
|
|
|
|
def load_sales_data(filepath=None):
    """
    Load sales data with flexible date handling.

    Provides intelligent data loading with fallback logic:
    1. Loads the CSV file
    2. Converts revenue column to numeric
    3. Attempts to parse dates using primary date column
    4. Falls back to alternative date columns if needed (100% coverage)
    5. Creates Year and YearMonth columns for analysis

    CRITICAL: Always use this function instead of pd.read_csv() directly.
    This ensures proper date parsing with fallback logic.

    Args:
        filepath: Path to the CSV file (defaults to config.DATA_FILE).
            Can be str, Path, or None (uses config.get_data_path()).

    Returns:
        pd.DataFrame: DataFrame with properly parsed dates and revenue.
            Includes 'Year' and 'YearMonth' columns.

    Raises:
        FileNotFoundError: If data file doesn't exist. Error message
            includes file path and suggests checking config.py.
        ValueError: If required column (REVENUE_COLUMN) is missing.
            Error message lists available columns and suggests updating
            config.py.

    Example:
        >>> from data_loader import load_sales_data
        >>> from config import get_data_path
        >>> df = load_sales_data(get_data_path())

    See Also:
        - .cursor/rules/data_loading.md for detailed patterns
        - config.py for column name configuration
    """
    # Resolve the data file path (config default when not given).
    if filepath is None:
        filepath = get_data_path()
    else:
        filepath = Path(filepath)

    if not filepath.exists():
        raise FileNotFoundError(
            f"Data file not found: {filepath}\n"
            f"Please update config.py with the correct DATA_FILE path."
        )

    print(f"Loading data from: {filepath}")
    df = pd.read_csv(filepath, low_memory=False)
    print(f"Loaded {len(df):,} rows")

    # Validate required columns before any processing.
    if REVENUE_COLUMN not in df.columns:
        raise ValueError(
            f"Required column '{REVENUE_COLUMN}' not found in data.\n"
            f"Available columns: {list(df.columns)}\n"
            f"Please update config.py REVENUE_COLUMN to match your data."
        )

    # Coerce revenue to numeric; unparseable values become NaN.
    df[REVENUE_COLUMN] = pd.to_numeric(df[REVENUE_COLUMN], errors='coerce')
    missing_revenue = df[REVENUE_COLUMN].isna().sum()
    if missing_revenue > 0:
        print(f"Warning: {missing_revenue:,} rows have missing/invalid revenue values")

    # Working date column accumulates the best available date per row.
    # Assigning pd.NaT yields a datetime64[ns] column, so .dt works later.
    df['WorkingDate'] = pd.NaT

    # Step 1: primary date column.
    if DATE_COLUMN in df.columns:
        print(f"Attempting to parse {DATE_COLUMN}...")
        primary_parsed = pd.to_datetime(df[DATE_COLUMN], errors='coerce', format='mixed')
        good = primary_parsed.notna()
        df.loc[good, 'WorkingDate'] = primary_parsed[good]
        print(f" Parsed {good.sum():,} dates from {DATE_COLUMN}")
    else:
        print(f"Warning: Primary date column '{DATE_COLUMN}' not found")

    # Step 2: fallback date columns, in configured priority order.
    # Each pass only touches rows that are still missing a date.
    if DATE_FALLBACK_COLUMNS:
        for fallback_col in DATE_FALLBACK_COLUMNS:
            if fallback_col not in df.columns:
                continue
            missing_dates = df['WorkingDate'].isna()
            if missing_dates.sum() == 0:
                continue
            print(f"Using fallback column: {fallback_col}...")
            # dropna() keeps only rows that actually parsed; assigning by
            # index avoids mismatched-length boolean-mask alignment.
            fallback_parsed = pd.to_datetime(
                df.loc[missing_dates, fallback_col],
                errors='coerce',
                format='mixed'
            ).dropna()
            if len(fallback_parsed) > 0:
                df.loc[fallback_parsed.index, 'WorkingDate'] = fallback_parsed
                print(f" Parsed {len(fallback_parsed):,} additional dates from {fallback_col}")

    # Step 3: final fallback — construct Jan-1 dates from a Year column.
    if 'Year' in df.columns and df['WorkingDate'].isna().sum() > 0:
        missing_dates = df['WorkingDate'].isna()
        year_values = pd.to_numeric(
            df.loc[missing_dates, 'Year'], errors='coerce'
        ).dropna()
        if len(year_values) > 0:
            print(f"Using Year column for remaining {len(year_values):,} rows...")
            df.loc[year_values.index, 'WorkingDate'] = pd.to_datetime(
                year_values.astype(int).astype(str) + '-01-01',
                errors='coerce'
            )

    # BUG FIX: snapshot the original Year column BEFORE it is overwritten
    # below. The previous code read df['Year'] *after* replacing it with
    # the date-derived year, which made the "fill missing Year" fallback a
    # guaranteed no-op (it filled NaNs from the same NaN positions).
    original_year = (
        pd.to_numeric(df['Year'], errors='coerce') if 'Year' in df.columns else None
    )

    # Promote WorkingDate to the canonical date column and clean up.
    df[DATE_COLUMN] = df['WorkingDate']
    df = df.drop(columns=['WorkingDate'], errors='ignore')

    # Derive Year from parsed dates; fill gaps from the original Year column.
    df['Year'] = df[DATE_COLUMN].dt.year
    if original_year is not None:
        fill_mask = df['Year'].isna() & original_year.notna()
        if fill_mask.any():
            df.loc[fill_mask, 'Year'] = original_year[fill_mask]

    # Monthly period column for monthly analysis.
    df['YearMonth'] = df[DATE_COLUMN].dt.to_period('M')

    # Report date coverage.
    total_rows = len(df)
    date_coverage = df[DATE_COLUMN].notna().sum()
    coverage_pct = (date_coverage / total_rows * 100) if total_rows > 0 else 0
    print(f"Date coverage: {date_coverage:,} / {total_rows:,} rows ({coverage_pct:.1f}%)")
    if coverage_pct < 100:
        print(f"Warning: {total_rows - date_coverage:,} rows have missing dates")

    # Report date range (only if at least one date parsed).
    if df[DATE_COLUMN].notna().any():
        min_date = df[DATE_COLUMN].min()
        max_date = df[DATE_COLUMN].max()
        print(f"Date range: {min_date.strftime('%Y-%m-%d')} to {max_date.strftime('%Y-%m-%d')}")

    return df
|
|
|
|
def validate_data_structure(df: pd.DataFrame) -> tuple[bool, str]:
    """
    Validate that loaded data has expected structure.

    Checks for required columns, data quality, and basic validity.
    Returns actionable error messages if validation fails.

    Args:
        df: DataFrame to validate (should be result of load_sales_data())

    Returns:
        tuple[bool, str]: (is_valid, error_message)
            - is_valid: True if data structure is valid, False otherwise
            - error_message: "OK" if valid, otherwise descriptive error
              message with one entry per problem, joined by "; "

    Example:
        >>> df = load_sales_data(get_data_path())
        >>> is_valid, msg = validate_data_structure(df)
        >>> if not is_valid:
        ...     print(f"ERROR: {msg}")

    See Also:
        - load_sales_data() - Load data before validating
        - config_validator.py - Comprehensive configuration validation
    """
    from config import REVENUE_COLUMN, DATE_COLUMN

    errors = []

    # Required columns must be present.
    if REVENUE_COLUMN not in df.columns:
        errors.append(f"Missing required column: {REVENUE_COLUMN}")
    if DATE_COLUMN not in df.columns:
        errors.append(f"Missing required column: {DATE_COLUMN}")

    # Basic data-quality checks.
    if len(df) == 0:
        errors.append("DataFrame is empty")

    # FIX: the original ran two equivalent checks here (isna().all() and
    # notna().sum() == 0), always producing duplicate error messages for a
    # single condition. One check is sufficient.
    if REVENUE_COLUMN in df.columns and df[REVENUE_COLUMN].isna().all():
        errors.append(f"All {REVENUE_COLUMN} values are NaN")

    if DATE_COLUMN in df.columns and df[DATE_COLUMN].isna().all():
        errors.append(f"All {DATE_COLUMN} values are NaN")

    if errors:
        return False, "; ".join(errors)
    return True, "OK"
|