# sales-data-analysis/config.py

"""
Configuration file for sales analysis scripts
CONFIGURE THIS FILE FOR YOUR COMPANY'S SPECIFIC DATA STRUCTURE

This file should be customized based on:
- Your data file name and location
- Column names in your sales data
- Date range and LTM period
- Company-specific settings

CRITICAL: All column names, file paths, and settings are defined here.
Never hardcode these values in analysis scripts - always import from config.

Usage:
    from config import REVENUE_COLUMN, DATE_COLUMN, get_data_path
    revenue = df[REVENUE_COLUMN].sum()  # ✅ Correct
    revenue = df['USD'].sum()           # ❌ Wrong - hardcoded

Quick Setup:
    1. Run: python setup_wizard.py (interactive configuration)
    2. Or manually edit this file following the TODO comments
    3. Validate: python config_validator.py

See Also:
    - .cursor/rules/analysis_patterns.md - How to use config values
    - setup_wizard.py - Interactive configuration tool
    - config_validator.py - Configuration validation
"""
from pathlib import Path
from typing import Optional, Tuple
import pandas as pd

# ============================================================================
# COMPANY INFORMATION
# ============================================================================
# TODO: Update these values for your company
COMPANY_NAME = "Your Company Name"  # Placeholder - replace with your company's name
ANALYSIS_DATE = "2026-01-12"  # Date of the analysis, kept as a 'YYYY-MM-DD' string; update to the current date

# ============================================================================
# DATA FILES
# ============================================================================
# TODO: Update with your actual data file name
DATA_FILE = 'sales_data.csv'  # Update this to your CSV file name (default used by get_data_path())
OUTPUT_DIR = Path('charts')  # Chart output folder (relative path); used by get_chart_path()
REPORTS_DIR = Path('reports')  # Report output folder (relative path); used by get_report_path()
DATA_DIR = Path('data')  # Optional: if data is in a subdirectory; get_data_path() only uses it when it exists

# ============================================================================
# DATA COLUMN MAPPINGS
# ============================================================================
# TODO: Map these to your actual column names
# These are the expected column names - update if your CSV uses different names

# Revenue column (REQUIRED)
REVENUE_COLUMN = 'USD'  # Common alternatives: 'Amount', 'Revenue', 'Total', 'Sales'

# Date columns (at least one required)
DATE_COLUMN = 'InvoiceDate'  # Primary date column
DATE_FALLBACK_COLUMNS = ['Month', 'Year']  # Fallback columns if primary is missing

# Customer/Account columns
CUSTOMER_COLUMN = 'Customer'  # Common alternatives: 'Account', 'CustomerName', 'Client'

# Product/Item columns
ITEM_COLUMN = 'Item'  # Common alternatives: 'Product', 'SKU', 'ItemCode'
PRODUCT_GROUP_COLUMN = 'ProductGroup'  # Optional: for product categorization
QUANTITY_COLUMN = 'Quantity'  # Optional: for price calculations (see MIN_QUANTITY / MAX_QUANTITY)

# Geographic columns (optional)
REGION_COLUMN = 'Region'  # Optional: for geographic analysis
COUNTRY_COLUMN = 'Country'  # Optional: for country-level analysis

# Segment/Category columns (optional - customize based on your data)
# NOTE(review): keys and values are identical in this template; presumably the
# key is the segment label and the value is the CSV column name - confirm
# against the analysis scripts before renaming either side.
SEGMENT_COLUMNS = {
    'Technology': 'Technology',  # Optional: technology/product type
    'EndMarket': 'EndMarket',    # Optional: end market/industry
    'ProductGroup': 'ProductGroup',  # Optional: product category (same literal as PRODUCT_GROUP_COLUMN - keep in sync)
}

# Invoice/Transaction columns
INVOICE_NUMBER_COLUMN = 'Invoice #'  # Optional: for transaction-level analysis (note the space and '#' in the name)

# ============================================================================
# DATE RANGE CONFIGURATION
# ============================================================================
# TODO: Update these based on your data and analysis needs

# Analysis years (years to include in analysis)
ANALYSIS_YEARS = [2021, 2022, 2023, 2024, 2025]  # Calendar years as ints; update based on your data

# LTM (Last Twelve Months) Configuration
# For the most recent partial year, use LTM for apples-to-apples comparison
# Example: If latest data is through September 2025, use Oct 2024 - Sep 2025
LTM_ENABLED = True  # Master switch; set to False if you have complete calendar years only
LTM_START_MONTH = 10  # First month of the LTM window, as a month number (1-12)
LTM_START_YEAR = 2024  # Calendar year of the first LTM month
LTM_END_MONTH = 9  # Last month of the LTM window, as a month number (1-12)
LTM_END_YEAR = 2025  # Calendar year of the last LTM month

# Derived LTM objects: monthly pandas Periods for the window boundaries plus a
# display label. All three are None when LTM is disabled.
LTM_START = (
    pd.Period(f'{LTM_START_YEAR}-{LTM_START_MONTH:02d}', freq='M')
    if LTM_ENABLED
    else None
)
LTM_END = (
    pd.Period(f'{LTM_END_YEAR}-{LTM_END_MONTH:02d}', freq='M')
    if LTM_ENABLED
    else None
)
LTM_LABEL = (
    f'{LTM_END_YEAR} (LTM {LTM_END_MONTH}/{LTM_END_YEAR})'
    if LTM_ENABLED
    else None
)

# Data date range (filter data to this range)
MIN_YEAR = 2021  # Minimum year to include (default equals the first entry of ANALYSIS_YEARS)
MAX_DATE = pd.Timestamp('2025-09-30')  # Maximum date to include (update based on your data); default is the last day of the default LTM end month

# ============================================================================
# CHART SETTINGS
# ============================================================================
CHART_DPI = 300  # Resolution for saved charts, in dots per inch
CHART_FORMAT = 'png'  # File format for saved charts
CHART_BBOX = 'tight'  # NOTE(review): presumably passed as bbox_inches when saving figures - confirm in chart scripts
CHART_STYLE = 'seaborn-v0_8'  # Options: 'default', 'ggplot', 'seaborn-v0_8', etc.

# Chart size presets
# Each value is a (width, height) pair; presumably inches for a matplotlib
# figsize - confirm against the plotting code.
CHART_SIZES = {
    'small': (6, 4),
    'medium': (10, 6),
    'large': (12, 8),
    'wide': (14, 6)
}

# ============================================================================
# DATA FILTERING
# ============================================================================
# Quantity filtering for price calculations (exclude outliers)
# NOTE(review): whether these bounds are inclusive is decided by the consuming
# script, not by this file - confirm there.
MIN_QUANTITY = 0  # Minimum valid quantity
MAX_QUANTITY = 1000  # Maximum valid quantity (adjust based on your data)

# Revenue filtering (optional - exclude negative values, returns, etc.)
EXCLUDE_NEGATIVE_REVENUE = False  # Set to True to exclude negative revenue (returns/credits)
MIN_REVENUE = None  # Optional: minimum revenue threshold (None presumably means no threshold - confirm in the loader)

# ============================================================================
# EXCLUSION FILTERS (Optional)
# ============================================================================
# Use this section to exclude specific segments, customers, or products
# Example: Exclude a business unit, test accounts, etc.

# All three keys are read together: nothing should be excluded unless
# 'enabled' is True and a column plus at least one value are supplied.
# NOTE(review): that gating is the intent implied by the defaults - confirm in
# the script that applies the filter.
EXCLUSION_FILTERS = {
    'enabled': False,  # Set to True to enable exclusions
    'exclude_by_column': None,  # Column name to filter on (e.g., 'Country', 'Segment')
    'exclude_values': [],  # List of values to exclude (e.g., ['KVT', 'Test'])
}

# ============================================================================
# VALIDATION THRESHOLDS (Optional)
# ============================================================================
# Expected revenue ranges for validation (update based on your company)
# These are used to validate that data loading is working correctly
VALIDATION_ENABLED = False  # Set to True to enable validation
EXPECTED_REVENUE = {}  # Maps year -> expected total revenue. Example: {2021: 99_880_000, 2024: 89_990_000}
REVENUE_TOLERANCE_PCT = 0.01  # 1% tolerance for validation; stored as a fraction (0.01), not as 1, despite the _PCT suffix

# ============================================================================
# HELPER FUNCTIONS
# ============================================================================
def ensure_directories() -> None:
    """
    Create output directories if they don't exist

    Creates OUTPUT_DIR (charts/) and REPORTS_DIR (reports/) for saving
    analysis outputs, including any missing parent directories, so nested
    locations such as 'output/charts' also work. Safe to call repeatedly.
    Called automatically by get_chart_path() and get_report_path().

    DATA_DIR is deliberately never touched here: get_data_path() uses the
    existence of DATA_DIR to decide where the data file lives, so creating
    an empty data directory would redirect data loading.

    Returns:
        None: Creates directories in place

    Raises:
        FileExistsError: If OUTPUT_DIR or REPORTS_DIR exists but is not a
            directory
    """
    for directory in (OUTPUT_DIR, REPORTS_DIR):
        # parents=True supports nested output paths; exist_ok=True makes
        # repeated calls a no-op.
        directory.mkdir(parents=True, exist_ok=True)

def get_chart_path(filename: str) -> Path:
    """
    Build the save location for a chart

    Runs ensure_directories() first so the output folders are present,
    then joins the given name onto OUTPUT_DIR.

    Args:
        filename: Name of the chart file (e.g., 'revenue_trend.png')

    Returns:
        Path: OUTPUT_DIR joined with filename
    """
    ensure_directories()
    chart_path = OUTPUT_DIR.joinpath(filename)
    return chart_path

def get_report_path(filename: str) -> Path:
    """
    Build the save location for a report

    Runs ensure_directories() first so the output folders are present,
    then joins the given name onto REPORTS_DIR.

    Args:
        filename: Name of the report file (e.g., 'analysis_report.pdf')

    Returns:
        Path: REPORTS_DIR joined with filename
    """
    ensure_directories()
    report_path = REPORTS_DIR.joinpath(filename)
    return report_path

def get_data_path(filename: Optional[str] = None) -> Path:
    """
    Get full path for data file

    Resolution order:
    1. DATA_DIR / filename, if that file exists
    2. filename relative to the current directory, if that file exists
    3. If the file is found in neither place: DATA_DIR / filename when
       DATA_DIR is a directory, otherwise the bare filename (so a later
       "file not found" error points at the expected location)

    Defaults to DATA_FILE from config if filename is not provided.

    Args:
        filename: Optional filename override (defaults to config.DATA_FILE)

    Returns:
        Path: Full path to data file (not guaranteed to exist)

    Example:
        >>> from config import get_data_path
        >>> data_path = get_data_path()
        >>> print(f"Loading from: {data_path}")
    """
    name = DATA_FILE if filename is None else filename

    in_data_dir = DATA_DIR / name
    if in_data_dir.exists():
        return in_data_dir

    in_cwd = Path(name)
    if in_cwd.exists():
        # DATA_DIR may exist (e.g. empty) while the file was left in the
        # working directory; prefer the file that is actually there.
        return in_cwd

    # File not found anywhere: report the location where it is expected.
    return in_data_dir if DATA_DIR.is_dir() else in_cwd

def get_ltm_period() -> Tuple[Optional[pd.Period], Optional[pd.Period]]:
    """
    Return the configured LTM (Last Twelve Months) window boundaries

    Hands back the module-level LTM_START / LTM_END periods when LTM is
    switched on and both boundaries are set; in every other case the
    result is (None, None).

    Returns:
        Tuple[Optional[pd.Period], Optional[pd.Period]]:
            (ltm_start, ltm_end) or (None, None) if disabled

    Example:
        >>> ltm_start, ltm_end = get_ltm_period()
        >>> if ltm_start and ltm_end:
        ...     print(f"LTM: {ltm_start} to {ltm_end}")

    See Also:
        - get_ltm_label() - Get formatted LTM label string
        - .cursor/rules/ltm_methodology.md - LTM explanation
    """
    # Guard clause: bail out unless LTM is on and fully configured.
    if not (LTM_ENABLED and LTM_START and LTM_END):
        return None, None
    return LTM_START, LTM_END

def get_ltm_label() -> Optional[str]:
    """
    Return the display label for the LTM period

    Gives back the module-level LTM_LABEL (for example
    "2025 (LTM 9/2025)") while LTM is enabled and None otherwise.
    Intended for chart titles and axis labels.

    Returns:
        Optional[str]: LTM label string or None if LTM disabled

    Example:
        >>> from config import get_ltm_label
        >>> ltm_label = get_ltm_label()
        >>> if ltm_label:
        ...     title = f'Revenue Trend ({ltm_label})'

    See Also:
        - get_ltm_period() - Get LTM period objects
        - .cursor/rules/ltm_methodology.md - LTM usage guide
    """
    if LTM_ENABLED:
        return LTM_LABEL
    return None