sales-data-analysis/examples/product_performance.py

"""
Example: Product Performance Analysis
Example showing product mix and performance analysis

This example demonstrates:
- Product-level aggregation
- Product performance metrics
- Product mix visualization
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

# Import utilities
from data_loader import load_sales_data, validate_data_structure
from validate_revenue import validate_revenue
from analysis_utils import (
    get_ltm_period_config, calculate_annual_metrics,
    apply_exclusion_filters, setup_revenue_chart, save_chart,
    format_currency, sort_mixed_years
)
from config import (
    OUTPUT_DIR, MAX_DATE, CHART_SIZES, ensure_directories,
    get_data_path, COMPANY_NAME, REVENUE_COLUMN, ITEM_COLUMN,
    DATE_COLUMN, MIN_YEAR, QUANTITY_COLUMN
)

# ============================================================================
# CONFIGURATION
# ============================================================================

ANALYSIS_NAME = "Product Performance Analysis"
DESCRIPTION = "Product mix and performance analysis"

# ============================================================================
# MAIN ANALYSIS FUNCTION
# ============================================================================

def main():
    """Main analysis function"""

    print(f"\n{'='*60}")
    print(f"{ANALYSIS_NAME}")
    print(f"{'='*60}\n")

    # 1. Load data
    print("Loading data...")
    try:
        df = load_sales_data(get_data_path())
        print(f"Loaded {len(df):,} transactions")
    except Exception as e:
        print(f"ERROR loading data: {e}")
        return

    # 2. Validate data structure
    is_valid, msg = validate_data_structure(df)
    if not is_valid:
        print(f"ERROR: {msg}")
        return

    if ITEM_COLUMN not in df.columns:
        print(f"WARNING: Item column '{ITEM_COLUMN}' not found. Using transaction-level analysis.")
        # Create a dummy item column for demonstration
        df[ITEM_COLUMN] = 'All Products'

    print("Data validation passed")

    # 3. Apply exclusion filters
    df = apply_exclusion_filters(df)

    # 4. Filter by date range
    df = df[df['Year'] >= MIN_YEAR]
    if DATE_COLUMN in df.columns:
        df = df[df[DATE_COLUMN] <= MAX_DATE]

    # 5. Setup LTM period
    ltm_start, ltm_end = get_ltm_period_config()

    # 6. Product performance summary
    print("\nCalculating product performance...")

    # Get most recent period data
    if ltm_start and ltm_end and 'YearMonth' in df.columns:
        recent_data = df[(df['YearMonth'] >= ltm_start) & (df['YearMonth'] <= ltm_end)]
        period_label = f"LTM {ltm_end}"
    else:
        recent_year = df['Year'].max()
        recent_data = df[df['Year'] == recent_year]
        period_label = str(recent_year)

    # Product-level metrics
    product_metrics = recent_data.groupby(ITEM_COLUMN).agg({
        REVENUE_COLUMN: ['sum', 'count'],
        QUANTITY_COLUMN: 'sum' if QUANTITY_COLUMN in df.columns else 'count'
    }).reset_index()

    product_metrics.columns = [ITEM_COLUMN, 'Revenue', 'Transaction_Count', 'Quantity']

    # Calculate average price per unit if quantity available
    if QUANTITY_COLUMN in df.columns:
        product_metrics['Avg_Price'] = product_metrics['Revenue'] / product_metrics['Quantity'].replace(0, np.nan)
    else:
        product_metrics['Avg_Price'] = product_metrics['Revenue'] / product_metrics['Transaction_Count']

    # Sort by revenue
    product_metrics = product_metrics.sort_values('Revenue', ascending=False)

    # Top products summary
    print(f"\nTop 10 Products by Revenue ({period_label}):")
    print("-" * 80)
    top_10 = product_metrics.head(10)
    total_revenue = product_metrics['Revenue'].sum()

    for idx, row in top_10.iterrows():
        pct = (row['Revenue'] / total_revenue) * 100
        print(f"{row[ITEM_COLUMN]:30s}: {format_currency(row['Revenue']):>12s} ({pct:5.1f}%)")

    # 7. Annual product trends (if multiple years available)
    if len(df['Year'].unique()) > 1:
        print("\nCalculating annual product trends...")

        def calculate_product_metrics(year_data):
            """Calculate product metrics for a year"""
            product_revenue = year_data.groupby(ITEM_COLUMN)[REVENUE_COLUMN].sum()
            # Get top 5 products
            top_5 = product_revenue.nlargest(5)
            return dict(top_5)

        annual_product_df = calculate_annual_metrics(df, calculate_product_metrics, ltm_start, ltm_end)

        # 8. Create visualizations
        print("\nGenerating charts...")
        ensure_directories()

        # Chart 1: Top Products Revenue (Bar Chart)
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=CHART_SIZES['wide'])

        top_10_revenue = top_10['Revenue'].values / 1e6
        top_10_names = top_10[ITEM_COLUMN].values

        ax1.barh(range(len(top_10)), top_10_revenue, color='#2E86AB')
        ax1.set_yticks(range(len(top_10)))
        ax1.set_yticklabels([name[:30] + '...' if len(name) > 30 else name for name in top_10_names])
        ax1.set_xlabel('Revenue (Millions USD)')
        ax1.set_title(f'Top 10 Products by Revenue\n({period_label})', fontsize=12, fontweight='bold')
        setup_revenue_chart(ax1)
        ax1.set_ylabel('')

        # Chart 2: Revenue Distribution (Pie Chart for top 10)
        if len(product_metrics) > 10:
            other_revenue = product_metrics.iloc[10:]['Revenue'].sum()
            pie_data = list(top_10['Revenue'].values) + [other_revenue]
            pie_labels = list(top_10[ITEM_COLUMN].values) + ['Other']
        else:
            pie_data = product_metrics['Revenue'].values
            pie_labels = product_metrics[ITEM_COLUMN].values

        pie_data_millions = [x / 1e6 for x in pie_data]
        ax2.pie(pie_data_millions, labels=pie_labels, autopct='%1.1f%%', startangle=90)
        ax2.set_title('Revenue Distribution\n(Top Products)', fontsize=12, fontweight='bold')

        plt.suptitle(f'Product Performance Analysis - {COMPANY_NAME}',
                     fontsize=14, fontweight='bold', y=1.02)
        plt.tight_layout()
        save_chart(fig, 'product_performance.png')
        plt.close()
    else:
        # Single chart if only one year
        print("\nGenerating chart...")
        ensure_directories()

        fig, ax = plt.subplots(figsize=CHART_SIZES['medium'])

        top_10_revenue = top_10['Revenue'].values / 1e6
        top_10_names = top_10[ITEM_COLUMN].values

        ax.barh(range(len(top_10)), top_10_revenue, color='#2E86AB')
        ax.set_yticks(range(len(top_10)))
        ax.set_yticklabels([name[:40] + '...' if len(name) > 40 else name for name in top_10_names])
        ax.set_xlabel('Revenue (Millions USD)')
        ax.set_title(f'Top 10 Products by Revenue - {COMPANY_NAME}\n({period_label})',
                     fontsize=14, fontweight='bold')
        setup_revenue_chart(ax)
        ax.set_ylabel('')

        plt.tight_layout()
        save_chart(fig, 'product_performance.png')
        plt.close()

    # 9. Validate revenue
    print("\nValidating revenue...")
    validate_revenue(df, ANALYSIS_NAME)

    print(f"\n{ANALYSIS_NAME} complete!")
    print(f"Charts saved to: {OUTPUT_DIR}")

# ============================================================================
# RUN ANALYSIS
# ============================================================================

if __name__ == "__main__":
    main()