Initial commit: sales analysis template
Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
307
.cursor/rules/advanced_analysis_patterns.md
Normal file
307
.cursor/rules/advanced_analysis_patterns.md
Normal file
@@ -0,0 +1,307 @@
|
|||||||
|
# Advanced Analysis Patterns
|
||||||
|
|
||||||
|
This document provides patterns for sophisticated, production-grade analyses that leverage the full capabilities of the template framework.
|
||||||
|
|
||||||
|
## ⭐ Using Cursor AI Effectively
|
||||||
|
|
||||||
|
When working in Cursor, you can ask the AI to:
|
||||||
|
- "Create a cohort analysis script using the template patterns"
|
||||||
|
- "Add statistical significance testing to this analysis"
|
||||||
|
- "Generate a multi-dimensional analysis with product, customer, and geography"
|
||||||
|
- "Create a forecasting analysis with confidence intervals"
|
||||||
|
|
||||||
|
The AI will automatically use these patterns and utilities.
|
||||||
|
|
||||||
|
## Advanced Analysis Types
|
||||||
|
|
||||||
|
### 1. Multi-Dimensional Analysis
|
||||||
|
|
||||||
|
**Pattern:** Analyze across multiple dimensions simultaneously (e.g., Product × Customer × Geography)
|
||||||
|
|
||||||
|
```python
|
||||||
|
from data_loader import load_sales_data
|
||||||
|
from analysis_utils import calculate_annual_metrics, get_ltm_period_config
|
||||||
|
from config import get_data_path, REVENUE_COLUMN, ITEM_COLUMN, CUSTOMER_COLUMN, REGION_COLUMN
|
||||||
|
|
||||||
|
df = load_sales_data(get_data_path())
|
||||||
|
|
||||||
|
# Multi-dimensional pivot
|
||||||
|
pivot = df.pivot_table(
|
||||||
|
index=[ITEM_COLUMN, CUSTOMER_COLUMN],
|
||||||
|
columns=REGION_COLUMN,
|
||||||
|
values=REVENUE_COLUMN,
|
||||||
|
aggfunc='sum',
|
||||||
|
fill_value=0
|
||||||
|
)
|
||||||
|
|
||||||
|
# Or use data_processing helper
|
||||||
|
from data_processing import create_pivot_table
|
||||||
|
pivot = create_pivot_table(
|
||||||
|
df,
|
||||||
|
index=[ITEM_COLUMN, CUSTOMER_COLUMN],
|
||||||
|
columns=REGION_COLUMN,
|
||||||
|
values=REVENUE_COLUMN
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. Cohort Analysis with Retention Metrics
|
||||||
|
|
||||||
|
**Pattern:** Track customer cohorts over time with retention and revenue metrics
|
||||||
|
|
||||||
|
```python
|
||||||
|
from examples.cohort_analysis import create_cohorts, calculate_cohort_metrics
|
||||||
|
|
||||||
|
df_cohort = create_cohorts(df)
|
||||||
|
cohort_metrics = calculate_cohort_metrics(df_cohort)
|
||||||
|
|
||||||
|
# Calculate Net Revenue Retention (NRR)
|
||||||
|
nrr = cohort_metrics.groupby('Cohort').agg({
|
||||||
|
'Revenue_Retention': lambda x: x.iloc[-1] if len(x) > 0 else 0
|
||||||
|
})
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3. Statistical Significance Testing
|
||||||
|
|
||||||
|
**Pattern:** Compare segments with statistical tests
|
||||||
|
|
||||||
|
```python
|
||||||
|
from statistical_utils import test_statistical_significance
|
||||||
|
|
||||||
|
# Compare two groups
|
||||||
|
group1 = df[df['Segment'] == 'A'][REVENUE_COLUMN]
|
||||||
|
group2 = df[df['Segment'] == 'B'][REVENUE_COLUMN]
|
||||||
|
|
||||||
|
result = test_statistical_significance(group1, group2)
|
||||||
|
if result['significant']:
|
||||||
|
print(f"Significant difference (p={result['p_value']:.4f})")
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4. Price-Volume-Mix (PVM) Decomposition
|
||||||
|
|
||||||
|
**Pattern:** Decompose revenue changes into price, volume, and mix effects
|
||||||
|
|
||||||
|
```python
|
||||||
|
from config import QUANTITY_COLUMN, REVENUE_COLUMN
|
||||||
|
|
||||||
|
def pvm_decomposition(df_base, df_current):
|
||||||
|
"""Decompose revenue change into price, volume, mix effects"""
|
||||||
|
base_price = df_base[REVENUE_COLUMN].sum() / df_base[QUANTITY_COLUMN].sum()
|
||||||
|
current_price = df_current[REVENUE_COLUMN].sum() / df_current[QUANTITY_COLUMN].sum()
|
||||||
|
|
||||||
|
base_volume = df_base[QUANTITY_COLUMN].sum()
|
||||||
|
current_volume = df_current[QUANTITY_COLUMN].sum()
|
||||||
|
|
||||||
|
# Price effect
|
||||||
|
price_effect = (current_price - base_price) * base_volume
|
||||||
|
|
||||||
|
# Volume effect
|
||||||
|
volume_effect = (current_volume - base_volume) * base_price
|
||||||
|
|
||||||
|
# Mix effect (residual)
|
||||||
|
total_change = df_current[REVENUE_COLUMN].sum() - df_base[REVENUE_COLUMN].sum()
|
||||||
|
mix_effect = total_change - price_effect - volume_effect
|
||||||
|
|
||||||
|
return {
|
||||||
|
'price_effect': price_effect,
|
||||||
|
'volume_effect': volume_effect,
|
||||||
|
'mix_effect': mix_effect,
|
||||||
|
'total_change': total_change
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 5. Time Series Forecasting
|
||||||
|
|
||||||
|
**Pattern:** Forecast future revenue with confidence intervals
|
||||||
|
|
||||||
|
```python
|
||||||
|
from data_processing import prepare_time_series
|
||||||
|
from statistical_utils import calculate_confidence_interval
|
||||||
|
|
||||||
|
# Prepare time series
|
||||||
|
ts = prepare_time_series(df, freq='M')
|
||||||
|
|
||||||
|
# Simple forecast (extend trend)
|
||||||
|
import numpy as np
from scipy import stats
|
||||||
|
x = np.arange(len(ts))
|
||||||
|
slope, intercept, r_value, p_value, std_err = stats.linregress(x, ts.values)
|
||||||
|
|
||||||
|
# Forecast next 12 months
|
||||||
|
future_x = np.arange(len(ts), len(ts) + 12)
|
||||||
|
forecast = slope * future_x + intercept
|
||||||
|
|
||||||
|
# Calculate confidence intervals
|
||||||
|
ci = calculate_confidence_interval(ts, confidence=0.95)
|
||||||
|
```
|
||||||
|
|
||||||
|
### 6. Customer Lifetime Value (CLV) Analysis
|
||||||
|
|
||||||
|
**Pattern:** Calculate CLV using historical data
|
||||||
|
|
||||||
|
```python
|
||||||
|
from config import CUSTOMER_COLUMN, REVENUE_COLUMN, DATE_COLUMN
|
||||||
|
|
||||||
|
def calculate_clv(df, years=3):
|
||||||
|
"""Calculate customer lifetime value"""
|
||||||
|
customer_metrics = df.groupby(CUSTOMER_COLUMN).agg({
|
||||||
|
REVENUE_COLUMN: 'sum',
|
||||||
|
DATE_COLUMN: ['min', 'max', 'count']
|
||||||
|
}).reset_index()
|
||||||
|
|
||||||
|
customer_metrics.columns = [CUSTOMER_COLUMN, 'Total_Revenue', 'First_Purchase', 'Last_Purchase', 'Order_Count']
|
||||||
|
|
||||||
|
# Calculate customer age (years)
|
||||||
|
customer_metrics['Customer_Age_Years'] = (
|
||||||
|
(customer_metrics['Last_Purchase'] - customer_metrics['First_Purchase']).dt.days / 365.25
|
||||||
|
)
|
||||||
|
|
||||||
|
# Annual revenue
|
||||||
|
customer_metrics['Annual_Revenue'] = customer_metrics['Total_Revenue'] / customer_metrics['Customer_Age_Years'].replace(0, 1)
|
||||||
|
|
||||||
|
# Projected CLV
|
||||||
|
customer_metrics['CLV'] = customer_metrics['Annual_Revenue'] * years
|
||||||
|
|
||||||
|
return customer_metrics
|
||||||
|
```
|
||||||
|
|
||||||
|
### 7. Market Basket Analysis
|
||||||
|
|
||||||
|
**Pattern:** Find product associations and cross-sell opportunities
|
||||||
|
|
||||||
|
```python
|
||||||
|
from mlxtend.frequent_patterns import apriori, association_rules
|
||||||
|
from mlxtend.preprocessing import TransactionEncoder
import pandas as pd
from config import INVOICE_NUMBER_COLUMN, ITEM_COLUMN
|
||||||
|
|
||||||
|
# Prepare transaction data
|
||||||
|
transactions = df.groupby(INVOICE_NUMBER_COLUMN)[ITEM_COLUMN].apply(list).tolist()
|
||||||
|
|
||||||
|
# Encode transactions
|
||||||
|
te = TransactionEncoder()
|
||||||
|
te_ary = te.fit(transactions).transform(transactions)
|
||||||
|
df_encoded = pd.DataFrame(te_ary, columns=te.columns_)
|
||||||
|
|
||||||
|
# Find frequent itemsets
|
||||||
|
frequent_itemsets = apriori(df_encoded, min_support=0.01, use_colnames=True)
|
||||||
|
|
||||||
|
# Generate association rules
|
||||||
|
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5)
|
||||||
|
```
|
||||||
|
|
||||||
|
### 8. Segmentation with Machine Learning
|
||||||
|
|
||||||
|
**Pattern:** Advanced customer segmentation using clustering
|
||||||
|
|
||||||
|
```python
|
||||||
|
from sklearn.cluster import KMeans
|
||||||
|
from sklearn.preprocessing import StandardScaler
from config import CUSTOMER_COLUMN, REVENUE_COLUMN, DATE_COLUMN
|
||||||
|
|
||||||
|
# Prepare features
|
||||||
|
features = df.groupby(CUSTOMER_COLUMN).agg({
|
||||||
|
REVENUE_COLUMN: ['sum', 'mean', 'count'],
|
||||||
|
DATE_COLUMN: lambda x: (x.max() - x.min()).days
|
||||||
|
}).reset_index()
|
||||||
|
features.columns = [CUSTOMER_COLUMN, 'Total_Revenue', 'Avg_Order', 'Order_Count', 'Customer_Tenure']
|
||||||
|
|
||||||
|
# Scale features
|
||||||
|
scaler = StandardScaler()
|
||||||
|
features_scaled = scaler.fit_transform(features[['Total_Revenue', 'Avg_Order', 'Order_Count', 'Customer_Tenure']])
|
||||||
|
|
||||||
|
# Cluster
|
||||||
|
kmeans = KMeans(n_clusters=5, random_state=42)
|
||||||
|
features['Segment'] = kmeans.fit_predict(features_scaled)
|
||||||
|
```
|
||||||
|
|
||||||
|
### 9. Anomaly Detection
|
||||||
|
|
||||||
|
**Pattern:** Identify unusual patterns in data
|
||||||
|
|
||||||
|
```python
|
||||||
|
from statistical_utils import calculate_z_score
|
||||||
|
|
||||||
|
# Calculate z-scores for revenue
|
||||||
|
mean_revenue = df[REVENUE_COLUMN].mean()
|
||||||
|
std_revenue = df[REVENUE_COLUMN].std()
|
||||||
|
|
||||||
|
df['Revenue_Z_Score'] = df[REVENUE_COLUMN].apply(
|
||||||
|
lambda x: calculate_z_score(x, mean_revenue, std_revenue)
|
||||||
|
)
|
||||||
|
|
||||||
|
# Flag anomalies (|z| > 3)
|
||||||
|
df['Is_Anomaly'] = df['Revenue_Z_Score'].abs() > 3
|
||||||
|
```
|
||||||
|
|
||||||
|
### 10. Competitive Analysis Framework
|
||||||
|
|
||||||
|
**Pattern:** Compare performance across dimensions
|
||||||
|
|
||||||
|
```python
|
||||||
|
from statistical_utils import calculate_yoy_growth, calculate_cagr
|
||||||
|
|
||||||
|
def competitive_analysis(df, dimension_col):
|
||||||
|
"""Compare performance across dimension (e.g., products, regions)"""
|
||||||
|
analysis = df.groupby(dimension_col).agg({
|
||||||
|
REVENUE_COLUMN: ['sum', 'mean', 'count']
|
||||||
|
}).reset_index()
|
||||||
|
analysis.columns = [dimension_col, 'Total_Revenue', 'Avg_Order', 'Order_Count']
|
||||||
|
|
||||||
|
# Calculate growth rates
|
||||||
|
for year in sorted(df['Year'].unique())[1:]:
|
||||||
|
prev_year = year - 1
|
||||||
|
current = df[df['Year'] == year].groupby(dimension_col)[REVENUE_COLUMN].sum()
|
||||||
|
previous = df[df['Year'] == prev_year].groupby(dimension_col)[REVENUE_COLUMN].sum()
|
||||||
|
|
||||||
|
growth = calculate_yoy_growth(current, previous)
|
||||||
|
analysis[f'Growth_{year}'] = growth
|
||||||
|
|
||||||
|
return analysis
|
||||||
|
```
|
||||||
|
|
||||||
|
## Best Practices for Advanced Analyses
|
||||||
|
|
||||||
|
1. **Always validate data quality first:**
|
||||||
|
```python
|
||||||
|
from data_quality import generate_data_quality_report
|
||||||
|
report = generate_data_quality_report(df)
|
||||||
|
```
|
||||||
|
|
||||||
|
2. **Use logging for complex analyses:**
|
||||||
|
```python
|
||||||
|
from logger_config import get_logger
|
||||||
|
logger = get_logger('advanced_analysis')
|
||||||
|
logger.info("Starting complex analysis...")
|
||||||
|
```
|
||||||
|
|
||||||
|
3. **Export intermediate results:**
|
||||||
|
```python
|
||||||
|
from export_utils import export_to_excel
|
||||||
|
export_to_excel(intermediate_df, 'intermediate_results.xlsx')
|
||||||
|
```
|
||||||
|
|
||||||
|
4. **Generate comprehensive reports:**
|
||||||
|
```python
|
||||||
|
from report_generator import generate_pdf_report
|
||||||
|
generate_pdf_report(charts=['chart1.png', 'chart2.png'], summary_data=summary)
|
||||||
|
```
|
||||||
|
|
||||||
|
5. **Test statistical significance:**
|
||||||
|
```python
|
||||||
|
from statistical_utils import test_statistical_significance
|
||||||
|
# Always test before making conclusions
|
||||||
|
```
|
||||||
|
|
||||||
|
## Cursor AI Prompts for Advanced Analyses
|
||||||
|
|
||||||
|
When using Cursor, try these prompts:
|
||||||
|
|
||||||
|
- **"Create a cohort retention analysis with heatmaps"**
|
||||||
|
- **"Build a price-volume-mix decomposition analysis"**
|
||||||
|
- **"Generate a customer lifetime value analysis with segmentation"**
|
||||||
|
- **"Create a forecasting model with confidence intervals"**
|
||||||
|
- **"Build a multi-dimensional analysis across product, customer, and geography"**
|
||||||
|
- **"Create an anomaly detection analysis for unusual transactions"**
|
||||||
|
|
||||||
|
The AI will automatically use these patterns and the template utilities.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
**Last Updated:** January 2026
|
||||||
|
**For:** Advanced users and AI-assisted development
|
||||||
316
.cursor/rules/ai_assistant_guide.md
Normal file
316
.cursor/rules/ai_assistant_guide.md
Normal file
@@ -0,0 +1,316 @@
|
|||||||
|
# AI Assistant Guide for Sales Analysis Template
|
||||||
|
|
||||||
|
This guide helps you effectively use Cursor's AI assistant to create sophisticated sales analyses.
|
||||||
|
|
||||||
|
## 🎯 Quick Start with AI
|
||||||
|
|
||||||
|
### Basic Prompt Structure
|
||||||
|
|
||||||
|
When asking the AI to create an analysis, use this structure:
|
||||||
|
|
||||||
|
```
|
||||||
|
Create a [ANALYSIS_TYPE] analysis that:
|
||||||
|
1. [Specific requirement 1]
|
||||||
|
2. [Specific requirement 2]
|
||||||
|
3. Uses the sales_analysis_template patterns
|
||||||
|
4. Includes [specific visualizations/metrics]
|
||||||
|
```
|
||||||
|
|
||||||
|
### Example Prompts
|
||||||
|
|
||||||
|
**Simple Analysis:**
|
||||||
|
```
|
||||||
|
Create an annual revenue trend analysis using the template patterns,
|
||||||
|
with LTM support and proper chart formatting.
|
||||||
|
```
|
||||||
|
|
||||||
|
**Advanced Analysis:**
|
||||||
|
```
|
||||||
|
Create a customer cohort retention analysis that:
|
||||||
|
1. Groups customers by first purchase month
|
||||||
|
2. Calculates retention rates for 12 periods
|
||||||
|
3. Shows revenue retention metrics
|
||||||
|
4. Creates heatmap visualizations
|
||||||
|
5. Uses the template's cohort analysis patterns
|
||||||
|
```
|
||||||
|
|
||||||
|
**Multi-Dimensional Analysis:**
|
||||||
|
```
|
||||||
|
Create a product performance analysis across regions that:
|
||||||
|
1. Analyzes top products by revenue
|
||||||
|
2. Shows regional distribution
|
||||||
|
3. Calculates growth rates by region
|
||||||
|
4. Creates multi-panel visualizations
|
||||||
|
5. Exports results to Excel
|
||||||
|
```
|
||||||
|
|
||||||
|
## 📋 Template-Aware Prompts
|
||||||
|
|
||||||
|
The AI automatically knows about:
|
||||||
|
- `data_loader.py` - Always use this for loading data
|
||||||
|
- `analysis_utils.py` - Use utilities for formatting, LTM, etc.
|
||||||
|
- `config.py` - Use config values, never hardcode
|
||||||
|
- Template patterns - Follows best practices automatically
|
||||||
|
|
||||||
|
### What the AI Knows
|
||||||
|
|
||||||
|
When you mention the template, the AI will:
|
||||||
|
- ✅ Use `load_sales_data()` instead of `pd.read_csv()`
|
||||||
|
- ✅ Use `setup_revenue_chart()` for charts
|
||||||
|
- ✅ Divide revenue by 1e6 before plotting
|
||||||
|
- ✅ Use config values from `config.py`
|
||||||
|
- ✅ Apply exclusion filters if configured
|
||||||
|
- ✅ Validate data after loading
|
||||||
|
- ✅ Use LTM patterns correctly
|
||||||
|
|
||||||
|
## 🔧 Common AI Tasks
|
||||||
|
|
||||||
|
### 1. Create New Analysis Script
|
||||||
|
|
||||||
|
**Prompt:**
|
||||||
|
```
|
||||||
|
Create a new analysis script called [name].py that:
|
||||||
|
- Follows the template structure
|
||||||
|
- Analyzes [specific metric/dimension]
|
||||||
|
- Creates [type of visualization]
|
||||||
|
- Uses template utilities
|
||||||
|
```
|
||||||
|
|
||||||
|
**AI will:**
|
||||||
|
- Copy structure from `analysis_template.py`
|
||||||
|
- Use proper imports
|
||||||
|
- Follow template patterns
|
||||||
|
- Include validation
|
||||||
|
|
||||||
|
### 2. Add Advanced Features
|
||||||
|
|
||||||
|
**Prompt:**
|
||||||
|
```
|
||||||
|
Add statistical significance testing to [analysis].py:
|
||||||
|
- Compare [group1] vs [group2]
|
||||||
|
- Show p-values and confidence intervals
|
||||||
|
- Use statistical_utils functions
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3. Fix Common Issues
|
||||||
|
|
||||||
|
**Prompt:**
|
||||||
|
```
|
||||||
|
Fix the chart formatting in [analysis].py - it's showing scientific notation.
|
||||||
|
```
|
||||||
|
|
||||||
|
**AI will:**
|
||||||
|
- Add `data / 1e6` conversion
|
||||||
|
- Use `setup_revenue_chart()`
|
||||||
|
- Fix formatting issues
|
||||||
|
|
||||||
|
### 4. Enhance Existing Analysis
|
||||||
|
|
||||||
|
**Prompt:**
|
||||||
|
```
|
||||||
|
Enhance [analysis].py to:
|
||||||
|
- Add export to Excel functionality
|
||||||
|
- Include data quality checks
|
||||||
|
- Add logging
|
||||||
|
- Generate PDF report
|
||||||
|
```
|
||||||
|
|
||||||
|
## 🚀 Advanced AI Prompts
|
||||||
|
|
||||||
|
### Multi-Step Analysis
|
||||||
|
|
||||||
|
```
|
||||||
|
Create a comprehensive customer analysis that:
|
||||||
|
1. Segments customers using RFM
|
||||||
|
2. Calculates CLV for each segment
|
||||||
|
3. Identifies at-risk customers
|
||||||
|
4. Creates cohort retention analysis
|
||||||
|
5. Generates PDF report with all charts
|
||||||
|
```
|
||||||
|
|
||||||
|
### Data Quality First
|
||||||
|
|
||||||
|
```
|
||||||
|
Before running the analysis, check data quality:
|
||||||
|
1. Run data quality report
|
||||||
|
2. Fix any critical issues
|
||||||
|
3. Validate configuration
|
||||||
|
4. Then proceed with analysis
|
||||||
|
```
|
||||||
|
|
||||||
|
### Statistical Analysis
|
||||||
|
|
||||||
|
```
|
||||||
|
Add statistical analysis to [analysis].py:
|
||||||
|
- Calculate year-over-year growth with significance testing
|
||||||
|
- Show confidence intervals for forecasts
|
||||||
|
- Test differences between segments
|
||||||
|
- Use statistical_utils functions
|
||||||
|
```
|
||||||
|
|
||||||
|
## 💡 Pro Tips
|
||||||
|
|
||||||
|
### 1. Reference Existing Examples
|
||||||
|
|
||||||
|
```
|
||||||
|
Create an analysis similar to examples/customer_segmentation.py
|
||||||
|
but for product segmentation instead.
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. Use Template Utilities
|
||||||
|
|
||||||
|
```
|
||||||
|
Use the template's export_utils to save results to Excel,
|
||||||
|
and report_generator to create a PDF report.
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3. Leverage Cursor Rules
|
||||||
|
|
||||||
|
The AI automatically reads `.cursor/rules/` files, so you can say:
|
||||||
|
```
|
||||||
|
Follow the advanced_analysis_patterns.md guide to create
|
||||||
|
a price-volume-mix decomposition analysis.
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4. Iterative Development
|
||||||
|
|
||||||
|
```
|
||||||
|
Start with a basic version, then enhance it:
|
||||||
|
1. First version: Simple revenue trend
|
||||||
|
2. Add: Statistical significance
|
||||||
|
3. Add: Export functionality
|
||||||
|
4. Add: PDF report generation
|
||||||
|
```
|
||||||
|
|
||||||
|
## 🎨 Visualization Prompts
|
||||||
|
|
||||||
|
### Create Specific Chart Types
|
||||||
|
|
||||||
|
```
|
||||||
|
Create a heatmap showing [metric] across [dimension1] and [dimension2],
|
||||||
|
using seaborn and following template chart formatting.
|
||||||
|
```
|
||||||
|
|
||||||
|
```
|
||||||
|
Create an interactive Plotly chart for [analysis],
|
||||||
|
saving it as HTML using the template's interactive chart functions.
|
||||||
|
```
|
||||||
|
|
||||||
|
### Multi-Panel Visualizations
|
||||||
|
|
||||||
|
```
|
||||||
|
Create a 2x2 subplot showing:
|
||||||
|
- Top left: Revenue trend
|
||||||
|
- Top right: Customer count trend
|
||||||
|
- Bottom left: Average order value
|
||||||
|
- Bottom right: Growth rates
|
||||||
|
All using template chart formatting.
|
||||||
|
```
|
||||||
|
|
||||||
|
## 📊 Data Analysis Prompts
|
||||||
|
|
||||||
|
### Cohort Analysis
|
||||||
|
|
||||||
|
```
|
||||||
|
Create a cohort analysis that:
|
||||||
|
1. Groups customers by first purchase month
|
||||||
|
2. Tracks retention for 12 periods
|
||||||
|
3. Calculates revenue retention
|
||||||
|
4. Creates retention heatmap
|
||||||
|
5. Uses examples/cohort_analysis.py as reference
|
||||||
|
```
|
||||||
|
|
||||||
|
### Forecasting
|
||||||
|
|
||||||
|
```
|
||||||
|
Create a revenue forecasting analysis:
|
||||||
|
1. Prepare time series data
|
||||||
|
2. Fit trend model
|
||||||
|
3. Forecast next 12 months
|
||||||
|
4. Show confidence intervals
|
||||||
|
5. Use statistical_utils for calculations
|
||||||
|
```
|
||||||
|
|
||||||
|
### Segmentation
|
||||||
|
|
||||||
|
```
|
||||||
|
Create an advanced customer segmentation:
|
||||||
|
1. Calculate RFM scores
|
||||||
|
2. Apply clustering algorithm
|
||||||
|
3. Analyze segment characteristics
|
||||||
|
4. Create segment visualizations
|
||||||
|
5. Export segment data to Excel
|
||||||
|
```
|
||||||
|
|
||||||
|
## 🔍 Debugging with AI
|
||||||
|
|
||||||
|
### Fix Errors
|
||||||
|
|
||||||
|
```
|
||||||
|
I'm getting [error message] in [file].py.
|
||||||
|
Fix it using template best practices.
|
||||||
|
```
|
||||||
|
|
||||||
|
### Optimize Performance
|
||||||
|
|
||||||
|
```
|
||||||
|
Optimize [analysis].py for large datasets:
|
||||||
|
- Use efficient pandas operations
|
||||||
|
- Add progress indicators
|
||||||
|
- Consider data sampling if needed
|
||||||
|
```
|
||||||
|
|
||||||
|
### Improve Code Quality
|
||||||
|
|
||||||
|
```
|
||||||
|
Refactor [analysis].py to:
|
||||||
|
- Use more template utilities
|
||||||
|
- Follow template patterns better
|
||||||
|
- Add proper error handling
|
||||||
|
- Include logging
|
||||||
|
```
|
||||||
|
|
||||||
|
## 📝 Documentation Prompts
|
||||||
|
|
||||||
|
### Add Documentation
|
||||||
|
|
||||||
|
```
|
||||||
|
Add comprehensive docstrings to [analysis].py following
|
||||||
|
the template's documentation style.
|
||||||
|
```
|
||||||
|
|
||||||
|
### Create README
|
||||||
|
|
||||||
|
```
|
||||||
|
Create a README for [analysis].py explaining:
|
||||||
|
- What it does
|
||||||
|
- How to run it
|
||||||
|
- What outputs it generates
|
||||||
|
- Dependencies required
|
||||||
|
```
|
||||||
|
|
||||||
|
## 🎯 Best Practices for AI Interaction
|
||||||
|
|
||||||
|
1. **Be Specific:** Mention template files and utilities by name
|
||||||
|
2. **Reference Examples:** Point to existing examples when relevant
|
||||||
|
3. **Iterate:** Start simple, then add complexity
|
||||||
|
4. **Use Template Terms:** Mention "LTM", "config values", "template patterns"
|
||||||
|
5. **Ask for Validation:** Request data quality checks and validation
|
||||||
|
|
||||||
|
## Example Full Workflow
|
||||||
|
|
||||||
|
```
|
||||||
|
1. "Check my configuration using config_validator.py"
|
||||||
|
2. "Run data quality report on my data"
|
||||||
|
3. "Create a revenue trend analysis using template patterns"
|
||||||
|
4. "Add statistical significance testing to the analysis"
|
||||||
|
5. "Export results to Excel and generate PDF report"
|
||||||
|
6. "Create a cohort analysis similar to the example"
|
||||||
|
```
|
||||||
|
|
||||||
|
The AI will guide you through each step using template best practices.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
**Last Updated:** January 2026
|
||||||
|
**For:** Cursor AI users working with sales_analysis_template
|
||||||
161
.cursor/rules/analysis_patterns.md
Normal file
161
.cursor/rules/analysis_patterns.md
Normal file
@@ -0,0 +1,161 @@
|
|||||||
|
# Common Analysis Patterns
|
||||||
|
|
||||||
|
## ⭐ RECOMMENDED: Use Utilities
|
||||||
|
|
||||||
|
**Always prefer `analysis_utils.py` and `config.py` over manual implementations:**
|
||||||
|
- Consistent formatting
|
||||||
|
- Fewer errors
|
||||||
|
- Easier maintenance
|
||||||
|
- Standardized output
|
||||||
|
|
||||||
|
## Standard Script Structure (Using Utilities)
|
||||||
|
|
||||||
|
**RECOMMENDED:** Use `analysis_utils.py` and `config.py` for consistency:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# 1. IMPORTS
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
from data_loader import load_sales_data, validate_data_structure
|
||||||
|
from validate_revenue import validate_revenue
|
||||||
|
from analysis_utils import (
|
||||||
|
get_ltm_period_config, get_annual_data, calculate_annual_metrics,
|
||||||
|
get_millions_formatter, setup_revenue_chart, save_chart,
|
||||||
|
format_currency, print_annual_summary, sort_mixed_years,
|
||||||
|
apply_exclusion_filters
|
||||||
|
)
|
||||||
|
from config import (
|
||||||
|
DATA_FILE, OUTPUT_DIR, CHART_SIZES, ensure_directories,
|
||||||
|
get_data_path, REVENUE_COLUMN, COMPANY_NAME
|
||||||
|
)
|
||||||
|
|
||||||
|
# 2. LOAD DATA (ALWAYS use data_loader)
|
||||||
|
df = load_sales_data(get_data_path())
|
||||||
|
|
||||||
|
# 3. VALIDATE DATA STRUCTURE
|
||||||
|
is_valid, msg = validate_data_structure(df)
|
||||||
|
if not is_valid:
|
||||||
|
print(f"ERROR: {msg}")
|
||||||
|
return
|
||||||
|
|
||||||
|
# 4. APPLY EXCLUSION FILTERS (if configured)
|
||||||
|
df = apply_exclusion_filters(df)
|
||||||
|
|
||||||
|
# 5. SETUP LTM (if doing annual comparisons and LTM is enabled)
|
||||||
|
ltm_start, ltm_end = get_ltm_period_config()
|
||||||
|
|
||||||
|
# 6. DATA PREPARATION
|
||||||
|
# Convert columns, filter data, create derived columns
|
||||||
|
|
||||||
|
# 7. ANALYSIS LOGIC
|
||||||
|
# Use calculate_annual_metrics() for annual aggregations
|
||||||
|
|
||||||
|
# 8. VISUALIZATIONS
|
||||||
|
# Use setup_revenue_chart() and save_chart() from analysis_utils
|
||||||
|
|
||||||
|
# 9. VALIDATION
|
||||||
|
validate_revenue(df, "Analysis Name")
|
||||||
|
```
|
||||||
|
|
||||||
|
## Annual Aggregation Pattern
|
||||||
|
|
||||||
|
**RECOMMENDED:** Use `calculate_annual_metrics()` from `analysis_utils.py`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from analysis_utils import calculate_annual_metrics, get_ltm_period_config
|
||||||
|
from config import REVENUE_COLUMN
|
||||||
|
|
||||||
|
ltm_start, ltm_end = get_ltm_period_config()
|
||||||
|
|
||||||
|
def calculate_metrics(year_data):
|
||||||
|
"""Calculate metrics for a single year"""
|
||||||
|
return {
|
||||||
|
'Revenue': year_data[REVENUE_COLUMN].sum(),
|
||||||
|
# ... other metrics
|
||||||
|
}
|
||||||
|
|
||||||
|
annual_df = calculate_annual_metrics(df, calculate_metrics, ltm_start, ltm_end)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Chart Formatting Pattern
|
||||||
|
|
||||||
|
**ALWAYS use this pattern for revenue charts:**
|
||||||
|
|
||||||
|
```python
|
||||||
|
from analysis_utils import setup_revenue_chart, save_chart
|
||||||
|
from config import CHART_SIZES
|
||||||
|
|
||||||
|
fig, ax = plt.subplots(figsize=CHART_SIZES['medium'])
|
||||||
|
|
||||||
|
# Divide data by 1e6 BEFORE plotting
|
||||||
|
ax.plot(data / 1e6, ...)
|
||||||
|
# OR
|
||||||
|
ax.bar(x, values / 1e6, ...)
|
||||||
|
|
||||||
|
# Apply formatter automatically
|
||||||
|
setup_revenue_chart(ax)
|
||||||
|
|
||||||
|
# Save chart
|
||||||
|
save_chart(fig, 'chart_name.png')
|
||||||
|
plt.close()
|
||||||
|
```
|
||||||
|
|
||||||
|
## Mixed Type Handling
|
||||||
|
|
||||||
|
When dealing with year columns that may contain mixed int/str types (e.g., "2025 (LTM 9/2025)"):
|
||||||
|
|
||||||
|
```python
|
||||||
|
from analysis_utils import sort_mixed_years
|
||||||
|
|
||||||
|
# Sort DataFrame by year
|
||||||
|
df_sorted = sort_mixed_years(df, year_col='Year')
|
||||||
|
|
||||||
|
# For chart labels
|
||||||
|
years = df_sorted['Year'].tolist()
|
||||||
|
x_pos = range(len(years))
|
||||||
|
ax.set_xticks(x_pos)
|
||||||
|
ax.set_xticklabels(years, rotation=45, ha='right')
|
||||||
|
```
|
||||||
|
|
||||||
|
## Price Calculation Pattern
|
||||||
|
|
||||||
|
```python
|
||||||
|
from analysis_utils import calculate_price_per_unit
|
||||||
|
from config import QUANTITY_COLUMN, REVENUE_COLUMN
|
||||||
|
|
||||||
|
# Calculate average price per unit (excludes outliers automatically)
|
||||||
|
price_per_unit = calculate_price_per_unit(df, QUANTITY_COLUMN, REVENUE_COLUMN)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Exclusion Filters Pattern
|
||||||
|
|
||||||
|
If you need to exclude specific segments (e.g., test accounts, business units):
|
||||||
|
|
||||||
|
```python
|
||||||
|
from analysis_utils import apply_exclusion_filters
|
||||||
|
|
||||||
|
# Configure in config.py:
|
||||||
|
# EXCLUSION_FILTERS = {
|
||||||
|
# 'enabled': True,
|
||||||
|
# 'exclude_by_column': 'Country',
|
||||||
|
# 'exclude_values': ['KVT', 'Test']
|
||||||
|
# }
|
||||||
|
|
||||||
|
df = apply_exclusion_filters(df)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Using Configuration Values
|
||||||
|
|
||||||
|
**ALWAYS use config values instead of hardcoding:**
|
||||||
|
|
||||||
|
```python
|
||||||
|
from config import (
|
||||||
|
REVENUE_COLUMN, # Use this instead of 'USD' or 'Amount'
|
||||||
|
CUSTOMER_COLUMN, # Use this instead of 'Customer'
|
||||||
|
DATE_COLUMN, # Use this instead of 'InvoiceDate'
|
||||||
|
COMPANY_NAME, # Use this for titles
|
||||||
|
ANALYSIS_YEARS, # Use this for year filtering
|
||||||
|
CHART_SIZES, # Use this for figure sizes
|
||||||
|
)
|
||||||
|
```
|
||||||
111
.cursor/rules/chart_formatting.md
Normal file
111
.cursor/rules/chart_formatting.md
Normal file
@@ -0,0 +1,111 @@
|
|||||||
|
# Chart Formatting Rules
|
||||||
|
|
||||||
|
## ⭐ RECOMMENDED: Use analysis_utils.py
|
||||||
|
|
||||||
|
**Prefer utility functions:**
|
||||||
|
```python
|
||||||
|
from analysis_utils import setup_revenue_chart, save_chart, get_millions_formatter
|
||||||
|
from config import CHART_SIZES, OUTPUT_DIR
|
||||||
|
|
||||||
|
fig, ax = plt.subplots(figsize=CHART_SIZES['medium'])
|
||||||
|
ax.plot(data / 1e6, ...)
|
||||||
|
setup_revenue_chart(ax) # Applies formatter automatically
|
||||||
|
save_chart(fig, 'chart.png') # Saves to charts/ directory
|
||||||
|
```
|
||||||
|
|
||||||
|
## Revenue Charts: Millions Formatter
|
||||||
|
|
||||||
|
**ALWAYS use this pattern for revenue charts:**
|
||||||
|
|
||||||
|
```python
|
||||||
|
from analysis_utils import setup_revenue_chart
|
||||||
|
|
||||||
|
# Divide data by 1e6 BEFORE plotting
|
||||||
|
ax.plot(data / 1e6, ...)
|
||||||
|
# OR
|
||||||
|
ax.bar(x, values / 1e6, ...)
|
||||||
|
|
||||||
|
# Apply formatter automatically
|
||||||
|
setup_revenue_chart(ax)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Manual approach (if not using utilities):**
|
||||||
|
```python
|
||||||
|
from matplotlib.ticker import FuncFormatter
|
||||||
|
|
||||||
|
def millions_formatter(x, pos):
|
||||||
|
return f'${x:.1f}m'
|
||||||
|
|
||||||
|
ax.plot(data / 1e6, ...)
|
||||||
|
ax.yaxis.set_major_formatter(FuncFormatter(millions_formatter))
|
||||||
|
ax.set_ylabel('Revenue (Millions USD)')
|
||||||
|
```
|
||||||
|
|
||||||
|
## Thousands Formatter (for smaller values)
|
||||||
|
|
||||||
|
```python
|
||||||
|
from analysis_utils import get_thousands_formatter
|
||||||
|
|
||||||
|
ax.xaxis.set_major_formatter(get_thousands_formatter())
|
||||||
|
ax.barh(x, values / 1e3, ...)
|
||||||
|
ax.set_xlabel('Value (Thousands USD)')
|
||||||
|
```
|
||||||
|
|
||||||
|
## Chart Labeling with LTM
|
||||||
|
|
||||||
|
**If LTM is enabled, ALWAYS include LTM notation:**
|
||||||
|
|
||||||
|
```python
|
||||||
|
from config import get_ltm_label, COMPANY_NAME
|
||||||
|
|
||||||
|
title = f'Annual Revenue Trend - {COMPANY_NAME}'
|
||||||
|
ltm_label = get_ltm_label()
|
||||||
|
if ltm_label:
|
||||||
|
title += f'\n({ltm_label})'
|
||||||
|
ax.set_title(title)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Chart Sizes
|
||||||
|
|
||||||
|
**Use predefined sizes from config:**
|
||||||
|
```python
|
||||||
|
from config import CHART_SIZES
|
||||||
|
|
||||||
|
fig, ax = plt.subplots(figsize=CHART_SIZES['medium']) # (10, 6)
|
||||||
|
# Options: 'small' (6, 4), 'medium' (10, 6), 'large' (12, 8), 'wide' (14, 6)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Common Mistakes
|
||||||
|
|
||||||
|
❌ **WRONG:**
|
||||||
|
```python
|
||||||
|
ax.plot(revenue, ...) # Shows scientific notation (1e8)
|
||||||
|
```
|
||||||
|
|
||||||
|
✅ **CORRECT:**
|
||||||
|
```python
|
||||||
|
ax.plot(revenue / 1e6, ...) # Divide first
|
||||||
|
setup_revenue_chart(ax) # Then format
|
||||||
|
```
|
||||||
|
|
||||||
|
## Saving Charts
|
||||||
|
|
||||||
|
**ALWAYS use save_chart() utility:**
|
||||||
|
```python
|
||||||
|
from analysis_utils import save_chart
|
||||||
|
|
||||||
|
save_chart(fig, 'chart_name.png') # Saves to charts/ with proper settings
|
||||||
|
plt.close() # Don't forget to close!
|
||||||
|
```
|
||||||
|
|
||||||
|
## Chart Styling
|
||||||
|
|
||||||
|
**Configure style in config.py:**
|
||||||
|
```python
|
||||||
|
# In config.py:
|
||||||
|
CHART_STYLE = 'seaborn-v0_8' # Options: 'default', 'ggplot', 'seaborn-v0_8'
|
||||||
|
|
||||||
|
# In your script:
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
plt.style.use(CHART_STYLE) # Apply before creating figures
|
||||||
|
```
|
||||||
389
.cursor/rules/code_quality.md
Normal file
389
.cursor/rules/code_quality.md
Normal file
@@ -0,0 +1,389 @@
|
|||||||
|
# Code Quality & Best Practices
|
||||||
|
|
||||||
|
**Comprehensive guide for writing Cursor-optimized code in the sales analysis template.**
|
||||||
|
|
||||||
|
This document combines code quality standards and Cursor best practices to ensure AI assistants can effectively understand, modify, and extend the codebase.
|
||||||
|
|
||||||
|
## Type Hints
|
||||||
|
|
||||||
|
### When to Use Type Hints
|
||||||
|
|
||||||
|
Use type hints for:
|
||||||
|
- Function parameters
|
||||||
|
- Return values
|
||||||
|
- Class attributes
|
||||||
|
- Complex data structures
|
||||||
|
|
||||||
|
### Example Pattern
|
||||||
|
|
||||||
|
```python
|
||||||
|
from typing import Dict, List, Optional, Tuple
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
def calculate_annual_metrics(
|
||||||
|
df: pd.DataFrame,
|
||||||
|
metrics_func: callable,
|
||||||
|
ltm_start: Optional[pd.Period] = None,
|
||||||
|
ltm_end: Optional[pd.Period] = None
|
||||||
|
) -> pd.DataFrame:
|
||||||
|
"""
|
||||||
|
Calculate annual metrics for all years
|
||||||
|
|
||||||
|
Args:
|
||||||
|
df: DataFrame with 'Year' and 'YearMonth' columns
|
||||||
|
metrics_func: Function that takes a DataFrame and returns a dict of metrics
|
||||||
|
ltm_start: LTM start period (defaults to config if None)
|
||||||
|
ltm_end: LTM end period (defaults to config if None)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
DataFrame with 'Year' index and metric columns
|
||||||
|
"""
|
||||||
|
# Implementation
|
||||||
|
```
|
||||||
|
|
||||||
|
## Docstrings
|
||||||
|
|
||||||
|
### Docstring Format
|
||||||
|
|
||||||
|
All functions should use Google-style docstrings:
|
||||||
|
|
||||||
|
```python
|
||||||
|
def function_name(param1: type, param2: type) -> return_type:
|
||||||
|
"""
|
||||||
|
Brief description of what the function does.
|
||||||
|
|
||||||
|
More detailed explanation if needed. Can span multiple lines.
|
||||||
|
Explain any complex logic or important considerations.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
param1: Description of param1
|
||||||
|
param2: Description of param2
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Description of return value
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
ValueError: When and why this exception is raised
|
||||||
|
|
||||||
|
Example:
|
||||||
|
>>> result = function_name(value1, value2)
|
||||||
|
>>> print(result)
|
||||||
|
expected_output
|
||||||
|
"""
|
||||||
|
```
|
||||||
|
|
||||||
|
### Required Elements
|
||||||
|
|
||||||
|
- Brief one-line summary
|
||||||
|
- Detailed description (if needed)
|
||||||
|
- Args section (all parameters)
|
||||||
|
- Returns section (return value)
|
||||||
|
- Raises section (if exceptions raised)
|
||||||
|
- Example section (for complex functions)
|
||||||
|
|
||||||
|
## Variable Naming
|
||||||
|
|
||||||
|
### Conventions
|
||||||
|
|
||||||
|
- **Descriptive names:** `customer_revenue` not `cr`
|
||||||
|
- **Consistent prefixes:** `df_` for DataFrames, `annual_` for annual metrics
|
||||||
|
- **Clear abbreviations:** `ltm` for Last Twelve Months (well-known)
|
||||||
|
- **Avoid single letters:** Except for loop variables (`i`, `j`, `k`)
|
||||||
|
|
||||||
|
### Good Examples
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Good
|
||||||
|
customer_revenue_by_year = df.groupby(['Customer', 'Year'])[REVENUE_COLUMN].sum()
|
||||||
|
annual_metrics_df = calculate_annual_metrics(df, metrics_func)
|
||||||
|
ltm_start_period, ltm_end_period = get_ltm_period_config()
|
||||||
|
|
||||||
|
# Bad
|
||||||
|
cr = df.groupby(['C', 'Y'])['R'].sum()
|
||||||
|
am = calc(df, mf)
|
||||||
|
s, e = get_ltm()
|
||||||
|
```
|
||||||
|
|
||||||
|
## Error Messages
|
||||||
|
|
||||||
|
### Structure
|
||||||
|
|
||||||
|
Error messages should be:
|
||||||
|
1. **Specific:** What exactly went wrong
|
||||||
|
2. **Actionable:** How to fix it
|
||||||
|
3. **Contextual:** Where it occurred
|
||||||
|
4. **Helpful:** Reference to documentation
|
||||||
|
|
||||||
|
### Good Error Messages
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Good
|
||||||
|
raise ValueError(
|
||||||
|
f"Required column '{REVENUE_COLUMN}' not found in data.\n"
|
||||||
|
f"Available columns: {list(df.columns)}\n"
|
||||||
|
f"Please update config.py REVENUE_COLUMN to match your data.\n"
|
||||||
|
f"See .cursor/rules/data_loading.md for more help."
|
||||||
|
)
|
||||||
|
|
||||||
|
# Bad
|
||||||
|
raise ValueError("Column not found")
|
||||||
|
```
|
||||||
|
|
||||||
|
## Code Comments
|
||||||
|
|
||||||
|
### When to Comment
|
||||||
|
|
||||||
|
- Complex logic that isn't immediately obvious
|
||||||
|
- Business rules or domain-specific knowledge
|
||||||
|
- Workarounds or non-obvious solutions
|
||||||
|
- Performance considerations
|
||||||
|
- TODO items with context
|
||||||
|
|
||||||
|
### Comment Style
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Good: Explains WHY, not WHAT
|
||||||
|
# Use LTM for most recent year to enable apples-to-apples comparison
|
||||||
|
# with full calendar years (avoids partial year bias)
|
||||||
|
if year == LTM_END_YEAR and LTM_ENABLED:
|
||||||
|
year_data = get_ltm_data(df, ltm_start, ltm_end)
|
||||||
|
|
||||||
|
# Bad: States the obvious
|
||||||
|
# Check if year equals LTM_END_YEAR
|
||||||
|
if year == LTM_END_YEAR:
|
||||||
|
```
|
||||||
|
|
||||||
|
## Function Design
|
||||||
|
|
||||||
|
### Single Responsibility
|
||||||
|
|
||||||
|
Each function should do one thing well:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Good: Single responsibility
|
||||||
|
def calculate_revenue(df: pd.DataFrame) -> float:
|
||||||
|
"""Calculate total revenue from DataFrame"""
|
||||||
|
return df[REVENUE_COLUMN].sum()
|
||||||
|
|
||||||
|
def calculate_customer_count(df: pd.DataFrame) -> int:
|
||||||
|
"""Calculate unique customer count"""
|
||||||
|
return df[CUSTOMER_COLUMN].nunique()
|
||||||
|
|
||||||
|
# Bad: Multiple responsibilities
|
||||||
|
def calculate_metrics(df):
|
||||||
|
"""Calculate revenue and customer count"""
|
||||||
|
revenue = df[REVENUE_COLUMN].sum()
|
||||||
|
customers = df[CUSTOMER_COLUMN].nunique()
|
||||||
|
return revenue, customers
|
||||||
|
```
|
||||||
|
|
||||||
|
### Function Length
|
||||||
|
|
||||||
|
- Keep functions under 50 lines when possible
|
||||||
|
- Break complex functions into smaller helper functions
|
||||||
|
- Use descriptive function names that explain purpose
|
||||||
|
|
||||||
|
## Import Organization
|
||||||
|
|
||||||
|
### Standard Order
|
||||||
|
|
||||||
|
1. Standard library imports
|
||||||
|
2. Third-party imports (pandas, numpy, matplotlib)
|
||||||
|
3. Local/template imports (data_loader, analysis_utils, config)
|
||||||
|
|
||||||
|
### Example
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Standard library
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Dict, Optional
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
# Third-party
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
|
||||||
|
# Template imports
|
||||||
|
from data_loader import load_sales_data, validate_data_structure
|
||||||
|
from analysis_utils import calculate_annual_metrics, setup_revenue_chart
|
||||||
|
from config import REVENUE_COLUMN, CHART_SIZES, COMPANY_NAME
|
||||||
|
```
|
||||||
|
|
||||||
|
## Constants and Configuration
|
||||||
|
|
||||||
|
### Use Config Values
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Good: From config
|
||||||
|
from config import REVENUE_COLUMN, DATE_COLUMN
|
||||||
|
revenue = df[REVENUE_COLUMN].sum()
|
||||||
|
|
||||||
|
# Bad: Hardcoded
|
||||||
|
revenue = df['USD'].sum()
|
||||||
|
```
|
||||||
|
|
||||||
|
### Magic Numbers
|
||||||
|
|
||||||
|
Avoid magic numbers - use named constants or config:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Good: Named constant
|
||||||
|
MILLIONS_DIVISOR = 1e6
|
||||||
|
revenue_millions = revenue / MILLIONS_DIVISOR
|
||||||
|
|
||||||
|
# Or from config
|
||||||
|
CHART_DPI = 300 # In config.py
|
||||||
|
|
||||||
|
# Bad: Magic number
|
||||||
|
revenue_millions = revenue / 1000000
|
||||||
|
```
|
||||||
|
|
||||||
|
## Testing Considerations
|
||||||
|
|
||||||
|
### Testable Code
|
||||||
|
|
||||||
|
Write code that's easy to test:
|
||||||
|
- Pure functions when possible (no side effects)
|
||||||
|
- Dependency injection for external dependencies
|
||||||
|
- Clear inputs and outputs
|
||||||
|
|
||||||
|
### Example
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Good: Testable
|
||||||
|
def calculate_metrics(year_data: pd.DataFrame, revenue_col: str) -> Dict:
|
||||||
|
"""Calculate metrics - easy to test with sample data"""
|
||||||
|
return {
|
||||||
|
'Revenue': year_data[revenue_col].sum(),
|
||||||
|
'Count': len(year_data)
|
||||||
|
}
|
||||||
|
|
||||||
|
# Harder to test: Depends on global config
|
||||||
|
def calculate_metrics(year_data):
|
||||||
|
"""Uses global REVENUE_COLUMN - harder to test"""
|
||||||
|
return {'Revenue': year_data[REVENUE_COLUMN].sum()}
|
||||||
|
```
|
||||||
|
|
||||||
|
## AI-Friendly Patterns
|
||||||
|
|
||||||
|
### Clear Intent
|
||||||
|
|
||||||
|
Code should clearly express intent:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Good: Intent is clear
|
||||||
|
customers_with_revenue = df[df[REVENUE_COLUMN] > 0][CUSTOMER_COLUMN].unique()
|
||||||
|
|
||||||
|
# Less clear: Requires understanding of pandas
|
||||||
|
customers_with_revenue = df.loc[df[REVENUE_COLUMN] > 0, CUSTOMER_COLUMN].unique()
|
||||||
|
```
|
||||||
|
|
||||||
|
### Explicit Over Implicit
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Good: Explicit
|
||||||
|
if LTM_ENABLED and ltm_start is not None and ltm_end is not None:
|
||||||
|
use_ltm = True
|
||||||
|
else:
|
||||||
|
use_ltm = False
|
||||||
|
|
||||||
|
# Less clear: Implicit truthiness
|
||||||
|
use_ltm = LTM_ENABLED and ltm_start and ltm_end
|
||||||
|
```
|
||||||
|
|
||||||
|
## Documentation for AI
|
||||||
|
|
||||||
|
### Help AI Understand Context
|
||||||
|
|
||||||
|
Add comments that help AI understand business context:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# LTM (Last Twelve Months) is used for the most recent partial year
|
||||||
|
# to enable fair comparison with full calendar years.
|
||||||
|
# Example: If latest data is through Sep 2025, use Oct 2024 - Sep 2025
|
||||||
|
if year == LTM_END_YEAR and LTM_ENABLED:
|
||||||
|
# Use 12-month rolling period instead of partial calendar year
|
||||||
|
year_data = get_ltm_data(df, ltm_start, ltm_end)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Cursor-Specific Optimizations
|
||||||
|
|
||||||
|
### AI-Friendly Code Structure
|
||||||
|
|
||||||
|
Code should be structured so Cursor AI can:
|
||||||
|
1. **Understand intent** - Clear function names and comments
|
||||||
|
2. **Generate code** - Follow established patterns
|
||||||
|
3. **Fix errors** - Actionable error messages
|
||||||
|
4. **Extend functionality** - Modular, reusable functions
|
||||||
|
|
||||||
|
### Example: AI-Generated Code Pattern
|
||||||
|
|
||||||
|
When AI generates code, it should automatically:
|
||||||
|
```python
|
||||||
|
# AI recognizes this pattern and replicates it
|
||||||
|
def main():
|
||||||
|
# 1. Load data (AI knows to use data_loader)
|
||||||
|
df = load_sales_data(get_data_path())
|
||||||
|
|
||||||
|
# 2. Validate (AI knows to check structure)
|
||||||
|
is_valid, msg = validate_data_structure(df)
|
||||||
|
if not is_valid:
|
||||||
|
print(f"ERROR: {msg}")
|
||||||
|
return
|
||||||
|
|
||||||
|
# 3. Apply filters (AI knows exclusion filters)
|
||||||
|
df = apply_exclusion_filters(df)
|
||||||
|
|
||||||
|
# 4. Analysis logic (AI follows template patterns)
|
||||||
|
# ...
|
||||||
|
|
||||||
|
# 5. Create charts (AI knows formatting rules)
|
||||||
|
# ...
|
||||||
|
|
||||||
|
# 6. Validate revenue (AI knows to validate)
|
||||||
|
validate_revenue(df, ANALYSIS_NAME)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Help AI Generate Better Code
|
||||||
|
|
||||||
|
Add context comments that help AI:
|
||||||
|
```python
|
||||||
|
# LTM (Last Twelve Months) is used for the most recent partial year
|
||||||
|
# to enable fair comparison with full calendar years.
|
||||||
|
# Example: If latest data is through Sep 2025, use Oct 2024 - Sep 2025
|
||||||
|
# This avoids partial-year bias in year-over-year comparisons.
|
||||||
|
if year == LTM_END_YEAR and LTM_ENABLED:
|
||||||
|
# Use 12-month rolling period instead of partial calendar year
|
||||||
|
year_data = get_ltm_data(df, ltm_start, ltm_end)
|
||||||
|
year_label = get_ltm_label() # Returns "2025 (LTM 9/2025)"
|
||||||
|
```
|
||||||
|
|
||||||
|
## Summary Checklist
|
||||||
|
|
||||||
|
For Cursor-optimized code:
|
||||||
|
- ✅ Comprehensive docstrings with examples
|
||||||
|
- ✅ Type hints on functions
|
||||||
|
- ✅ Descriptive variable names
|
||||||
|
- ✅ Clear comments for business logic
|
||||||
|
- ✅ Structured error messages
|
||||||
|
- ✅ Consistent code patterns
|
||||||
|
- ✅ Use config values (never hardcode)
|
||||||
|
- ✅ Follow template utilities
|
||||||
|
- ✅ Include validation steps
|
||||||
|
- ✅ Reference documentation
|
||||||
|
|
||||||
|
## Summary
|
||||||
|
|
||||||
|
Follow these standards to ensure:
|
||||||
|
1. AI can understand code structure
|
||||||
|
2. AI can modify code safely
|
||||||
|
3. AI can generate new code following patterns
|
||||||
|
4. Code is maintainable and readable
|
||||||
|
5. Errors are clear and actionable
|
||||||
|
6. Cursor AI can assist effectively
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
**Last Updated:** January 2026
|
||||||
|
**For:** Cursor AI optimization and human developers
|
||||||
109
.cursor/rules/common_errors.md
Normal file
109
.cursor/rules/common_errors.md
Normal file
@@ -0,0 +1,109 @@
|
|||||||
|
# Common Errors and Troubleshooting
|
||||||
|
|
||||||
|
**Quick reference for fixing common issues. For error handling patterns when writing code, see `error_handling.md`.**
|
||||||
|
|
||||||
|
## Data Loading Errors
|
||||||
|
|
||||||
|
### Error: "Data file not found"
|
||||||
|
**Cause:** DATA_FILE path in config.py is incorrect
|
||||||
|
**Fix:**
|
||||||
|
1. Check that your CSV file exists
|
||||||
|
2. Update `DATA_FILE` in config.py with correct filename
|
||||||
|
3. If file is in a subdirectory, set `DATA_DIR` in config.py
|
||||||
|
|
||||||
|
### Error: "Required column 'USD' not found"
|
||||||
|
**Cause:** Column name in data doesn't match config
|
||||||
|
**Fix:**
|
||||||
|
1. Check your CSV column names
|
||||||
|
2. Update `REVENUE_COLUMN` in config.py to match your data
|
||||||
|
3. Update other column mappings (DATE_COLUMN, CUSTOMER_COLUMN, etc.)
|
||||||
|
|
||||||
|
### Error: "All InvoiceDate values are NaN"
|
||||||
|
**Cause:** Date column parsing failed
|
||||||
|
**Fix:**
|
||||||
|
1. Check date format in your CSV
|
||||||
|
2. Add fallback date columns to `DATE_FALLBACK_COLUMNS` in config.py
|
||||||
|
3. Ensure at least one date column exists (Month, Year, etc.)
|
||||||
|
|
||||||
|
## Analysis Errors
|
||||||
|
|
||||||
|
### Error: "DataFrame is empty" after filtering
|
||||||
|
**Cause:** Date range or year filters too restrictive
|
||||||
|
**Fix:**
|
||||||
|
1. Check `MIN_YEAR` and `MAX_DATE` in config.py
|
||||||
|
2. Check `ANALYSIS_YEARS` includes years in your data
|
||||||
|
3. Verify date parsing worked (check data_loader output)
|
||||||
|
|
||||||
|
### Error: Charts show scientific notation (1e8)
|
||||||
|
**Cause:** Forgot to divide by 1e6 before plotting
|
||||||
|
**Fix:**
|
||||||
|
```python
|
||||||
|
# WRONG:
|
||||||
|
ax.plot(revenue, ...)
|
||||||
|
|
||||||
|
# CORRECT:
|
||||||
|
ax.plot(revenue / 1e6, ...)
|
||||||
|
setup_revenue_chart(ax)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Error: "Year column has mixed types"
|
||||||
|
**Cause:** LTM year is string "2025 (LTM 9/2025)" while others are int
|
||||||
|
**Fix:**
|
||||||
|
```python
|
||||||
|
from analysis_utils import sort_mixed_years
|
||||||
|
df_sorted = sort_mixed_years(df, year_col='Year')
|
||||||
|
```
|
||||||
|
|
||||||
|
## Configuration Errors
|
||||||
|
|
||||||
|
### Error: LTM not working correctly
|
||||||
|
**Cause:** LTM configuration incorrect
|
||||||
|
**Fix:**
|
||||||
|
1. Check `LTM_ENABLED = True` in config.py
|
||||||
|
2. Verify `LTM_START_MONTH`, `LTM_START_YEAR`, `LTM_END_MONTH`, `LTM_END_YEAR`
|
||||||
|
3. Ensure dates are within your data range
|
||||||
|
|
||||||
|
### Error: Exclusion filters not working
|
||||||
|
**Cause:** Filter configuration incorrect
|
||||||
|
**Fix:**
|
||||||
|
1. Check `EXCLUSION_FILTERS['enabled'] = True`
|
||||||
|
2. Verify `exclude_by_column` matches a column in your data
|
||||||
|
3. Check `exclude_values` list is correct
|
||||||
|
|
||||||
|
## Import Errors
|
||||||
|
|
||||||
|
### Error: "No module named 'config'"
|
||||||
|
**Cause:** Running script from wrong directory
|
||||||
|
**Fix:**
|
||||||
|
1. Run scripts from template root directory
|
||||||
|
2. Or add template directory to Python path
|
||||||
|
|
||||||
|
### Error: "No module named 'data_loader'"
|
||||||
|
**Cause:** Missing import or wrong directory
|
||||||
|
**Fix:**
|
||||||
|
1. Ensure all template files are in the same directory
|
||||||
|
2. Check import statements match file names
|
||||||
|
|
||||||
|
## Best Practices to Avoid Errors
|
||||||
|
|
||||||
|
1. **Always use utilities:** Use `analysis_utils.py` functions instead of manual code
|
||||||
|
2. **Validate data:** Run `validate_data_structure()` after loading
|
||||||
|
3. **Check config:** Verify all column names match your data (use `config_validator.py`)
|
||||||
|
4. **Test incrementally:** Test data loading before running full analysis
|
||||||
|
5. **Read error messages:** They usually tell you exactly what's wrong
|
||||||
|
6. **Use Cursor AI:** Ask AI to fix errors - it knows template patterns
|
||||||
|
|
||||||
|
## Using Cursor AI to Fix Errors
|
||||||
|
|
||||||
|
When you encounter an error, ask Cursor AI:
|
||||||
|
```
|
||||||
|
"Fix this error: [paste error message]"
|
||||||
|
```
|
||||||
|
|
||||||
|
The AI will:
|
||||||
|
- ✅ Understand the error context
|
||||||
|
- ✅ Reference template patterns
|
||||||
|
- ✅ Suggest specific fixes
|
||||||
|
- ✅ Use template utilities correctly
|
||||||
|
|
||||||
|
**See also:** `.cursor/rules/error_handling.md` for how to write error messages that help AI fix issues.
|
||||||
69
.cursor/rules/data_loading.md
Normal file
69
.cursor/rules/data_loading.md
Normal file
@@ -0,0 +1,69 @@
|
|||||||
|
# Data Loading Rules
|
||||||
|
|
||||||
|
## CRITICAL: Always Use data_loader.py
|
||||||
|
|
||||||
|
**NEVER load data directly with `pd.read_csv()`. Always use:**
|
||||||
|
|
||||||
|
```python
|
||||||
|
from data_loader import load_sales_data
|
||||||
|
from config import get_data_path
|
||||||
|
df = load_sales_data(get_data_path())
|
||||||
|
```
|
||||||
|
|
||||||
|
## Why This Matters
|
||||||
|
|
||||||
|
The `data_loader.py` implements intelligent fallback logic to ensure 100% date coverage:
|
||||||
|
|
||||||
|
1. **Primary:** Parse primary date column (from config.DATE_COLUMN)
|
||||||
|
2. **Fallback 1:** Use fallback date columns if primary is missing (from config.DATE_FALLBACK_COLUMNS)
|
||||||
|
3. **Fallback 2:** Use Year column if both missing
|
||||||
|
4. **Result:** Maximum date coverage possible
|
||||||
|
|
||||||
|
## What data_loader.py Provides
|
||||||
|
|
||||||
|
- **Date Column:** Properly parsed datetime with fallback logic
|
||||||
|
- **Year:** Extracted year (100% coverage via fallback)
|
||||||
|
- **YearMonth:** Period format for monthly aggregations
|
||||||
|
- **Revenue Column:** Converted to numeric (from config.REVENUE_COLUMN)
|
||||||
|
|
||||||
|
## Column Configuration
|
||||||
|
|
||||||
|
Before using, configure column names in `config.py`:
|
||||||
|
- `REVENUE_COLUMN`: Your revenue/amount column name
|
||||||
|
- `DATE_COLUMN`: Primary date column name
|
||||||
|
- `DATE_FALLBACK_COLUMNS`: List of fallback date columns
|
||||||
|
- `CUSTOMER_COLUMN`: Customer/account column name
|
||||||
|
- Other columns as needed
|
||||||
|
|
||||||
|
## Common Mistakes
|
||||||
|
|
||||||
|
❌ **WRONG:**
|
||||||
|
```python
|
||||||
|
df = pd.read_csv('sales_data.csv')
|
||||||
|
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
|
||||||
|
df = df.dropna(subset=['Date']) # May drop significant data!
|
||||||
|
```
|
||||||
|
|
||||||
|
✅ **CORRECT:**
|
||||||
|
```python
|
||||||
|
from data_loader import load_sales_data
|
||||||
|
from config import get_data_path
|
||||||
|
df = load_sales_data(get_data_path()) # Uses fallback logic
|
||||||
|
```
|
||||||
|
|
||||||
|
## Data File Location
|
||||||
|
|
||||||
|
The data file path is configured in `config.py`:
|
||||||
|
- `DATA_FILE`: Filename (e.g., 'sales_data.csv')
|
||||||
|
- `DATA_DIR`: Optional subdirectory (defaults to current directory)
|
||||||
|
- Use `get_data_path()` to get the full path
|
||||||
|
|
||||||
|
## Validation
|
||||||
|
|
||||||
|
After loading, validate data structure:
|
||||||
|
```python
|
||||||
|
from data_loader import validate_data_structure
|
||||||
|
is_valid, msg = validate_data_structure(df)
|
||||||
|
if not is_valid:
|
||||||
|
print(f"ERROR: {msg}")
|
||||||
|
```
|
||||||
276
.cursor/rules/error_handling.md
Normal file
276
.cursor/rules/error_handling.md
Normal file
@@ -0,0 +1,276 @@
|
|||||||
|
# Error Handling Best Practices
|
||||||
|
|
||||||
|
This guide defines how to handle errors in a way that's helpful for both users and AI assistants.
|
||||||
|
|
||||||
|
## Error Message Structure
|
||||||
|
|
||||||
|
### Required Elements
|
||||||
|
|
||||||
|
Every error message should include:
|
||||||
|
1. **What went wrong** - Specific error description
|
||||||
|
2. **Where it occurred** - File/function context
|
||||||
|
3. **Why it happened** - Root cause explanation
|
||||||
|
4. **How to fix** - Actionable steps
|
||||||
|
5. **Reference** - Link to relevant documentation
|
||||||
|
|
||||||
|
### Template
|
||||||
|
|
||||||
|
```python
|
||||||
|
raise ErrorType(
|
||||||
|
f"[What] - [Specific description]\n"
|
||||||
|
f"\n"
|
||||||
|
f"Context: [Where/When this occurred]\n"
|
||||||
|
f"Reason: [Why this happened]\n"
|
||||||
|
f"\n"
|
||||||
|
f"Solution:\n"
|
||||||
|
f"1. [Step 1]\n"
|
||||||
|
f"2. [Step 2]\n"
|
||||||
|
f"\n"
|
||||||
|
f"For more help, see: [Documentation reference]"
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Common Error Patterns
|
||||||
|
|
||||||
|
### Data Loading Errors
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Good: Comprehensive error message
|
||||||
|
if REVENUE_COLUMN not in df.columns:
|
||||||
|
available_cols = list(df.columns)[:10] # Show first 10
|
||||||
|
raise ValueError(
|
||||||
|
f"Required column '{REVENUE_COLUMN}' not found in data.\n"
|
||||||
|
f"\n"
|
||||||
|
f"Context: Loading data from {filepath}\n"
|
||||||
|
f"Available columns: {available_cols}\n"
|
||||||
|
f"\n"
|
||||||
|
f"Solution:\n"
|
||||||
|
f"1. Check your CSV file column names\n"
|
||||||
|
f"2. Update REVENUE_COLUMN in config.py to match your data\n"
|
||||||
|
f"3. Run: python config_validator.py to validate configuration\n"
|
||||||
|
f"\n"
|
||||||
|
f"For more help, see: .cursor/rules/data_loading.md"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Bad: Vague error
|
||||||
|
if REVENUE_COLUMN not in df.columns:
|
||||||
|
raise ValueError("Column not found")
|
||||||
|
```
|
||||||
|
|
||||||
|
### Configuration Errors
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Good: Actionable error
|
||||||
|
if LTM_ENABLED and (LTM_START is None or LTM_END is None):
|
||||||
|
raise ValueError(
|
||||||
|
f"LTM configuration error: LTM_ENABLED is True but LTM period is not set.\n"
|
||||||
|
f"\n"
|
||||||
|
f"Context: Configuration in config.py\n"
|
||||||
|
f"Current values: LTM_ENABLED={LTM_ENABLED}, LTM_START={LTM_START}, LTM_END={LTM_END}\n"
|
||||||
|
f"\n"
|
||||||
|
f"Solution:\n"
|
||||||
|
f"1. Set LTM_START_MONTH, LTM_START_YEAR, LTM_END_MONTH, LTM_END_YEAR in config.py\n"
|
||||||
|
f"2. Or set LTM_ENABLED = False if you don't need LTM\n"
|
||||||
|
f"3. Run: python config_validator.py to check configuration\n"
|
||||||
|
f"\n"
|
||||||
|
f"For more help, see: .cursor/rules/ltm_methodology.md"
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Data Quality Errors
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Good: Helpful data quality error
|
||||||
|
if date_coverage < 0.5: # Less than 50% coverage
|
||||||
|
raise ValueError(
|
||||||
|
f"Data quality issue: Only {date_coverage:.1%} of rows have valid dates.\n"
|
||||||
|
f"\n"
|
||||||
|
f"Context: Date parsing in data_loader.py\n"
|
||||||
|
f"Rows with dates: {date_count:,} / {total_rows:,}\n"
|
||||||
|
f"\n"
|
||||||
|
f"Solution:\n"
|
||||||
|
f"1. Check date format in your CSV file\n"
|
||||||
|
f"2. Add fallback date columns to DATE_FALLBACK_COLUMNS in config.py\n"
|
||||||
|
f"3. Ensure at least one date column (Month, Year) exists\n"
|
||||||
|
f"4. Run: python data_quality.py to analyze data quality\n"
|
||||||
|
f"\n"
|
||||||
|
f"For more help, see: .cursor/rules/data_loading.md"
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Error Handling Patterns
|
||||||
|
|
||||||
|
### Try-Except with Context
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Good: Provides context and recovery options
|
||||||
|
try:
|
||||||
|
df = load_sales_data(get_data_path())
|
||||||
|
except FileNotFoundError as e:
|
||||||
|
error_msg = (
|
||||||
|
f"Data file not found: {e}\n"
|
||||||
|
f"\n"
|
||||||
|
f"Context: Attempting to load data for analysis\n"
|
||||||
|
f"Expected file: {get_data_path()}\n"
|
||||||
|
f"\n"
|
||||||
|
f"Solution:\n"
|
||||||
|
f"1. Check that your CSV file exists at the expected location\n"
|
||||||
|
f"2. Update DATA_FILE in config.py with correct filename\n"
|
||||||
|
f"3. Or update DATA_DIR if file is in a subdirectory\n"
|
||||||
|
f"4. Run: python setup_wizard.py to reconfigure\n"
|
||||||
|
f"\n"
|
||||||
|
f"For more help, see: .cursor/rules/common_errors.md"
|
||||||
|
)
|
||||||
|
raise FileNotFoundError(error_msg) from e
|
||||||
|
```
|
||||||
|
|
||||||
|
### Validation with Helpful Messages
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Good: Validates and provides specific guidance
|
||||||
|
def validate_data_structure(df: pd.DataFrame) -> Tuple[bool, str]:
|
||||||
|
"""
|
||||||
|
Validate DataFrame has required structure
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Tuple[bool, str]: (is_valid, error_message)
|
||||||
|
If is_valid is False, error_message contains actionable guidance
|
||||||
|
"""
|
||||||
|
errors = []
|
||||||
|
|
||||||
|
if REVENUE_COLUMN not in df.columns:
|
||||||
|
errors.append(
|
||||||
|
f"Missing required column '{REVENUE_COLUMN}'. "
|
||||||
|
f"Update REVENUE_COLUMN in config.py to match your data."
|
||||||
|
)
|
||||||
|
|
||||||
|
if DATE_COLUMN not in df.columns:
|
||||||
|
errors.append(
|
||||||
|
f"Missing required column '{DATE_COLUMN}'. "
|
||||||
|
f"Update DATE_COLUMN in config.py or add fallback columns."
|
||||||
|
)
|
||||||
|
|
||||||
|
if len(df) == 0:
|
||||||
|
errors.append(
|
||||||
|
f"DataFrame is empty. Check date filters (MIN_YEAR, MAX_DATE) in config.py."
|
||||||
|
)
|
||||||
|
|
||||||
|
if errors:
|
||||||
|
error_msg = "Data validation failed:\n" + "\n".join(f" - {e}" for e in errors)
|
||||||
|
error_msg += "\n\nRun: python config_validator.py for detailed validation"
|
||||||
|
return False, error_msg
|
||||||
|
|
||||||
|
return True, "OK"
|
||||||
|
```
|
||||||
|
|
||||||
|
## Warning Messages
|
||||||
|
|
||||||
|
### When to Use Warnings
|
||||||
|
|
||||||
|
Use warnings (not errors) for:
|
||||||
|
- Non-critical data quality issues
|
||||||
|
- Optional features that aren't configured
|
||||||
|
- Deprecated functionality
|
||||||
|
- Performance considerations
|
||||||
|
|
||||||
|
### Warning Format
|
||||||
|
|
||||||
|
```python
|
||||||
|
import warnings
|
||||||
|
|
||||||
|
# Good: Informative warning
|
||||||
|
if date_coverage < 0.9: # Less than 90% but not critical
|
||||||
|
warnings.warn(
|
||||||
|
f"Date coverage is {date_coverage:.1%} ({missing_count:,} rows missing dates).\n"
|
||||||
|
f"Consider adding fallback date columns to improve coverage.\n"
|
||||||
|
f"See .cursor/rules/data_loading.md for details.",
|
||||||
|
UserWarning
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Logging Errors
|
||||||
|
|
||||||
|
### Use Structured Logging
|
||||||
|
|
||||||
|
```python
|
||||||
|
from logger_config import get_logger
|
||||||
|
|
||||||
|
logger = get_logger('analysis_name')
|
||||||
|
|
||||||
|
try:
|
||||||
|
df = load_sales_data(get_data_path())
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(
|
||||||
|
f"Failed to load data: {e}",
|
||||||
|
exc_info=True, # Include stack trace
|
||||||
|
extra={
|
||||||
|
'file_path': str(get_data_path()),
|
||||||
|
'config_file': 'config.py',
|
||||||
|
'suggestion': 'Run config_validator.py to check configuration'
|
||||||
|
}
|
||||||
|
)
|
||||||
|
raise
|
||||||
|
```
|
||||||
|
|
||||||
|
## AI-Friendly Error Messages
|
||||||
|
|
||||||
|
### Help AI Understand and Fix
|
||||||
|
|
||||||
|
Error messages should help AI assistants:
|
||||||
|
1. Understand what went wrong
|
||||||
|
2. Know where to look for fixes
|
||||||
|
3. Suggest specific solutions
|
||||||
|
4. Reference relevant documentation
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Good: AI can parse and act on this
|
||||||
|
if column not in df.columns:
|
||||||
|
raise ValueError(
|
||||||
|
f"Column '{column}' not found.\n"
|
||||||
|
f"Available: {list(df.columns)}\n"
|
||||||
|
f"Fix: Update {column}_COLUMN in config.py\n"
|
||||||
|
f"See: .cursor/rules/data_loading.md"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Bad: AI has no context
|
||||||
|
if column not in df.columns:
|
||||||
|
raise ValueError("Not found")
|
||||||
|
```
|
||||||
|
|
||||||
|
## Error Recovery
|
||||||
|
|
||||||
|
### Provide Recovery Options
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Good: Offers recovery path
|
||||||
|
def load_sales_data(filepath=None):
|
||||||
|
try:
|
||||||
|
df = pd.read_csv(filepath)
|
||||||
|
except FileNotFoundError:
|
||||||
|
# Suggest alternatives
|
||||||
|
suggestions = [
|
||||||
|
f"1. Check file path: {filepath}",
|
||||||
|
f"2. Update DATA_FILE in config.py",
|
||||||
|
f"3. Run: python setup_wizard.py",
|
||||||
|
f"4. Generate sample data: python generate_sample_data.py"
|
||||||
|
]
|
||||||
|
raise FileNotFoundError(
|
||||||
|
f"Data file not found: {filepath}\n"
|
||||||
|
f"\n"
|
||||||
|
f"Options:\n" + "\n".join(suggestions)
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Summary
|
||||||
|
|
||||||
|
Good error handling:
|
||||||
|
- ✅ Specific and actionable
|
||||||
|
- ✅ Provides context
|
||||||
|
- ✅ Suggests solutions
|
||||||
|
- ✅ References documentation
|
||||||
|
- ✅ Helps both users and AI assistants
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
**Last Updated:** January 2026
|
||||||
|
**For:** Error handling in sales_analysis_template
|
||||||
89
.cursor/rules/ltm_methodology.md
Normal file
89
.cursor/rules/ltm_methodology.md
Normal file
@@ -0,0 +1,89 @@
|
|||||||
|
# LTM (Last Twelve Months) Methodology Rules
|
||||||
|
|
||||||
|
## ⭐ RECOMMENDED: Use analysis_utils.py
|
||||||
|
|
||||||
|
**Prefer utility functions:**
|
||||||
|
```python
|
||||||
|
from analysis_utils import get_ltm_period_config, get_annual_data, calculate_annual_metrics
|
||||||
|
from config import get_ltm_period, get_ltm_label
|
||||||
|
|
||||||
|
ltm_start, ltm_end = get_ltm_period_config()
|
||||||
|
year_data, year_label = get_annual_data(df, 2025, ltm_start, ltm_end)
|
||||||
|
```
|
||||||
|
|
||||||
|
## What is LTM?
|
||||||
|
|
||||||
|
**LTM (Last Twelve Months)** = Rolling 12-month period for the most recent partial year
|
||||||
|
- **Purpose:** Apples-to-apples comparison with full calendar years
|
||||||
|
- **Example:** If latest data is through September 2025, use Oct 2024 - Sep 2025 (12 months)
|
||||||
|
|
||||||
|
## When to Use LTM
|
||||||
|
|
||||||
|
- **Full calendar years (2021-2024):** Use complete year data
|
||||||
|
- **Most recent partial year (2025):** Use LTM if you only have partial year data
|
||||||
|
- **Complete years only:** Disable LTM in config if all years are complete
|
||||||
|
|
||||||
|
## Configuration
|
||||||
|
|
||||||
|
**Configure in config.py:**
|
||||||
|
```python
|
||||||
|
LTM_ENABLED = True # Set to False if all years are complete
|
||||||
|
LTM_START_MONTH = 10 # Month number (1-12)
|
||||||
|
LTM_START_YEAR = 2024
|
||||||
|
LTM_END_MONTH = 9
|
||||||
|
LTM_END_YEAR = 2025
|
||||||
|
```
|
||||||
|
|
||||||
|
## Implementation Pattern
|
||||||
|
|
||||||
|
```python
|
||||||
|
from analysis_utils import get_ltm_period_config, get_annual_data
|
||||||
|
|
||||||
|
ltm_start, ltm_end = get_ltm_period_config()
|
||||||
|
|
||||||
|
for year in sorted(df['Year'].unique()):
|
||||||
|
year_data, year_label = get_annual_data(df, year, ltm_start, ltm_end)
|
||||||
|
# year_label will be "2025 (LTM 9/2025)" for LTM year, or "2025" for regular year
|
||||||
|
```
|
||||||
|
|
||||||
|
## Labeling Requirements
|
||||||
|
|
||||||
|
**ALWAYS label LTM year with notation in:**
|
||||||
|
- Chart titles
|
||||||
|
- Chart x-axis labels
|
||||||
|
- Table headers
|
||||||
|
- Print statements
|
||||||
|
- Report text
|
||||||
|
|
||||||
|
**Example:**
|
||||||
|
```python
|
||||||
|
from config import get_ltm_label
|
||||||
|
|
||||||
|
ltm_label = get_ltm_label() # Returns "2025 (LTM 9/2025)" or None
|
||||||
|
if ltm_label:
|
||||||
|
title = f'Annual Revenue Trend\n({ltm_label})'
|
||||||
|
```
|
||||||
|
|
||||||
|
## Common Mistakes
|
||||||
|
|
||||||
|
❌ **WRONG:**
|
||||||
|
```python
|
||||||
|
year_2025_data = df[df['Year'] == 2025] # Uses partial year (not comparable)
|
||||||
|
```
|
||||||
|
|
||||||
|
✅ **CORRECT:**
|
||||||
|
```python
|
||||||
|
from analysis_utils import get_annual_data, get_ltm_period_config
|
||||||
|
ltm_start, ltm_end = get_ltm_period_config()
|
||||||
|
year_2025_data, year_label = get_annual_data(df, 2025, ltm_start, ltm_end)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Disabling LTM
|
||||||
|
|
||||||
|
If all years in your analysis are complete calendar years:
|
||||||
|
```python
|
||||||
|
# In config.py:
|
||||||
|
LTM_ENABLED = False
|
||||||
|
```
|
||||||
|
|
||||||
|
Then all years will be treated as full calendar years.
|
||||||
203
EXAMPLES.md
Normal file
203
EXAMPLES.md
Normal file
@@ -0,0 +1,203 @@
|
|||||||
|
# Example Analysis Scripts
|
||||||
|
|
||||||
|
The `examples/` directory contains working example analysis scripts that demonstrate how to use the sales analysis template framework.
|
||||||
|
|
||||||
|
## Available Examples
|
||||||
|
|
||||||
|
### 1. Annual Revenue Trend (`examples/annual_revenue_trend.py`)
|
||||||
|
|
||||||
|
**Purpose:** Simple annual revenue analysis with LTM support
|
||||||
|
|
||||||
|
**What it demonstrates:**
|
||||||
|
- Loading data using `data_loader`
|
||||||
|
- Calculating annual metrics with LTM
|
||||||
|
- Creating a revenue trend chart
|
||||||
|
- Following template best practices
|
||||||
|
|
||||||
|
**Usage:**
|
||||||
|
```bash
|
||||||
|
python examples/annual_revenue_trend.py
|
||||||
|
```
|
||||||
|
|
||||||
|
**Output:**
|
||||||
|
- Chart: `charts/annual_revenue_trend.png`
|
||||||
|
- Console output with annual revenue summary
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 2. Customer Segmentation (`examples/customer_segmentation.py`)
|
||||||
|
|
||||||
|
**Purpose:** Customer segmentation using RFM (Recency, Frequency, Monetary) methodology
|
||||||
|
|
||||||
|
**What it demonstrates:**
|
||||||
|
- Customer-level aggregation
|
||||||
|
- RFM scoring and segmentation
|
||||||
|
- Segment analysis and visualization
|
||||||
|
- Multiple chart generation
|
||||||
|
|
||||||
|
**Usage:**
|
||||||
|
```bash
|
||||||
|
python examples/customer_segmentation.py
|
||||||
|
```
|
||||||
|
|
||||||
|
**Output:**
|
||||||
|
- Chart: `charts/customer_segmentation.png`
|
||||||
|
- Console output with segment summary
|
||||||
|
|
||||||
|
**Segments:**
|
||||||
|
- **Champions:** High recency, frequency, and monetary value
|
||||||
|
- **Loyal Customers:** Regular customers with good value
|
||||||
|
- **At Risk:** Recent but declining frequency
|
||||||
|
- **Hibernating:** Low recency, may need reactivation
|
||||||
|
- **Potential Loyalists:** Good recency and frequency, lower value
|
||||||
|
- **Need Attention:** Mixed signals, need engagement
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 3. Product Performance (`examples/product_performance.py`)
|
||||||
|
|
||||||
|
**Purpose:** Product mix and performance analysis
|
||||||
|
|
||||||
|
**What it demonstrates:**
|
||||||
|
- Product-level aggregation
|
||||||
|
- Product performance metrics
|
||||||
|
- Top products identification
|
||||||
|
- Product mix visualization
|
||||||
|
|
||||||
|
**Usage:**
|
||||||
|
```bash
|
||||||
|
python examples/product_performance.py
|
||||||
|
```
|
||||||
|
|
||||||
|
**Output:**
|
||||||
|
- Chart: `charts/product_performance.png`
|
||||||
|
- Console output with top products summary
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## How to Use Examples
|
||||||
|
|
||||||
|
### Step 1: Configure Template
|
||||||
|
|
||||||
|
Before running examples, ensure your template is configured:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python setup_wizard.py
|
||||||
|
```
|
||||||
|
|
||||||
|
Or manually update `config.py` with your data file and column mappings.
|
||||||
|
|
||||||
|
### Step 2: Prepare Data
|
||||||
|
|
||||||
|
Place your sales data CSV file in the template directory, or update `DATA_DIR` in `config.py`.
|
||||||
|
|
||||||
|
Alternatively, generate sample data for testing:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python generate_sample_data.py
|
||||||
|
```
|
||||||
|
|
||||||
|
### Step 3: Run Example
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python examples/annual_revenue_trend.py
|
||||||
|
```
|
||||||
|
|
||||||
|
### Step 4: Customize
|
||||||
|
|
||||||
|
Copy an example script and modify it for your needs:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cp examples/annual_revenue_trend.py my_analysis.py
|
||||||
|
# Edit my_analysis.py
|
||||||
|
python my_analysis.py
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Example Patterns
|
||||||
|
|
||||||
|
### Pattern 1: Simple Annual Analysis
|
||||||
|
|
||||||
|
```python
|
||||||
|
from data_loader import load_sales_data
|
||||||
|
from analysis_utils import calculate_annual_metrics, get_ltm_period_config
|
||||||
|
from config import get_data_path, REVENUE_COLUMN
|
||||||
|
|
||||||
|
df = load_sales_data(get_data_path())
|
||||||
|
ltm_start, ltm_end = get_ltm_period_config()
|
||||||
|
|
||||||
|
def calculate_metrics(year_data):
|
||||||
|
return {'Revenue': year_data[REVENUE_COLUMN].sum()}
|
||||||
|
|
||||||
|
annual_df = calculate_annual_metrics(df, calculate_metrics, ltm_start, ltm_end)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Pattern 2: Customer-Level Analysis
|
||||||
|
|
||||||
|
```python
|
||||||
|
from config import CUSTOMER_COLUMN, REVENUE_COLUMN, DATE_COLUMN
|
||||||
|
|
||||||
|
customer_metrics = df.groupby(CUSTOMER_COLUMN).agg({
|
||||||
|
REVENUE_COLUMN: 'sum',
|
||||||
|
DATE_COLUMN: 'count'
|
||||||
|
}).reset_index()
|
||||||
|
```
|
||||||
|
|
||||||
|
### Pattern 3: Product-Level Analysis
|
||||||
|
|
||||||
|
```python
|
||||||
|
from config import ITEM_COLUMN, REVENUE_COLUMN
|
||||||
|
|
||||||
|
product_metrics = df.groupby(ITEM_COLUMN)[REVENUE_COLUMN].sum().sort_values(ascending=False)
|
||||||
|
top_10 = product_metrics.head(10)
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Learning Path
|
||||||
|
|
||||||
|
1. **Start with:** `annual_revenue_trend.py` - Simplest example
|
||||||
|
2. **Then try:** `product_performance.py` - More complex aggregation
|
||||||
|
3. **Advanced:** `customer_segmentation.py` - Multi-step analysis with custom logic
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
**"Module not found" errors:**
|
||||||
|
- Ensure you're running from the template root directory
|
||||||
|
- Check that all template files are present
|
||||||
|
|
||||||
|
**"Data file not found" errors:**
|
||||||
|
- Run `setup_wizard.py` to configure data file path
|
||||||
|
- Or update `DATA_FILE` in `config.py`
|
||||||
|
|
||||||
|
**"Column not found" errors:**
|
||||||
|
- Update column mappings in `config.py`
|
||||||
|
- Run `python config_validator.py` to check configuration
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Advanced Examples
|
||||||
|
|
||||||
|
For more sophisticated analyses, see:
|
||||||
|
- `.cursor/rules/advanced_analysis_patterns.md` - Advanced analysis patterns
|
||||||
|
- `.cursor/rules/ai_assistant_guide.md` - How to use Cursor AI effectively
|
||||||
|
|
||||||
|
## Next Steps
|
||||||
|
|
||||||
|
After running examples:
|
||||||
|
|
||||||
|
1. Review the generated charts
|
||||||
|
2. Examine the code to understand patterns
|
||||||
|
3. Copy an example and customize for your analysis
|
||||||
|
4. Check `.cursor/rules/analysis_patterns.md` for more patterns
|
||||||
|
5. Read `.cursor/rules/advanced_analysis_patterns.md` for advanced techniques
|
||||||
|
6. Use Cursor AI with prompts from `ai_assistant_guide.md`
|
||||||
|
7. Read `README.md` for comprehensive documentation
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
**Last Updated:** January 2026
|
||||||
|
**Template Version:** 1.0
|
||||||
175
QUICK_START.md
Normal file
175
QUICK_START.md
Normal file
@@ -0,0 +1,175 @@
|
|||||||
|
# Quick Start Guide
|
||||||
|
|
||||||
|
**For Cursor Users:** This template is optimized for Cursor AI. Just ask: *"Create a revenue analysis using the template"* and the AI will handle everything.
|
||||||
|
|
||||||
|
## 🚀 Get Started in 5 Minutes
|
||||||
|
|
||||||
|
### Step 1: Install Dependencies
|
||||||
|
```bash
|
||||||
|
pip install -r requirements.txt
|
||||||
|
```
|
||||||
|
|
||||||
|
### Step 2: Run Setup Wizard
|
||||||
|
```bash
|
||||||
|
python setup_wizard.py
|
||||||
|
```
|
||||||
|
|
||||||
|
The wizard will ask you:
|
||||||
|
- Company name
|
||||||
|
- Data file location
|
||||||
|
- Column names in your CSV
|
||||||
|
- Date range
|
||||||
|
- LTM configuration (if needed)
|
||||||
|
|
||||||
|
### Step 3: Test Data Loading
|
||||||
|
```bash
|
||||||
|
python -c "from data_loader import load_sales_data; from config import get_data_path; df = load_sales_data(get_data_path()); print(f'✓ Loaded {len(df):,} rows')"
|
||||||
|
```
|
||||||
|
|
||||||
|
### Step 4: Run Example Analysis (Recommended)
|
||||||
|
```bash
|
||||||
|
# Try an example first to see how it works
|
||||||
|
python examples/annual_revenue_trend.py
|
||||||
|
```
|
||||||
|
|
||||||
|
### Step 5: Create Your First Analysis
|
||||||
|
```bash
|
||||||
|
cp analysis_template.py my_analysis.py
|
||||||
|
# Or copy an example
|
||||||
|
cp examples/annual_revenue_trend.py my_analysis.py
|
||||||
|
# Edit my_analysis.py
|
||||||
|
python my_analysis.py
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 📋 Essential Configuration Checklist
|
||||||
|
|
||||||
|
Before running analyses, verify in `config.py`:
|
||||||
|
|
||||||
|
- [ ] `COMPANY_NAME` - Your company name
|
||||||
|
- [ ] `DATA_FILE` - Your CSV filename
|
||||||
|
- [ ] `REVENUE_COLUMN` - Your revenue column name
|
||||||
|
- [ ] `DATE_COLUMN` - Your date column name
|
||||||
|
- [ ] `CUSTOMER_COLUMN` - Your customer column name
|
||||||
|
- [ ] `ANALYSIS_YEARS` - Years to include
|
||||||
|
- [ ] `MIN_YEAR` and `MAX_DATE` - Date range
|
||||||
|
- [ ] `LTM_ENABLED` - Set to False if all years complete
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 💡 Common Patterns
|
||||||
|
|
||||||
|
### Load Data
|
||||||
|
```python
|
||||||
|
from data_loader import load_sales_data
|
||||||
|
from config import get_data_path
|
||||||
|
|
||||||
|
df = load_sales_data(get_data_path())
|
||||||
|
```
|
||||||
|
|
||||||
|
### Calculate Annual Metrics
|
||||||
|
```python
|
||||||
|
from analysis_utils import calculate_annual_metrics, get_ltm_period_config
|
||||||
|
from config import REVENUE_COLUMN
|
||||||
|
|
||||||
|
ltm_start, ltm_end = get_ltm_period_config()
|
||||||
|
|
||||||
|
def calculate_metrics(year_data):
|
||||||
|
return {'Revenue': year_data[REVENUE_COLUMN].sum()}
|
||||||
|
|
||||||
|
annual_df = calculate_annual_metrics(df, calculate_metrics, ltm_start, ltm_end)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Create Chart
|
||||||
|
```python
|
||||||
|
from analysis_utils import setup_revenue_chart, save_chart
|
||||||
|
from config import CHART_SIZES
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
|
||||||
|
fig, ax = plt.subplots(figsize=CHART_SIZES['medium'])
|
||||||
|
ax.plot(data / 1e6, ...) # Divide by 1e6!
|
||||||
|
setup_revenue_chart(ax)
|
||||||
|
save_chart(fig, 'chart.png')
|
||||||
|
plt.close()
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## ⚠️ Critical Rules
|
||||||
|
|
||||||
|
1. **ALWAYS use `data_loader.py`** - Never `pd.read_csv()` directly
|
||||||
|
2. **ALWAYS divide by 1e6** before plotting revenue
|
||||||
|
3. **ALWAYS use `setup_revenue_chart()`** for revenue charts
|
||||||
|
4. **ALWAYS use config values** - Never hardcode column names
|
||||||
|
5. **ALWAYS validate data** after loading
|
||||||
|
|
||||||
|
## 💡 New Utilities
|
||||||
|
|
||||||
|
### Data Quality Check
|
||||||
|
```bash
|
||||||
|
python -c "from data_quality import generate_data_quality_report, print_data_quality_report; from data_loader import load_sales_data; from config import get_data_path; df = load_sales_data(get_data_path()); report = generate_data_quality_report(df); print_data_quality_report(report)"
|
||||||
|
```
|
||||||
|
|
||||||
|
### Configuration Validation
|
||||||
|
```bash
|
||||||
|
python config_validator.py
|
||||||
|
```
|
||||||
|
|
||||||
|
### Export Results
|
||||||
|
```python
|
||||||
|
from export_utils import export_to_excel
|
||||||
|
export_to_excel(df, 'results.xlsx')
|
||||||
|
```
|
||||||
|
|
||||||
|
### Generate Sample Data
|
||||||
|
```bash
|
||||||
|
python generate_sample_data.py
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 🐛 Quick Troubleshooting
|
||||||
|
|
||||||
|
**"Data file not found"**
|
||||||
|
→ Check `DATA_FILE` in config.py
|
||||||
|
|
||||||
|
**"Column not found"**
|
||||||
|
→ Update column mappings in config.py
|
||||||
|
|
||||||
|
**Charts show 1e8 (scientific notation)**
|
||||||
|
→ Divide by 1e6 before plotting: `ax.plot(data / 1e6, ...)`
|
||||||
|
|
||||||
|
**"DataFrame is empty"**
|
||||||
|
→ Check `MIN_YEAR`, `MAX_DATE`, and `ANALYSIS_YEARS` in config.py
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 🎯 Using Cursor AI (Recommended)
|
||||||
|
|
||||||
|
This template is optimized for Cursor. Instead of manual setup, just ask:
|
||||||
|
|
||||||
|
```
|
||||||
|
"Create a revenue trend analysis using template patterns"
|
||||||
|
```
|
||||||
|
|
||||||
|
The AI will:
|
||||||
|
- ✅ Use all template utilities automatically
|
||||||
|
- ✅ Follow best practices
|
||||||
|
- ✅ Include proper validation
|
||||||
|
- ✅ Generate production-ready code
|
||||||
|
|
||||||
|
**See:** `.cursor/rules/ai_assistant_guide.md` for complete prompt library
|
||||||
|
|
||||||
|
## 📚 Next Steps
|
||||||
|
|
||||||
|
- **Run examples:** Try `examples/annual_revenue_trend.py` to see it in action
|
||||||
|
- **Check data quality:** Run `python data_quality.py` to analyze your data
|
||||||
|
- **Validate config:** Run `python config_validator.py` to check configuration
|
||||||
|
- **Read documentation:** See `README.md` for comprehensive guide
|
||||||
|
- **Review patterns:** Check `.cursor/rules/` for detailed patterns
|
||||||
|
- **See examples:** Check `EXAMPLES.md` for example script guide
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
**Need help?** Check `.cursor/rules/common_errors.md` for detailed troubleshooting.
|
||||||
589
README.md
Normal file
589
README.md
Normal file
@@ -0,0 +1,589 @@
|
|||||||
|
# Sales Analysis Template
|
||||||
|
|
||||||
|
**A best-in-class, reusable template for sales invoice detail analysis**
|
||||||
|
|
||||||
|
**Optimized for Cursor AI** - Just ask the AI to create analyses and it handles everything automatically.
|
||||||
|
|
||||||
|
This template provides a complete framework for analyzing sales data from any company. It's designed to be:
|
||||||
|
- **Flexible:** Works with different column names, date formats, and data structures
|
||||||
|
- **Automated:** Interactive setup wizard configures everything for your company
|
||||||
|
- **AI-Optimized:** Fully optimized for Cursor - AI knows all patterns and generates code automatically
|
||||||
|
- **Production-Ready:** Includes error handling, validation, and best practices
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 🚀 Quick Start
|
||||||
|
|
||||||
|
### 1. Setup (Automated)
|
||||||
|
|
||||||
|
Run the interactive setup wizard:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python setup_wizard.py
|
||||||
|
```
|
||||||
|
|
||||||
|
The wizard will ask you about:
|
||||||
|
- Company name and analysis date
|
||||||
|
- Data file location
|
||||||
|
- Column names in your CSV
|
||||||
|
- Date range and LTM configuration
|
||||||
|
- Exclusion filters (if needed)
|
||||||
|
|
||||||
|
### 2. Manual Setup (Alternative)
|
||||||
|
|
||||||
|
If you prefer to configure manually:
|
||||||
|
|
||||||
|
1. **Update `config.py`** with your company-specific settings:
|
||||||
|
- `COMPANY_NAME`: Your company name
|
||||||
|
- `DATA_FILE`: Your CSV filename
|
||||||
|
- `REVENUE_COLUMN`: Your revenue/amount column name
|
||||||
|
- `DATE_COLUMN`: Your primary date column
|
||||||
|
- Column mappings for Customer, Item, etc.
|
||||||
|
- Date range and LTM settings
|
||||||
|
|
||||||
|
2. **Place your data file** in the template directory (or update `DATA_DIR` in config.py)
|
||||||
|
|
||||||
|
### 3. Test Data Loading
|
||||||
|
|
||||||
|
Verify your configuration works:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python -c "from data_loader import load_sales_data; from config import get_data_path; df = load_sales_data(get_data_path()); print(f'Loaded {len(df):,} rows')"
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4. Create Your First Analysis
|
||||||
|
|
||||||
|
Copy the template and customize:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cp analysis_template.py my_first_analysis.py
|
||||||
|
# Edit my_first_analysis.py with your analysis logic
|
||||||
|
python my_first_analysis.py
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 📁 Project Structure
|
||||||
|
|
||||||
|
```
|
||||||
|
sales_analysis_template/
|
||||||
|
├── README.md # This file
|
||||||
|
├── QUICK_START.md # Quick start guide
|
||||||
|
├── TEMPLATE_OVERVIEW.md # High-level overview
|
||||||
|
├── TEMPLATE_SUMMARY.md # Comprehensive template summary
|
||||||
|
├── EXAMPLES.md # Example scripts guide
|
||||||
|
├── SETUP_CHECKLIST.md # Setup verification checklist
|
||||||
|
├── requirements.txt # Python dependencies
|
||||||
|
├── setup_wizard.py # Interactive setup wizard
|
||||||
|
│
|
||||||
|
├── config.py # ⭐ Configuration (customize for your company)
|
||||||
|
├── config_validator.py # Configuration validation utility
|
||||||
|
│
|
||||||
|
├── data_loader.py # ⭐ Data loading with fallback logic
|
||||||
|
├── data_quality.py # Data quality reporting
|
||||||
|
├── data_processing.py # Data transformation utilities
|
||||||
|
│
|
||||||
|
├── analysis_utils.py # ⭐ Common utilities (formatters, LTM, helpers)
|
||||||
|
├── statistical_utils.py # Statistical analysis utilities
|
||||||
|
├── validate_revenue.py # Revenue validation utility
|
||||||
|
│
|
||||||
|
├── export_utils.py # Export to CSV/Excel
|
||||||
|
├── report_generator.py # PDF report generation
|
||||||
|
├── logger_config.py # Logging configuration
|
||||||
|
│
|
||||||
|
├── analysis_template.py # Template for creating new analyses
|
||||||
|
├── run_all_analyses.py # Batch runner for all scripts
|
||||||
|
├── generate_sample_data.py # Generate sample data for testing
|
||||||
|
│
|
||||||
|
├── examples/ # Example analysis scripts
|
||||||
|
│ ├── annual_revenue_trend.py # Simple annual revenue analysis
|
||||||
|
│ ├── customer_segmentation.py # RFM customer segmentation
|
||||||
|
│ ├── cohort_analysis.py # Customer cohort analysis
|
||||||
|
│ └── product_performance.py # Product performance analysis
|
||||||
|
│
|
||||||
|
├── tests/ # Unit tests
|
||||||
|
│ ├── test_data_loader.py # Data loader tests
|
||||||
|
│ ├── test_analysis_utils.py # Analysis utils tests
|
||||||
|
│ └── test_config_validator.py # Config validator tests
|
||||||
|
│
|
||||||
|
└── .cursor/
|
||||||
|
└── rules/ # Cursor IDE rules (auto-loaded)
|
||||||
|
├── ai_assistant_guide.md # Complete AI assistant guide
|
||||||
|
├── advanced_analysis_patterns.md # Advanced techniques
|
||||||
|
├── analysis_patterns.md # Common analysis patterns
|
||||||
|
├── chart_formatting.md # Chart formatting rules
|
||||||
|
├── code_quality.md # Code quality standards
|
||||||
|
├── common_errors.md # Error troubleshooting
|
||||||
|
├── data_loading.md # Data loading patterns
|
||||||
|
├── error_handling.md # Error handling patterns
|
||||||
|
└── ltm_methodology.md # LTM methodology
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 🔧 Configuration Guide
|
||||||
|
|
||||||
|
### Required Configuration
|
||||||
|
|
||||||
|
**In `config.py`, you MUST configure:**
|
||||||
|
|
||||||
|
1. **Company Information:**
|
||||||
|
```python
|
||||||
|
COMPANY_NAME = "Your Company Name"
|
||||||
|
```
|
||||||
|
|
||||||
|
2. **Data File:**
|
||||||
|
```python
|
||||||
|
DATA_FILE = 'your_sales_data.csv'
|
||||||
|
```
|
||||||
|
|
||||||
|
3. **Column Mappings:**
|
||||||
|
```python
|
||||||
|
REVENUE_COLUMN = 'USD' # Your revenue column name
|
||||||
|
DATE_COLUMN = 'InvoiceDate' # Your date column name
|
||||||
|
CUSTOMER_COLUMN = 'Customer' # Your customer column name
|
||||||
|
```
|
||||||
|
|
||||||
|
4. **Date Range:**
|
||||||
|
```python
|
||||||
|
MIN_YEAR = 2021
|
||||||
|
MAX_DATE = pd.Timestamp('2025-09-30')
|
||||||
|
ANALYSIS_YEARS = [2021, 2022, 2023, 2024, 2025]
|
||||||
|
```
|
||||||
|
|
||||||
|
### Optional Configuration
|
||||||
|
|
||||||
|
**LTM (Last Twelve Months):**
|
||||||
|
```python
|
||||||
|
LTM_ENABLED = True # Set to False if all years are complete
|
||||||
|
LTM_START_MONTH = 10
|
||||||
|
LTM_START_YEAR = 2024
|
||||||
|
LTM_END_MONTH = 9
|
||||||
|
LTM_END_YEAR = 2025
|
||||||
|
```
|
||||||
|
|
||||||
|
**Exclusion Filters:**
|
||||||
|
```python
|
||||||
|
EXCLUSION_FILTERS = {
|
||||||
|
'enabled': True,
|
||||||
|
'exclude_by_column': 'Country',
|
||||||
|
'exclude_values': ['Test', 'KVT']
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**See `config.py` for all available options and detailed comments.**
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 📊 Data Requirements
|
||||||
|
|
||||||
|
### Required Columns
|
||||||
|
|
||||||
|
Your CSV file must have:
|
||||||
|
- **Revenue column:** A numeric column with sales amounts (configured as `REVENUE_COLUMN`)
|
||||||
|
- **Date column:** At least one date column (configured as `DATE_COLUMN`)
|
||||||
|
|
||||||
|
### Recommended Columns
|
||||||
|
|
||||||
|
For full analysis capabilities, include:
|
||||||
|
- **Customer/Account:** For customer segmentation and analysis
|
||||||
|
- **Item/Product:** For product analysis
|
||||||
|
- **Quantity:** For price calculations
|
||||||
|
- **Geographic:** Region, Country for geographic analysis
|
||||||
|
- **Segments:** Technology, EndMarket, ProductGroup for segmentation
|
||||||
|
|
||||||
|
### Date Column Fallback
|
||||||
|
|
||||||
|
The data loader supports fallback logic:
|
||||||
|
1. **Primary:** Uses `DATE_COLUMN` (e.g., InvoiceDate)
|
||||||
|
2. **Fallback 1:** Uses columns in `DATE_FALLBACK_COLUMNS` (e.g., Month, Year)
|
||||||
|
3. **Fallback 2:** Constructs from Year column if available
|
||||||
|
|
||||||
|
This ensures maximum date coverage even if some rows have missing dates.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 💻 Creating Analysis Scripts
|
||||||
|
|
||||||
|
### Using the Template
|
||||||
|
|
||||||
|
1. **Copy the template:**
|
||||||
|
```bash
|
||||||
|
cp analysis_template.py my_analysis.py
|
||||||
|
```
|
||||||
|
|
||||||
|
2. **Update configuration:**
|
||||||
|
```python
|
||||||
|
ANALYSIS_NAME = "My Analysis"
|
||||||
|
DESCRIPTION = "Description of what this analysis does"
|
||||||
|
```
|
||||||
|
|
||||||
|
3. **Implement your logic:**
|
||||||
|
- Use `calculate_annual_metrics()` for annual aggregations
|
||||||
|
- Use `setup_revenue_chart()` and `save_chart()` for visualizations
|
||||||
|
- Follow patterns from `.cursor/rules/analysis_patterns.md`
|
||||||
|
|
||||||
|
4. **Run your analysis:**
|
||||||
|
```bash
|
||||||
|
python my_analysis.py
|
||||||
|
```
|
||||||
|
|
||||||
|
### Standard Pattern
|
||||||
|
|
||||||
|
```python
|
||||||
|
from data_loader import load_sales_data, validate_data_structure
|
||||||
|
from analysis_utils import (
|
||||||
|
get_ltm_period_config, calculate_annual_metrics,
|
||||||
|
setup_revenue_chart, save_chart, apply_exclusion_filters
|
||||||
|
)
|
||||||
|
from config import get_data_path, REVENUE_COLUMN, CHART_SIZES
|
||||||
|
|
||||||
|
# Load and validate
|
||||||
|
df = load_sales_data(get_data_path())
|
||||||
|
is_valid, msg = validate_data_structure(df)
|
||||||
|
if not is_valid:
|
||||||
|
print(f"ERROR: {msg}")
|
||||||
|
    raise SystemExit(1)  # bare 'return' is invalid at module level; exit instead
|
||||||
|
|
||||||
|
# Apply filters
|
||||||
|
df = apply_exclusion_filters(df)
|
||||||
|
|
||||||
|
# Calculate metrics
|
||||||
|
ltm_start, ltm_end = get_ltm_period_config()
|
||||||
|
annual_df = calculate_annual_metrics(df, calculate_metrics, ltm_start, ltm_end)
|
||||||
|
|
||||||
|
# Create charts
|
||||||
|
fig, ax = plt.subplots(figsize=CHART_SIZES['medium'])
|
||||||
|
ax.plot(data / 1e6, ...)
|
||||||
|
setup_revenue_chart(ax)
|
||||||
|
save_chart(fig, 'chart.png')
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 🎯 Key Features
|
||||||
|
|
||||||
|
### 1. Flexible Data Loading
|
||||||
|
|
||||||
|
- Handles different column names via configuration
|
||||||
|
- Fallback logic for date parsing (100% coverage)
|
||||||
|
- Automatic validation and error reporting
|
||||||
|
|
||||||
|
### 2. LTM (Last Twelve Months) Support
|
||||||
|
|
||||||
|
- Automatic LTM calculation for partial years
|
||||||
|
- Apples-to-apples comparison with full calendar years
|
||||||
|
- Configurable LTM periods
|
||||||
|
|
||||||
|
### 3. Standardized Chart Formatting
|
||||||
|
|
||||||
|
- Automatic millions formatter for revenue charts
|
||||||
|
- Consistent styling and sizing
|
||||||
|
- Professional output ready for reports
|
||||||
|
- Optional interactive charts with Plotly
|
||||||
|
|
||||||
|
### 4. Exclusion Filters
|
||||||
|
|
||||||
|
- Easy configuration for excluding segments
|
||||||
|
- Useful for excluding test accounts, business units, etc.
|
||||||
|
|
||||||
|
### 5. Revenue Validation
|
||||||
|
|
||||||
|
- Automatic validation after each analysis
|
||||||
|
- Ensures data loading is working correctly
|
||||||
|
- Optional validation against expected values
|
||||||
|
|
||||||
|
### 6. Example Scripts
|
||||||
|
|
||||||
|
- Working examples for common analyses
|
||||||
|
- Demonstrates best practices
|
||||||
|
- Easy to customize and extend
|
||||||
|
|
||||||
|
### 7. Data Export
|
||||||
|
|
||||||
|
- Export results to CSV and Excel
|
||||||
|
- Formatted summary tables
|
||||||
|
- Multiple sheet support
|
||||||
|
|
||||||
|
### 8. Data Quality Reporting
|
||||||
|
|
||||||
|
- Comprehensive data quality checks
|
||||||
|
- Missing value analysis
|
||||||
|
- Outlier detection
|
||||||
|
- Data profiling
|
||||||
|
|
||||||
|
### 9. Configuration Validation
|
||||||
|
|
||||||
|
- Early error detection
|
||||||
|
- Validates column mappings
|
||||||
|
- Checks date ranges and LTM configuration
|
||||||
|
|
||||||
|
### 10. Statistical Utilities
|
||||||
|
|
||||||
|
- Year-over-year growth calculations
|
||||||
|
- CAGR (Compound Annual Growth Rate)
|
||||||
|
- Correlation analysis
|
||||||
|
- Statistical significance testing
|
||||||
|
|
||||||
|
### 11. Report Generation
|
||||||
|
|
||||||
|
- Combine multiple charts into PDF reports
|
||||||
|
- Professional formatting
|
||||||
|
- Summary tables and metadata
|
||||||
|
|
||||||
|
### 12. Logging Infrastructure
|
||||||
|
|
||||||
|
- Structured logging with file and console output
|
||||||
|
- Analysis execution tracking
|
||||||
|
- Configurable log levels
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 📚 Documentation
|
||||||
|
|
||||||
|
### For AI Agents (Cursor IDE)
|
||||||
|
|
||||||
|
The `.cursor/rules/` directory contains comprehensive rules that are automatically loaded by Cursor:
|
||||||
|
|
||||||
|
- **`ai_assistant_guide.md`:** Complete guide with ready-to-use prompts
|
||||||
|
- **`advanced_analysis_patterns.md`:** Advanced techniques (cohort, PVM, forecasting, etc.)
|
||||||
|
- **`analysis_patterns.md`:** Standard patterns for creating analyses
|
||||||
|
- **`data_loading.md`:** Always use `data_loader.py`, never `pd.read_csv()` directly
|
||||||
|
- **`chart_formatting.md`:** How to format charts correctly
|
||||||
|
- **`ltm_methodology.md`:** LTM implementation and usage
|
||||||
|
- **`common_errors.md`:** Troubleshooting guide
|
||||||
|
- **`code_quality.md`:** Code quality standards and Cursor best practices
|
||||||
|
- **`error_handling.md`:** How to write AI-friendly error messages
|
||||||
|
|
||||||
|
### For Developers
|
||||||
|
|
||||||
|
- **`config.py`:** Heavily commented with all configuration options
|
||||||
|
- **`analysis_template.py`:** Template with examples and comments
|
||||||
|
- **`analysis_utils.py`:** Well-documented utility functions
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 🔍 Common Analysis Types
|
||||||
|
|
||||||
|
This template supports all standard sales analyses:
|
||||||
|
|
||||||
|
### Revenue Analyses
|
||||||
|
- Annual revenue trends
|
||||||
|
- Monthly revenue analysis
|
||||||
|
- Revenue by segment/product/geography
|
||||||
|
|
||||||
|
### Customer Analyses
|
||||||
|
- Customer segmentation (RFM)
|
||||||
|
- Customer concentration
|
||||||
|
- Churn analysis
|
||||||
|
- Cohort analysis
|
||||||
|
- Customer lifetime value (CLV)
|
||||||
|
|
||||||
|
### Product Analyses
|
||||||
|
- Product performance
|
||||||
|
- Product lifecycle
|
||||||
|
- BCG matrix
|
||||||
|
- Market basket analysis
|
||||||
|
|
||||||
|
### Financial Analyses
|
||||||
|
- Price elasticity
|
||||||
|
- Contribution margin
|
||||||
|
- Price vs volume analysis
|
||||||
|
|
||||||
|
### Advanced Analyses
|
||||||
|
- Seasonality analysis
|
||||||
|
- Time series forecasting
|
||||||
|
- Customer churn prediction
|
||||||
|
|
||||||
|
**See `examples/` directory for working example scripts, or the original Dukane project for 24+ production analysis scripts.**
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 🛠️ Dependencies
|
||||||
|
|
||||||
|
Install required packages:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install -r requirements.txt
|
||||||
|
```
|
||||||
|
|
||||||
|
**Core dependencies:**
|
||||||
|
- `pandas` - Data manipulation
|
||||||
|
- `numpy` - Numerical operations
|
||||||
|
- `matplotlib` - Charting
|
||||||
|
- `seaborn` - Enhanced visualizations
|
||||||
|
|
||||||
|
**Optional dependencies** (uncomment in requirements.txt if needed):
|
||||||
|
- `openpyxl` - Excel export (export_utils.py)
|
||||||
|
- `plotly` - Interactive charts (analysis_utils.py)
|
||||||
|
- `reportlab` - PDF reports (report_generator.py)
|
||||||
|
- `scipy` - Statistical analysis (statistical_utils.py)
|
||||||
|
- `pytest` - Unit testing
|
||||||
|
- `pmdarima` - Time series forecasting
|
||||||
|
- `mlxtend` - Market basket analysis
|
||||||
|
- `scikit-learn` - Machine learning
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## ⚠️ Important Notes
|
||||||
|
|
||||||
|
### Always Use Utilities
|
||||||
|
|
||||||
|
**✅ DO:**
|
||||||
|
```python
|
||||||
|
from data_loader import load_sales_data
|
||||||
|
from analysis_utils import setup_revenue_chart, save_chart
|
||||||
|
from config import REVENUE_COLUMN, CHART_SIZES
|
||||||
|
```
|
||||||
|
|
||||||
|
**❌ DON'T:**
|
||||||
|
```python
|
||||||
|
df = pd.read_csv('data.csv') # Use data_loader instead
|
||||||
|
ax.plot(revenue, ...) # Divide by 1e6 first, use setup_revenue_chart()
|
||||||
|
```
|
||||||
|
|
||||||
|
### Chart Formatting
|
||||||
|
|
||||||
|
**ALWAYS divide revenue by 1e6 before plotting:**
|
||||||
|
```python
|
||||||
|
ax.plot(revenue / 1e6, ...) # Convert to millions
|
||||||
|
setup_revenue_chart(ax) # Apply formatter
|
||||||
|
```
|
||||||
|
|
||||||
|
### LTM Labeling
|
||||||
|
|
||||||
|
**ALWAYS label LTM years correctly:**
|
||||||
|
```python
|
||||||
|
from config import get_ltm_label
|
||||||
|
ltm_label = get_ltm_label() # Returns "2025 (LTM 9/2025)" or None
|
||||||
|
if ltm_label:
|
||||||
|
title += f'\n({ltm_label})'
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 🐛 Troubleshooting
|
||||||
|
|
||||||
|
### Data Loading Issues
|
||||||
|
|
||||||
|
**Problem:** "Data file not found"
|
||||||
|
- **Solution:** Check `DATA_FILE` path in config.py
|
||||||
|
- **Solution:** Ensure file is in template directory or update `DATA_DIR`
|
||||||
|
|
||||||
|
**Problem:** "Required column 'USD' not found"
|
||||||
|
- **Solution:** Update `REVENUE_COLUMN` in config.py to match your CSV
|
||||||
|
- **Solution:** Check all column mappings in config.py
|
||||||
|
|
||||||
|
**Problem:** "All dates are NaN"
|
||||||
|
- **Solution:** Add fallback date columns to `DATE_FALLBACK_COLUMNS`
|
||||||
|
- **Solution:** Check date format in your CSV
|
||||||
|
|
||||||
|
### Analysis Issues
|
||||||
|
|
||||||
|
**Problem:** Charts show scientific notation (1e8)
|
||||||
|
- **Solution:** Divide by 1e6 before plotting: `ax.plot(data / 1e6, ...)`
|
||||||
|
- **Solution:** Use `setup_revenue_chart(ax)` to apply formatter
|
||||||
|
|
||||||
|
**Problem:** "DataFrame is empty" after filtering
|
||||||
|
- **Solution:** Check `MIN_YEAR` and `MAX_DATE` in config.py
|
||||||
|
- **Solution:** Verify `ANALYSIS_YEARS` includes years in your data
|
||||||
|
|
||||||
|
**See `.cursor/rules/common_errors.md` for more troubleshooting help.**
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 📝 Example Workflow
|
||||||
|
|
||||||
|
### Complete Analysis Workflow
|
||||||
|
|
||||||
|
1. **Setup:**
|
||||||
|
```bash
|
||||||
|
python setup_wizard.py
|
||||||
|
```
|
||||||
|
|
||||||
|
2. **Test data loading:**
|
||||||
|
```bash
|
||||||
|
python -c "from data_loader import load_sales_data; from config import get_data_path; df = load_sales_data(get_data_path()); print(f'✓ Loaded {len(df):,} rows')"
|
||||||
|
```
|
||||||
|
|
||||||
|
3. **Create analysis:**
|
||||||
|
```bash
|
||||||
|
cp analysis_template.py revenue_analysis.py
|
||||||
|
# Edit revenue_analysis.py
|
||||||
|
```
|
||||||
|
|
||||||
|
4. **Run analysis:**
|
||||||
|
```bash
|
||||||
|
python revenue_analysis.py
|
||||||
|
```
|
||||||
|
|
||||||
|
5. **Add to batch runner:**
|
||||||
|
```python
|
||||||
|
# In run_all_analyses.py:
|
||||||
|
ANALYSIS_SCRIPTS = [
|
||||||
|
'revenue_analysis.py',
|
||||||
|
# ... other analyses
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
|
6. **Run all analyses:**
|
||||||
|
```bash
|
||||||
|
python run_all_analyses.py
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 🤝 Best Practices
|
||||||
|
|
||||||
|
1. **Always validate data** after loading:
|
||||||
|
```python
|
||||||
|
is_valid, msg = validate_data_structure(df)
|
||||||
|
```
|
||||||
|
|
||||||
|
2. **Use configuration values** instead of hardcoding:
|
||||||
|
```python
|
||||||
|
from config import REVENUE_COLUMN # ✅
|
||||||
|
revenue = df['USD'].sum() # ❌ Hardcoded
|
||||||
|
```
|
||||||
|
|
||||||
|
3. **Apply exclusion filters** if configured:
|
||||||
|
```python
|
||||||
|
df = apply_exclusion_filters(df)
|
||||||
|
```
|
||||||
|
|
||||||
|
4. **Validate revenue** at end of each analysis:
|
||||||
|
```python
|
||||||
|
validate_revenue(df, "Analysis Name")
|
||||||
|
```
|
||||||
|
|
||||||
|
5. **Use utility functions** for consistency:
|
||||||
|
```python
|
||||||
|
from analysis_utils import calculate_annual_metrics, setup_revenue_chart
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 📄 License
|
||||||
|
|
||||||
|
This template is provided as-is for use in sales analysis projects.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 🙏 Acknowledgments
|
||||||
|
|
||||||
|
This template is based on best practices developed during the Dukane Corporation sales analysis project, which included 24+ production-ready analysis scripts and comprehensive documentation.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 📞 Support
|
||||||
|
|
||||||
|
For questions or issues:
|
||||||
|
1. Check `.cursor/rules/` for detailed patterns and troubleshooting
|
||||||
|
2. Review `config.py` comments for configuration options
|
||||||
|
3. See example analyses in the original Dukane project
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
**Last Updated:** January 2026
|
||||||
|
**Template Version:** 1.0
|
||||||
|
**Status:** Production Ready
|
||||||
118
SETUP_CHECKLIST.md
Normal file
118
SETUP_CHECKLIST.md
Normal file
@@ -0,0 +1,118 @@
|
|||||||
|
# Setup Checklist
|
||||||
|
|
||||||
|
Use this checklist to ensure your template is properly configured before running analyses.
|
||||||
|
|
||||||
|
## ✅ Initial Setup
|
||||||
|
|
||||||
|
- [ ] **Install dependencies**
|
||||||
|
```bash
|
||||||
|
pip install -r requirements.txt
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Run setup wizard**
|
||||||
|
```bash
|
||||||
|
python setup_wizard.py
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Place data file** in template directory (or update `DATA_DIR` in config.py)
|
||||||
|
|
||||||
|
## ✅ Configuration Verification
|
||||||
|
|
||||||
|
Open `config.py` and verify:
|
||||||
|
|
||||||
|
- [ ] **Company Information**
|
||||||
|
- [ ] `COMPANY_NAME` is set
|
||||||
|
- [ ] `ANALYSIS_DATE` is current
|
||||||
|
|
||||||
|
- [ ] **Data File**
|
||||||
|
- [ ] `DATA_FILE` matches your CSV filename
|
||||||
|
- [ ] File exists in expected location
|
||||||
|
|
||||||
|
- [ ] **Column Mappings**
|
||||||
|
- [ ] `REVENUE_COLUMN` matches your CSV
|
||||||
|
- [ ] `DATE_COLUMN` matches your CSV
|
||||||
|
- [ ] `CUSTOMER_COLUMN` matches your CSV (if applicable)
|
||||||
|
- [ ] `ITEM_COLUMN` matches your CSV (if applicable)
|
||||||
|
- [ ] `QUANTITY_COLUMN` matches your CSV (if applicable)
|
||||||
|
|
||||||
|
- [ ] **Date Configuration**
|
||||||
|
- [ ] `MIN_YEAR` is correct
|
||||||
|
- [ ] `MAX_DATE` is correct
|
||||||
|
- [ ] `ANALYSIS_YEARS` includes all years you want to analyze
|
||||||
|
|
||||||
|
- [ ] **LTM Configuration** (if needed)
|
||||||
|
- [ ] `LTM_ENABLED` is set correctly
|
||||||
|
- [ ] `LTM_START_MONTH`, `LTM_START_YEAR` are correct
|
||||||
|
- [ ] `LTM_END_MONTH`, `LTM_END_YEAR` are correct
|
||||||
|
|
||||||
|
- [ ] **Exclusion Filters** (if needed)
|
||||||
|
- [ ] `EXCLUSION_FILTERS['enabled']` is set correctly
|
||||||
|
- [ ] `exclude_by_column` matches a column in your data
|
||||||
|
- [ ] `exclude_values` list is correct
|
||||||
|
|
||||||
|
## ✅ Data Loading Test
|
||||||
|
|
||||||
|
- [ ] **Test data loading**
|
||||||
|
```bash
|
||||||
|
python -c "from data_loader import load_sales_data; from config import get_data_path; df = load_sales_data(get_data_path()); print(f'✓ Loaded {len(df):,} rows')"
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Verify date coverage**
|
||||||
|
- Check output shows good date coverage (>95% recommended)
|
||||||
|
- Verify date range matches expectations
|
||||||
|
|
||||||
|
- [ ] **Verify revenue column**
|
||||||
|
- Check that revenue values are numeric
|
||||||
|
- Verify no unexpected NaN values
|
||||||
|
|
||||||
|
## ✅ First Analysis Test
|
||||||
|
|
||||||
|
- [ ] **Copy template**
|
||||||
|
```bash
|
||||||
|
cp analysis_template.py test_analysis.py
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Run test analysis**
|
||||||
|
```bash
|
||||||
|
python test_analysis.py
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Verify outputs**
|
||||||
|
- [ ] Chart generated successfully
|
||||||
|
- [ ] Chart saved to `charts/` directory
|
||||||
|
- [ ] Revenue validation passed
|
||||||
|
- [ ] No errors in console output
|
||||||
|
|
||||||
|
## ✅ Common Issues Check
|
||||||
|
|
||||||
|
Before running full analyses, verify:
|
||||||
|
|
||||||
|
- [ ] **Column names match** - All column mappings in config.py match your CSV
|
||||||
|
- [ ] **Date format works** - Dates are parsing correctly (check data_loader output)
|
||||||
|
- [ ] **Date range is correct** - MIN_YEAR and MAX_DATE include your data
|
||||||
|
- [ ] **LTM is configured** - If using LTM, dates are within your data range
|
||||||
|
- [ ] **Exclusions work** - If using exclusions, column and values are correct
|
||||||
|
|
||||||
|
## ✅ Ready for Production
|
||||||
|
|
||||||
|
Once all checks pass:
|
||||||
|
|
||||||
|
- [ ] **Create your analyses** using `analysis_template.py`
|
||||||
|
- [ ] **Add to batch runner** in `run_all_analyses.py`
|
||||||
|
- [ ] **Run all analyses** to generate complete analysis suite
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 🐛 Troubleshooting
|
||||||
|
|
||||||
|
If any check fails:
|
||||||
|
|
||||||
|
1. **Data loading issues:** See `.cursor/rules/data_loading.md`
|
||||||
|
2. **Configuration issues:** Review `config.py` comments
|
||||||
|
3. **Common errors:** See `.cursor/rules/common_errors.md`
|
||||||
|
4. **Pattern questions:** See `.cursor/rules/analysis_patterns.md`
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
**Checklist Version:** 1.0
|
||||||
|
**Last Updated:** January 2026
|
||||||
150
TEMPLATE_OVERVIEW.md
Normal file
150
TEMPLATE_OVERVIEW.md
Normal file
@@ -0,0 +1,150 @@
|
|||||||
|
# Sales Analysis Template - Overview
|
||||||
|
|
||||||
|
**Start here for a high-level understanding of the template.**
|
||||||
|
|
||||||
|
For detailed setup, see `QUICK_START.md`. For complete documentation, see `README.md`.
|
||||||
|
|
||||||
|
## 🎯 Purpose
|
||||||
|
|
||||||
|
This template provides a **production-ready, reusable framework** for analyzing sales invoice detail data from any company. It's designed to be:
|
||||||
|
|
||||||
|
- **Flexible:** Works with different column names, date formats, and data structures
|
||||||
|
- **Automated:** Interactive setup wizard configures everything
|
||||||
|
- **AI-Optimized:** Fully optimized for Cursor AI - just ask and the AI generates complete analyses
|
||||||
|
- **Best-in-Class:** Based on proven patterns from 24+ production analyses
|
||||||
|
|
||||||
|
## 📦 What's Included
|
||||||
|
|
||||||
|
### Core Framework
|
||||||
|
- **`config.py`** - Centralized configuration (customize for your company)
|
||||||
|
- **`data_loader.py`** - Intelligent data loading with fallback logic
|
||||||
|
- **`analysis_utils.py`** - Common utilities (formatters, LTM, helpers)
|
||||||
|
- **`validate_revenue.py`** - Revenue validation utility
|
||||||
|
|
||||||
|
### Templates & Tools
|
||||||
|
- **`analysis_template.py`** - Template for creating new analyses
|
||||||
|
- **`run_all_analyses.py`** - Batch runner for all scripts
|
||||||
|
- **`setup_wizard.py`** - Interactive setup wizard
|
||||||
|
|
||||||
|
### Documentation
|
||||||
|
- **`README.md`** - Comprehensive documentation
|
||||||
|
- **`QUICK_START.md`** - Quick reference guide
|
||||||
|
- **`.cursor/rules/`** - Cursor IDE rules for automation
|
||||||
|
|
||||||
|
### Configuration
|
||||||
|
- **`requirements.txt`** - Python dependencies
|
||||||
|
- **`.gitignore`** - Git ignore patterns
|
||||||
|
|
||||||
|
## 🚀 Quick Start
|
||||||
|
|
||||||
|
1. **Run setup wizard:**
|
||||||
|
```bash
|
||||||
|
python setup_wizard.py
|
||||||
|
```
|
||||||
|
|
||||||
|
2. **Test data loading:**
|
||||||
|
```bash
|
||||||
|
python -c "from data_loader import load_sales_data; from config import get_data_path; df = load_sales_data(get_data_path()); print(f'✓ Loaded {len(df):,} rows')"
|
||||||
|
```
|
||||||
|
|
||||||
|
3. **Create your first analysis:**
|
||||||
|
```bash
|
||||||
|
cp analysis_template.py my_analysis.py
|
||||||
|
# Edit my_analysis.py
|
||||||
|
python my_analysis.py
|
||||||
|
```
|
||||||
|
|
||||||
|
## 🎨 Key Features
|
||||||
|
|
||||||
|
### 1. Flexible Data Loading
|
||||||
|
- Handles different column names via configuration
|
||||||
|
- Fallback logic for date parsing (100% coverage)
|
||||||
|
- Automatic validation
|
||||||
|
|
||||||
|
### 2. LTM Support
|
||||||
|
- Automatic Last Twelve Months calculation
|
||||||
|
- Apples-to-apples comparison with full years
|
||||||
|
- Configurable periods
|
||||||
|
|
||||||
|
### 3. Standardized Formatting
|
||||||
|
- Automatic millions formatter for revenue
|
||||||
|
- Consistent chart styling
|
||||||
|
- Professional output
|
||||||
|
|
||||||
|
### 4. Exclusion Filters
|
||||||
|
- Easy configuration for excluding segments
|
||||||
|
- Useful for test accounts, business units, etc.
|
||||||
|
|
||||||
|
### 5. AI Automation
|
||||||
|
- Comprehensive Cursor rules
|
||||||
|
- Automated agent assistance
|
||||||
|
- Best practices enforcement
|
||||||
|
|
||||||
|
## 📊 Analysis Types Supported
|
||||||
|
|
||||||
|
This template supports all standard sales analyses:
|
||||||
|
|
||||||
|
- **Revenue:** Annual trends, monthly analysis, by segment
|
||||||
|
- **Customer:** Segmentation, concentration, churn, CLV
|
||||||
|
- **Product:** Performance, lifecycle, BCG matrix
|
||||||
|
- **Financial:** Price elasticity, margins
|
||||||
|
- **Advanced:** Seasonality, forecasting, predictions
|
||||||
|
|
||||||
|
## 🔧 Customization Points
|
||||||
|
|
||||||
|
All customization happens in `config.py`:
|
||||||
|
|
||||||
|
1. **Company Info:** Name, analysis date
|
||||||
|
2. **Data File:** Location, filename
|
||||||
|
3. **Column Mappings:** Revenue, date, customer, product, etc.
|
||||||
|
4. **Date Range:** Years, LTM configuration
|
||||||
|
5. **Filters:** Exclusion rules
|
||||||
|
6. **Chart Settings:** Sizes, styles, DPI
|
||||||
|
|
||||||
|
## 📚 Documentation Structure
|
||||||
|
|
||||||
|
- **`README.md`** - Complete guide (start here)
|
||||||
|
- **`QUICK_START.md`** - Quick start (includes Cursor tips)
|
||||||
|
- **`EXAMPLES.md`** - Example scripts guide
|
||||||
|
- **`TEMPLATE_SUMMARY.md`** - Comprehensive template overview
|
||||||
|
- **`.cursor/rules/`** - Detailed patterns for AI agents (auto-loaded by Cursor)
|
||||||
|
- **`config.py`** - Heavily commented configuration
|
||||||
|
|
||||||
|
## 🎓 Learning Path
|
||||||
|
|
||||||
|
1. **Read:** `QUICK_START.md` (5 minutes)
|
||||||
|
2. **Run:** `setup_wizard.py` (2 minutes)
|
||||||
|
3. **Test:** Data loading (1 minute)
|
||||||
|
4. **Create:** First analysis using `analysis_template.py` (15 minutes)
|
||||||
|
5. **Explore:** `.cursor/rules/` for patterns (as needed)
|
||||||
|
|
||||||
|
## 💡 Best Practices
|
||||||
|
|
||||||
|
1. **Always use utilities** - Don't reinvent the wheel
|
||||||
|
2. **Use config values** - Never hardcode column names
|
||||||
|
3. **Validate data** - After loading and after analysis
|
||||||
|
4. **Follow patterns** - See `.cursor/rules/analysis_patterns.md`
|
||||||
|
5. **Test incrementally** - Test data loading before full analysis
|
||||||
|
|
||||||
|
## 🔍 What Makes This "Best-in-Class"
|
||||||
|
|
||||||
|
1. **Proven Patterns:** Based on 24+ production analyses
|
||||||
|
2. **Flexibility:** Works with any data structure
|
||||||
|
3. **Automation:** Setup wizard + AI-friendly rules
|
||||||
|
4. **Documentation:** Comprehensive guides and examples
|
||||||
|
5. **Error Handling:** Validation and troubleshooting built-in
|
||||||
|
6. **Consistency:** Standardized formatting and patterns
|
||||||
|
|
||||||
|
## 📈 Next Steps
|
||||||
|
|
||||||
|
1. Run `setup_wizard.py` to configure for your company
|
||||||
|
2. Review `config.py` to understand all options
|
||||||
|
3. Create your first analysis using `analysis_template.py`
|
||||||
|
4. Explore `.cursor/rules/` for detailed patterns
|
||||||
|
5. Build your analysis suite
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
**Template Version:** 1.0
|
||||||
|
**Last Updated:** January 2026
|
||||||
|
**Status:** Production Ready
|
||||||
254
TEMPLATE_SUMMARY.md
Normal file
254
TEMPLATE_SUMMARY.md
Normal file
@@ -0,0 +1,254 @@
|
|||||||
|
# Sales Analysis Template - Summary
|
||||||
|
|
||||||
|
**This document provides a comprehensive overview of the template structure and capabilities.**
|
||||||
|
|
||||||
|
For quick start, see `QUICK_START.md`. For detailed documentation, see `README.md`.
|
||||||
|
|
||||||
|
## 📋 What This Template Provides
|
||||||
|
|
||||||
|
This template was created based on the comprehensive Dukane Corporation sales analysis project, which included 24+ production-ready analysis scripts. All best practices, patterns, and lessons learned have been distilled into this reusable template.
|
||||||
|
|
||||||
|
## 📁 Complete File Structure
|
||||||
|
|
||||||
|
```
|
||||||
|
sales_analysis_template/
|
||||||
|
├── README.md # Comprehensive documentation
|
||||||
|
├── QUICK_START.md # Quick reference guide
|
||||||
|
├── TEMPLATE_OVERVIEW.md # Template overview and features
|
||||||
|
├── TEMPLATE_SUMMARY.md # This file
|
||||||
|
├── EXAMPLES.md # Example scripts guide
|
||||||
|
├── SETUP_CHECKLIST.md # Setup verification checklist
|
||||||
|
├── requirements.txt # Python dependencies
|
||||||
|
├── .gitignore # Git ignore patterns
|
||||||
|
│
|
||||||
|
├── Core Framework Files:
|
||||||
|
│ ├── config.py # ⭐ Centralized configuration
|
||||||
|
│ ├── config_validator.py # Configuration validation utility
|
||||||
|
│ ├── data_loader.py # ⭐ Intelligent data loading
|
||||||
|
│ ├── data_quality.py # Data quality reporting
|
||||||
|
│ ├── data_processing.py # Data transformation utilities
|
||||||
|
│ ├── analysis_utils.py # ⭐ Common utilities
|
||||||
|
│ ├── statistical_utils.py # Statistical analysis utilities
|
||||||
|
│ └── validate_revenue.py # Revenue validation
|
||||||
|
│
|
||||||
|
├── Utility Files:
|
||||||
|
│ ├── export_utils.py # Export to CSV/Excel
|
||||||
|
│ ├── report_generator.py # PDF report generation
|
||||||
|
│ ├── logger_config.py # Logging configuration
|
||||||
|
│ └── generate_sample_data.py # Generate sample data for testing
|
||||||
|
│
|
||||||
|
├── Templates & Tools:
|
||||||
|
│ ├── analysis_template.py # Template for new analyses
|
||||||
|
│ ├── run_all_analyses.py # Batch runner
|
||||||
|
│ └── setup_wizard.py # Interactive setup wizard
|
||||||
|
│
|
||||||
|
├── examples/ # Example analysis scripts
|
||||||
|
│ ├── annual_revenue_trend.py # Simple annual revenue analysis
|
||||||
|
│ ├── customer_segmentation.py # RFM customer segmentation
|
||||||
|
│ ├── cohort_analysis.py # Customer cohort analysis
|
||||||
|
│ └── product_performance.py # Product performance analysis
|
||||||
|
│
|
||||||
|
├── tests/ # Unit tests
|
||||||
|
│ ├── test_data_loader.py # Data loader tests
|
||||||
|
│ ├── test_analysis_utils.py # Analysis utils tests
|
||||||
|
│ └── test_config_validator.py # Config validator tests
|
||||||
|
│
|
||||||
|
└── .cursor/
|
||||||
|
└── rules/ # Cursor IDE rules (auto-loaded)
|
||||||
|
├── ai_assistant_guide.md # Complete AI assistant guide
|
||||||
|
├── advanced_analysis_patterns.md # Advanced techniques
|
||||||
|
├── analysis_patterns.md # Analysis patterns
|
||||||
|
├── chart_formatting.md # Chart formatting rules
|
||||||
|
├── code_quality.md # Code quality standards
|
||||||
|
├── common_errors.md # Error troubleshooting
|
||||||
|
├── data_loading.md # Data loading patterns
|
||||||
|
├── error_handling.md # Error handling patterns
|
||||||
|
└── ltm_methodology.md # LTM methodology
|
||||||
|
```
|
||||||
|
|
||||||
|
## 🎯 Key Features Implemented
|
||||||
|
|
||||||
|
### 1. Flexible Configuration System
|
||||||
|
- **`config.py`**: Centralized configuration with extensive comments
|
||||||
|
- All column names, date ranges, and settings configurable
|
||||||
|
- No hardcoded values - everything comes from config
|
||||||
|
|
||||||
|
### 2. Intelligent Data Loading
|
||||||
|
- **`data_loader.py`**: Fallback logic for date parsing
|
||||||
|
- Handles missing dates gracefully
|
||||||
|
- 100% date coverage via fallback columns
|
||||||
|
- Automatic validation and error reporting
|
||||||
|
|
||||||
|
### 3. Comprehensive Utilities
|
||||||
|
- **`analysis_utils.py`**: All common functions in one place
|
||||||
|
- Chart formatters (millions, thousands)
|
||||||
|
- LTM calculation helpers
|
||||||
|
- Mixed type handling for years
|
||||||
|
- Price calculation utilities
|
||||||
|
- Exclusion filter helpers
|
||||||
|
|
||||||
|
### 4. Interactive Setup
|
||||||
|
- **`setup_wizard.py`**: Asks clarifying questions
|
||||||
|
- Automatically configures `config.py`
|
||||||
|
- Validates inputs
|
||||||
|
- Provides next steps
|
||||||
|
|
||||||
|
### 5. AI-Friendly Rules
|
||||||
|
- **`.cursor/rules/`**: Comprehensive Cursor IDE rules
|
||||||
|
- Auto-loaded by Cursor
|
||||||
|
- Enforces best practices
|
||||||
|
- Provides patterns and troubleshooting
|
||||||
|
|
||||||
|
### 6. Production-Ready Templates
|
||||||
|
- **`analysis_template.py`**: Complete template with examples
|
||||||
|
- **`run_all_analyses.py`**: Batch runner with error handling
|
||||||
|
- Follows all best practices
|
||||||
|
|
||||||
|
## 🔑 Design Principles
|
||||||
|
|
||||||
|
### Flexibility
|
||||||
|
- Works with any column names (configured in config.py)
|
||||||
|
- Handles different date formats
|
||||||
|
- Supports various data structures
|
||||||
|
- Optional features (LTM, exclusions) can be disabled
|
||||||
|
|
||||||
|
### Automation
|
||||||
|
- Setup wizard asks all necessary questions
|
||||||
|
- Cursor rules guide AI agents automatically
|
||||||
|
- Batch runner handles multiple analyses
|
||||||
|
- Validation catches errors early
|
||||||
|
|
||||||
|
### Best Practices
|
||||||
|
- Always use utilities (never reinvent the wheel)
|
||||||
|
- Consistent formatting across all analyses
|
||||||
|
- Proper error handling and validation
|
||||||
|
- Comprehensive documentation
|
||||||
|
|
||||||
|
### Reusability
|
||||||
|
- Generic enough for any company
|
||||||
|
- Specific enough to be immediately useful
|
||||||
|
- Well-documented for future agents
|
||||||
|
- Easy to extend with new analyses
|
||||||
|
|
||||||
|
## 📊 Analysis Types Supported
|
||||||
|
|
||||||
|
The template supports all standard sales analyses:
|
||||||
|
|
||||||
|
### Revenue Analyses
|
||||||
|
- Annual revenue trends
|
||||||
|
- Monthly revenue analysis
|
||||||
|
- Revenue by segment/product/geography
|
||||||
|
|
||||||
|
### Customer Analyses
|
||||||
|
- Customer segmentation (RFM)
|
||||||
|
- Customer concentration
|
||||||
|
- Churn analysis
|
||||||
|
- Cohort analysis
|
||||||
|
- Customer lifetime value (CLV)
|
||||||
|
|
||||||
|
### Product Analyses
|
||||||
|
- Product performance
|
||||||
|
- Product lifecycle
|
||||||
|
- BCG matrix
|
||||||
|
- Market basket analysis
|
||||||
|
|
||||||
|
### Financial Analyses
|
||||||
|
- Price elasticity
|
||||||
|
- Contribution margin
|
||||||
|
- Price vs volume analysis
|
||||||
|
|
||||||
|
### Advanced Analyses
|
||||||
|
- Seasonality analysis
|
||||||
|
- Time series forecasting
|
||||||
|
- Customer churn prediction
|
||||||
|
|
||||||
|
## 🚀 Usage Workflow
|
||||||
|
|
||||||
|
1. **Setup** (5 minutes)
|
||||||
|
- Run `setup_wizard.py`
|
||||||
|
- Answer questions about your data
|
||||||
|
- Configuration automatically updated
|
||||||
|
|
||||||
|
2. **Test** (2 minutes)
|
||||||
|
- Test data loading
|
||||||
|
- Verify configuration works
|
||||||
|
|
||||||
|
3. **Create** (15 minutes)
|
||||||
|
- Copy `analysis_template.py`
|
||||||
|
- Customize for your analysis
|
||||||
|
- Run and verify
|
||||||
|
|
||||||
|
4. **Scale** (ongoing)
|
||||||
|
- Create multiple analyses
|
||||||
|
- Add to batch runner
|
||||||
|
- Generate complete analysis suite
|
||||||
|
|
||||||
|
## 💡 What Makes This "Best-in-Class"
|
||||||
|
|
||||||
|
1. **Proven Patterns**: Based on 24+ production analyses
|
||||||
|
2. **Comprehensive**: Covers all common analysis types
|
||||||
|
3. **Flexible**: Works with any data structure
|
||||||
|
4. **Automated**: Setup wizard + AI-friendly rules
|
||||||
|
5. **Documented**: Extensive documentation at every level
|
||||||
|
6. **Production-Ready**: Error handling, validation, best practices
|
||||||
|
|
||||||
|
## 📚 Documentation Hierarchy
|
||||||
|
|
||||||
|
1. **`QUICK_START.md`** - Start here (5-minute overview, includes Cursor tips)
|
||||||
|
2. **`README.md`** - Complete guide (comprehensive)
|
||||||
|
3. **`EXAMPLES.md`** - Example scripts guide
|
||||||
|
4. **`TEMPLATE_OVERVIEW.md`** - High-level overview
|
||||||
|
5. **`SETUP_CHECKLIST.md`** - Verification checklist
|
||||||
|
6. **`.cursor/rules/`** - Detailed patterns for AI agents (auto-loaded by Cursor)
|
||||||
|
7. **`config.py`** - Inline comments for all options
|
||||||
|
|
||||||
|
## 🎓 Learning Resources
|
||||||
|
|
||||||
|
- **Quick Start**: `QUICK_START.md` - Get running in 5 minutes
|
||||||
|
- **Full Guide**: `README.md` - Complete documentation
|
||||||
|
- **Patterns**: `.cursor/rules/analysis_patterns.md` - Code patterns
|
||||||
|
- **Troubleshooting**: `.cursor/rules/common_errors.md` - Fix issues
|
||||||
|
- **Examples**: `analysis_template.py` - Working example
|
||||||
|
|
||||||
|
## ✅ Quality Assurance
|
||||||
|
|
||||||
|
All components include:
|
||||||
|
- ✅ Error handling
|
||||||
|
- ✅ Input validation
|
||||||
|
- ✅ Comprehensive comments
|
||||||
|
- ✅ Type hints where helpful
|
||||||
|
- ✅ Documentation strings
|
||||||
|
- ✅ Best practices enforcement
|
||||||
|
|
||||||
|
## 🔄 Future Enhancements
|
||||||
|
|
||||||
|
Potential additions (not included in v1.0):
- CI/CD configuration
- Docker containerization
- Additional visualization libraries

Note: example analysis scripts (`examples/`) and unit tests (`tests/`) are
already included in the template — see the file structure listing above.
|
||||||
|
|
||||||
|
## 📝 Notes for Users
|
||||||
|
|
||||||
|
1. **First Time**: Start with `QUICK_START.md` and `setup_wizard.py`
|
||||||
|
2. **Configuration**: All customization in `config.py`
|
||||||
|
3. **Creating Analyses**: Use `analysis_template.py` as starting point
|
||||||
|
4. **AI Assistance**: Cursor rules are auto-loaded, just ask for help
|
||||||
|
5. **Troubleshooting**: Check `.cursor/rules/common_errors.md` first
|
||||||
|
|
||||||
|
## 🎉 Success Criteria
|
||||||
|
|
||||||
|
The template is ready when:
|
||||||
|
- ✅ Setup wizard runs successfully
|
||||||
|
- ✅ Data loads without errors
|
||||||
|
- ✅ First analysis generates charts
|
||||||
|
- ✅ All validations pass
|
||||||
|
- ✅ Documentation is clear
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
**Template Version:** 1.0
|
||||||
|
**Created:** January 2026
|
||||||
|
**Based On:** Dukane Corporation Sales Analysis Project
|
||||||
|
**Status:** Production Ready ✅
|
||||||
147
analysis_template.py
Normal file
147
analysis_template.py
Normal file
@@ -0,0 +1,147 @@
|
|||||||
|
"""
|
||||||
|
Template for creating new analysis scripts
|
||||||
|
Copy this file and modify for your specific analysis
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
1. Copy this file: cp analysis_template.py my_new_analysis.py
|
||||||
|
2. Update the ANALYSIS_NAME and DESCRIPTION
|
||||||
|
3. Implement your analysis logic in the main() function
|
||||||
|
4. Update the chart generation section
|
||||||
|
5. Run: python my_new_analysis.py
|
||||||
|
"""
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
import seaborn as sns
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Import utilities
|
||||||
|
from data_loader import load_sales_data, validate_data_structure
|
||||||
|
from validate_revenue import validate_revenue
|
||||||
|
from analysis_utils import (
|
||||||
|
get_ltm_period_config, get_annual_data, calculate_annual_metrics,
|
||||||
|
get_millions_formatter, setup_revenue_chart, save_chart,
|
||||||
|
format_currency, print_annual_summary, sort_mixed_years,
|
||||||
|
apply_exclusion_filters
|
||||||
|
)
|
||||||
|
from config import (
|
||||||
|
DATA_FILE, OUTPUT_DIR, ANALYSIS_YEARS, MAX_DATE,
|
||||||
|
CHART_SIZES, ensure_directories, get_data_path, COMPANY_NAME
|
||||||
|
)
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# CONFIGURATION
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
ANALYSIS_NAME = "Template Analysis"
|
||||||
|
DESCRIPTION = "Template for new analyses - customize this for your specific analysis"
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# MAIN ANALYSIS FUNCTION
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
def main():
|
||||||
|
"""Main analysis function"""
|
||||||
|
|
||||||
|
print(f"\n{'='*60}")
|
||||||
|
print(f"{ANALYSIS_NAME}")
|
||||||
|
print(f"{'='*60}\n")
|
||||||
|
|
||||||
|
# 1. Load data
|
||||||
|
print("Loading data...")
|
||||||
|
try:
|
||||||
|
df = load_sales_data(get_data_path())
|
||||||
|
print(f"Loaded {len(df):,} transactions")
|
||||||
|
except Exception as e:
|
||||||
|
print(f"ERROR loading data: {e}")
|
||||||
|
return
|
||||||
|
|
||||||
|
# 2. Validate data structure
|
||||||
|
is_valid, msg = validate_data_structure(df)
|
||||||
|
if not is_valid:
|
||||||
|
print(f"ERROR: {msg}")
|
||||||
|
return
|
||||||
|
print("Data validation passed")
|
||||||
|
|
||||||
|
# 3. Apply exclusion filters (if configured)
|
||||||
|
df = apply_exclusion_filters(df)
|
||||||
|
|
||||||
|
# 4. Filter by date range
|
||||||
|
from config import MIN_YEAR, DATE_COLUMN
|
||||||
|
df = df[df['Year'] >= MIN_YEAR]
|
||||||
|
if DATE_COLUMN in df.columns:
|
||||||
|
df = df[df[DATE_COLUMN] <= MAX_DATE]
|
||||||
|
|
||||||
|
# 5. Setup LTM period (if enabled)
|
||||||
|
ltm_start, ltm_end = get_ltm_period_config()
|
||||||
|
if ltm_start and ltm_end:
|
||||||
|
print(f"LTM period: {ltm_start} to {ltm_end}")
|
||||||
|
|
||||||
|
# 6. Prepare data
|
||||||
|
print("\nPreparing data...")
|
||||||
|
# Add your data preparation logic here
|
||||||
|
# Example: df['CustomColumn'] = df[REVENUE_COLUMN] * df[QUANTITY_COLUMN]
|
||||||
|
|
||||||
|
# 7. Calculate annual metrics
|
||||||
|
print("\nCalculating annual metrics...")
|
||||||
|
|
||||||
|
def calculate_metrics(year_data):
|
||||||
|
"""Calculate metrics for a single year"""
|
||||||
|
from config import REVENUE_COLUMN
|
||||||
|
return {
|
||||||
|
'Revenue': year_data[REVENUE_COLUMN].sum(),
|
||||||
|
# Add your custom metrics here
|
||||||
|
# 'CustomMetric': year_data['CustomColumn'].mean(),
|
||||||
|
}
|
||||||
|
|
||||||
|
annual_df = calculate_annual_metrics(df, calculate_metrics, ltm_start, ltm_end)
|
||||||
|
|
||||||
|
# 8. Print summary
|
||||||
|
print_annual_summary(annual_df, 'Revenue', 'Revenue')
|
||||||
|
|
||||||
|
# 9. Create visualizations
|
||||||
|
print("Generating charts...")
|
||||||
|
ensure_directories()
|
||||||
|
|
||||||
|
# Example chart: Annual revenue trend
|
||||||
|
fig, ax = plt.subplots(figsize=CHART_SIZES['medium'])
|
||||||
|
|
||||||
|
# Prepare data for plotting (handle mixed types)
|
||||||
|
annual_df_sorted = sort_mixed_years(annual_df.reset_index(), 'Year')
|
||||||
|
years = annual_df_sorted['Year'].tolist()
|
||||||
|
revenue = annual_df_sorted['Revenue'].values / 1e6 # Convert to millions
|
||||||
|
|
||||||
|
# Create chart
|
||||||
|
ax.plot(range(len(years)), revenue, marker='o', linewidth=2, markersize=8)
|
||||||
|
ax.set_xticks(range(len(years)))
|
||||||
|
ax.set_xticklabels(years, rotation=45, ha='right')
|
||||||
|
setup_revenue_chart(ax)
|
||||||
|
|
||||||
|
# Add LTM notation to title if applicable
|
||||||
|
title = f'Annual Revenue Trend - {COMPANY_NAME}'
|
||||||
|
if ltm_start and ltm_end:
|
||||||
|
from config import get_ltm_label
|
||||||
|
ltm_label = get_ltm_label()
|
||||||
|
if ltm_label:
|
||||||
|
title += f'\n({ltm_label})'
|
||||||
|
ax.set_title(title)
|
||||||
|
|
||||||
|
plt.tight_layout()
|
||||||
|
save_chart(fig, f'{ANALYSIS_NAME.lower().replace(" ", "_")}_trend.png')
|
||||||
|
plt.close()
|
||||||
|
|
||||||
|
# Add more charts as needed...
|
||||||
|
|
||||||
|
# 10. Validate revenue
|
||||||
|
print("\nValidating revenue...")
|
||||||
|
validate_revenue(df, ANALYSIS_NAME)
|
||||||
|
|
||||||
|
print(f"\n{ANALYSIS_NAME} complete!")
|
||||||
|
print(f"Charts saved to: {OUTPUT_DIR}")
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# RUN ANALYSIS
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
510
analysis_utils.py
Normal file
510
analysis_utils.py
Normal file
@@ -0,0 +1,510 @@
|
|||||||
|
"""
|
||||||
|
Common utilities for analysis scripts
|
||||||
|
Provides formatters, LTM setup, and helper functions
|
||||||
|
|
||||||
|
This module is designed to work with any sales data structure
|
||||||
|
by using configuration from config.py
|
||||||
|
"""
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
from matplotlib.ticker import FuncFormatter
|
||||||
|
from pathlib import Path
|
||||||
|
from config import (
|
||||||
|
REVENUE_COLUMN, LTM_ENABLED, get_ltm_period, get_ltm_label,
|
||||||
|
OUTPUT_DIR, CHART_DPI, CHART_BBOX
|
||||||
|
)
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# CHART FORMATTERS
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
def millions_formatter(x: float, pos: int) -> str:
    """Format a value already expressed in millions as '$X.Xm'.

    Intended for use with matplotlib's FuncFormatter on axes that show
    revenue in millions (data must be divided by 1e6 before plotting).

    Args:
        x: Value in millions (e.g. 99.9 represents $99.9m).
        pos: Tick position (required by FuncFormatter; unused).

    Returns:
        str: Display string such as "$99.9m".

    Example:
        >>> millions_formatter(99.9, 0)
        '$99.9m'
    """
    return '${:.1f}m'.format(x)
|
||||||
|
|
||||||
|
def thousands_formatter(x: float, pos: int) -> str:
    """Format a value already expressed in thousands as '$X.Xk'.

    Args:
        x: Value in thousands.
        pos: Tick position (required by FuncFormatter; unused).

    Returns:
        str: Display string such as "$99.9k".
    """
    return '${:.1f}k'.format(x)
|
||||||
|
|
||||||
|
def get_millions_formatter() -> FuncFormatter:
    """Build a matplotlib FuncFormatter that renders axis ticks in millions.

    Returns:
        FuncFormatter: Formatter wrapping millions_formatter.
    """
    formatter = FuncFormatter(millions_formatter)
    return formatter
|
||||||
|
|
||||||
|
def get_thousands_formatter() -> FuncFormatter:
    """Build a matplotlib FuncFormatter that renders axis ticks in thousands.

    Returns:
        FuncFormatter: Formatter wrapping thousands_formatter.
    """
    formatter = FuncFormatter(thousands_formatter)
    return formatter
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# LTM (Last Twelve Months) SETUP
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
def get_ltm_period_config():
    """Return LTM (Last Twelve Months) boundaries from configuration.

    Returns:
        tuple: (ltm_start, ltm_end) as pd.Period objects when LTM is
        enabled in config, otherwise (None, None).
    """
    if not LTM_ENABLED:
        return None, None
    return get_ltm_period()
|
||||||
|
|
||||||
|
def get_annual_data(df, year, ltm_start=None, ltm_end=None):
    """
    Get data for a specific year, using LTM for the most recent partial year

    For the configured LTM end year, rows are selected by the LTM month
    window (which may span two calendar years) instead of by calendar year,
    so the latest partial year compares apples-to-apples with full years.

    Args:
        df: DataFrame with 'Year' and 'YearMonth' columns
        year: Year to extract (int)
        ltm_start: LTM start period (defaults to config if None)
        ltm_end: LTM end period (defaults to config if None)

    Returns:
        tuple: (year_data DataFrame, year_label string)
    """
    # LTM_END_YEAR identifies which calendar year receives LTM treatment
    from config import LTM_END_YEAR

    # Get LTM period from config if not provided
    if ltm_start is None or ltm_end is None:
        ltm_start, ltm_end = get_ltm_period_config()

    # Use LTM for the most recent year if enabled
    if LTM_ENABLED and ltm_start and ltm_end and year == LTM_END_YEAR:
        if 'YearMonth' in df.columns:
            # Inclusive month-window filter; can straddle two calendar years
            year_data = df[(df['YearMonth'] >= ltm_start) & (df['YearMonth'] <= ltm_end)]
            # Prefer the descriptive LTM label (e.g. "2025 (LTM 9/2025)")
            year_label = get_ltm_label() or str(year)
        else:
            # Fallback if YearMonth not available
            year_data = df[df['Year'] == year]
            year_label = str(year)
    else:
        # Use full calendar year
        year_data = df[df['Year'] == year]
        year_label = str(year)

    return year_data, year_label
|
||||||
|
|
||||||
|
def calculate_annual_metrics(df, metrics_func, ltm_start=None, ltm_end=None):
    """
    Calculate annual metrics for all years, using LTM for the most recent year.

    Iterates the configured ANALYSIS_YEARS in order, extracts each year's
    rows via get_annual_data() (LTM window for the final partial year),
    and applies the caller-supplied metrics function.

    Args:
        df: DataFrame with 'Year' and 'YearMonth' columns
        metrics_func: Function that takes a DataFrame and returns a dict of metrics
        ltm_start: LTM start period (defaults to config if None)
        ltm_end: LTM end period (defaults to config if None)

    Returns:
        DataFrame with 'Year' index and metric columns (empty DataFrame
        when no configured year has any data)
    """
    from config import ANALYSIS_YEARS

    if ltm_start is None or ltm_end is None:
        ltm_start, ltm_end = get_ltm_period_config()

    # Fix: the original recomputed df['Year'].unique() on every loop
    # iteration; build the membership set once instead.
    available_years = set(df['Year'].unique())

    annual_data = []
    for year in sorted(ANALYSIS_YEARS):
        if year not in available_years:
            continue

        year_data, year_label = get_annual_data(df, year, ltm_start, ltm_end)
        if len(year_data) > 0:
            metrics = metrics_func(year_data)
            metrics['Year'] = year_label
            annual_data.append(metrics)

    if not annual_data:
        return pd.DataFrame()

    return pd.DataFrame(annual_data).set_index('Year')
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# MIXED TYPE HANDLING
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
def create_year_sort_column(df, year_col='Year'):
    """Build a numeric sort-key Series for a year column with mixed types.

    Calendar years (ints/floats) sort by value; the LTM label string
    (any string containing LTM_END_YEAR) sorts just after that year;
    anything unrecognized sorts last.

    Args:
        df: DataFrame containing the year column.
        year_col: Name of the year column.

    Returns:
        Series of numeric sort values aligned with df.
    """
    from config import LTM_END_YEAR

    def _sort_key(value):
        if isinstance(value, str) and str(LTM_END_YEAR) in value:
            # LTM label (e.g. "2025 (LTM 9/2025)") lands right after its year
            return float(LTM_END_YEAR) + 0.5
        if isinstance(value, (int, float)):
            return float(value)
        # Unrecognized labels fall to the end of the ordering
        return 9999

    return df[year_col].apply(_sort_key)
|
||||||
|
|
||||||
|
def sort_mixed_years(df, year_col='Year'):
    """Sort a DataFrame by a year column that mixes int and str values.

    Delegates key construction to create_year_sort_column() so calendar
    years order numerically and the LTM label lands immediately after
    its calendar year. The input DataFrame is not modified.

    Args:
        df: DataFrame to sort.
        year_col: Name of the year column.

    Returns:
        A new, sorted DataFrame.
    """
    working = df.copy()
    working['_Year_Sort'] = create_year_sort_column(working, year_col)
    working = working.sort_values('_Year_Sort').drop(columns=['_Year_Sort'])
    return working
|
||||||
|
|
||||||
|
def safe_year_labels(years):
    """Convert year values (int or str) into string tick labels.

    Args:
        years: Iterable of year values.

    Returns:
        List of string labels, one per input value.
    """
    return list(map(str, years))
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# CHART HELPERS
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
def setup_revenue_chart(ax, ylabel: str = 'Revenue (Millions USD)') -> None:
    """Apply the standard revenue styling (millions formatter, label, grid).

    CRITICAL: use this for every revenue chart so axes render as "$X.Xm".
    Plot data must already be divided by 1e6:

        ax.plot(revenue / 1e6, ...)   # correct
        ax.plot(revenue, ...)         # wrong - axis values will be mislabeled

    Args:
        ax: Matplotlib axis object to configure (modified in place).
        ylabel: Y-axis label (default: 'Revenue (Millions USD)').

    Returns:
        None

    See Also:
        - .cursor/rules/chart_formatting.md for detailed patterns
        - save_chart() for saving charts
    """
    ax.grid(True, alpha=0.3)
    ax.set_ylabel(ylabel)
    ax.yaxis.set_major_formatter(get_millions_formatter())
|
||||||
|
|
||||||
|
def save_chart(fig, filename, output_dir=None):
    """
    Save a matplotlib figure as a PNG under the output directory.

    Creates the output directory (including missing parent directories)
    if needed, then prints the saved path for traceability.

    Args:
        fig: Matplotlib figure object
        filename: Output filename (e.g., 'revenue_trend.png')
        output_dir: Output directory (defaults to config.OUTPUT_DIR)
    """
    if output_dir is None:
        output_dir = OUTPUT_DIR
    else:
        output_dir = Path(output_dir)

    # Fix: parents=True so nested output paths (e.g. charts/2025/) don't
    # raise FileNotFoundError on first use.
    output_dir.mkdir(parents=True, exist_ok=True)

    filepath = output_dir / filename
    fig.savefig(filepath, dpi=CHART_DPI, bbox_inches=CHART_BBOX, format='png')
    print(f"Chart saved: {filepath}")
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# DATA VALIDATION
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
def validate_dataframe(df, required_columns=None):
    """
    Validate that a DataFrame has required columns and usable data.

    By default requires the configured revenue column and 'Year';
    'YearMonth' is optional. (The old default path appended 'YearMonth'
    to the required list only when it was already present, a check that
    could never fail - that dead logic is removed here.)

    Args:
        df: DataFrame to validate
        required_columns: List of required column names
            (defaults to [REVENUE_COLUMN, 'Year'])

    Returns:
        tuple: (is_valid bool, error_message str) - message is "OK" on success
    """
    if required_columns is None:
        required_columns = [REVENUE_COLUMN, 'Year']

    missing_cols = [col for col in required_columns if col not in df.columns]
    if missing_cols:
        return False, f"Missing required columns: {missing_cols}"

    if len(df) == 0:
        return False, "DataFrame is empty"

    # Guard against a load that produced the revenue column with no values
    if REVENUE_COLUMN in df.columns:
        if df[REVENUE_COLUMN].isna().all():
            return False, f"All {REVENUE_COLUMN} values are NaN"

    return True, "OK"
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# PRICE CALCULATION
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
def calculate_price_per_unit(df, quantity_col=None, revenue_col=None):
    """Compute the weighted-average price per unit over valid transactions.

    Rows whose quantity is not in (MIN_QUANTITY, MAX_QUANTITY] are
    excluded before dividing total revenue by total quantity.

    Args:
        df: DataFrame with quantity and revenue columns.
        quantity_col: Quantity column name (defaults to config.QUANTITY_COLUMN).
        revenue_col: Revenue column name (defaults to config.REVENUE_COLUMN).

    Returns:
        float: Average price per unit, or NaN when it cannot be computed
        (missing quantity column, no valid rows, or zero total quantity).
    """
    from config import QUANTITY_COLUMN, REVENUE_COLUMN, MIN_QUANTITY, MAX_QUANTITY

    qty_col = quantity_col if quantity_col is not None else QUANTITY_COLUMN
    rev_col = revenue_col if revenue_col is not None else REVENUE_COLUMN

    if qty_col not in df.columns:
        return np.nan

    # Keep only rows with plausible quantities (outlier guard from config)
    valid_mask = (df[qty_col] > MIN_QUANTITY) & (df[qty_col] <= MAX_QUANTITY)
    valid = df[valid_mask].copy()

    if len(valid) == 0:
        return np.nan

    total_quantity = valid[qty_col].sum()
    if total_quantity == 0:
        return np.nan

    return valid[rev_col].sum() / total_quantity
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# OUTPUT FORMATTING
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
def format_currency(value: float, millions: bool = True) -> str:
    """Format a raw currency amount for console output.

    Args:
        value: Amount in plain units (not pre-scaled).
        millions: When True render as millions ('$X.XXm'),
            otherwise as thousands ('$X.XXk').

    Returns:
        str: e.g. '$1.00m', '$1.00k', or 'N/A' when the value is NaN.

    Example:
        >>> format_currency(1000000)
        '$1.00m'
        >>> format_currency(1000, millions=False)
        '$1.00k'
    """
    if pd.isna(value):
        return "N/A"

    divisor, suffix = (1e6, 'm') if millions else (1e3, 'k')
    return f"${value / divisor:.2f}{suffix}"
|
||||||
|
|
||||||
|
def print_annual_summary(annual_df, metric_col='Revenue', label='Revenue'):
    """Print a formatted per-year summary of one metric to the console.

    Args:
        annual_df: DataFrame of annual metrics indexed by Year.
        metric_col: Column name to print.
        label: Heading label for the metric.
    """
    print(f"\n{label} by Year:")
    print('-' * 40)
    for yr in annual_df.index:
        amount = annual_df.loc[yr, metric_col]
        print(f"  {yr}: {format_currency(amount)}")
    print()
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# DATA FILTERING HELPERS
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
def apply_exclusion_filters(df):
    """Drop rows matching the rules in config.EXCLUSION_FILTERS.

    Args:
        df: DataFrame to filter.

    Returns:
        Filtered DataFrame; the input unchanged when filtering is
        disabled or the rule is incomplete/inapplicable.
    """
    from config import EXCLUSION_FILTERS

    if not EXCLUSION_FILTERS.get('enabled', False):
        return df

    column = EXCLUSION_FILTERS.get('exclude_by_column')
    values = EXCLUSION_FILTERS.get('exclude_values', [])

    # Only filter when the rule is fully specified and the column exists
    if not (column and values and column in df.columns):
        return df

    before = len(df)
    kept = df[~df[column].isin(values)]
    dropped = before - len(kept)
    if dropped > 0:
        print(f"Excluded {dropped:,} rows based on {column} filter")
    return kept
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# INTERACTIVE VISUALIZATIONS (OPTIONAL - PLOTLY)
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
def create_interactive_chart(data, chart_type='line', title=None, xlabel=None, ylabel=None):
    """
    Create an interactive chart using Plotly (optional dependency)

    Args:
        data: dict with 'x' and 'y' sequences of chart data
        chart_type: Type of chart ('line', 'bar', 'scatter')
        title: Chart title
        xlabel: X-axis label
        ylabel: Y-axis label

    Returns:
        plotly.graph_objects.Figure: Plotly figure object

    Raises:
        ImportError: If plotly is not installed

    Example:
        fig = create_interactive_chart(
            {'x': [1, 2, 3], 'y': [10, 20, 30]},
            chart_type='line',
            title='Revenue Trend'
        )
        fig.show()
    """
    try:
        # Fix: removed unused `from plotly.subplots import make_subplots`
        import plotly.graph_objects as go
    except ImportError:
        raise ImportError(
            "plotly is required for interactive charts. Install with: pip install plotly"
        )

    fig = go.Figure()

    # All supported chart types expect {'x': [...], 'y': [...]} data
    if isinstance(data, dict) and 'x' in data and 'y' in data:
        if chart_type == 'line':
            fig.add_trace(go.Scatter(
                x=data['x'],
                y=data['y'],
                mode='lines+markers',
                name='Data'
            ))
        elif chart_type == 'scatter':
            # Fix: 'scatter' was documented but previously unimplemented
            fig.add_trace(go.Scatter(
                x=data['x'],
                y=data['y'],
                mode='markers',
                name='Data'
            ))
        elif chart_type == 'bar':
            fig.add_trace(go.Bar(
                x=data['x'],
                y=data['y'],
                name='Data'
            ))

    if title:
        fig.update_layout(title=title)
    if xlabel:
        fig.update_xaxes(title_text=xlabel)
    if ylabel:
        fig.update_yaxes(title_text=ylabel)

    fig.update_layout(
        template='plotly_white',
        hovermode='x unified'
    )

    return fig
|
||||||
|
|
||||||
|
def save_interactive_chart(fig, filename, output_dir=None):
    """
    Save an interactive Plotly figure as a standalone HTML file.

    Args:
        fig: Plotly figure object
        filename: Output filename (e.g., 'chart.html')
        output_dir: Output directory (defaults to config.OUTPUT_DIR)

    Returns:
        Path: Full path of the written HTML file
    """
    if output_dir is None:
        output_dir = OUTPUT_DIR
    else:
        output_dir = Path(output_dir)

    # Fix: parents=True so nested output paths don't raise on first use
    output_dir.mkdir(parents=True, exist_ok=True)
    filepath = output_dir / filename

    fig.write_html(str(filepath))
    print(f"Interactive chart saved: {filepath}")

    return filepath
|
||||||
277
config.py
Normal file
277
config.py
Normal file
@@ -0,0 +1,277 @@
|
|||||||
|
"""
|
||||||
|
Configuration file for sales analysis scripts
|
||||||
|
CONFIGURE THIS FILE FOR YOUR COMPANY'S SPECIFIC DATA STRUCTURE
|
||||||
|
|
||||||
|
This file should be customized based on:
|
||||||
|
- Your data file name and location
|
||||||
|
- Column names in your sales data
|
||||||
|
- Date range and LTM period
|
||||||
|
- Company-specific settings
|
||||||
|
|
||||||
|
CRITICAL: All column names, file paths, and settings are defined here.
|
||||||
|
Never hardcode these values in analysis scripts - always import from config.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
from config import REVENUE_COLUMN, DATE_COLUMN, get_data_path
|
||||||
|
revenue = df[REVENUE_COLUMN].sum() # ✅ Correct
|
||||||
|
revenue = df['USD'].sum() # ❌ Wrong - hardcoded
|
||||||
|
|
||||||
|
Quick Setup:
|
||||||
|
1. Run: python setup_wizard.py (interactive configuration)
|
||||||
|
2. Or manually edit this file following the TODO comments
|
||||||
|
3. Validate: python config_validator.py
|
||||||
|
|
||||||
|
See Also:
|
||||||
|
- .cursor/rules/analysis_patterns.md - How to use config values
|
||||||
|
- setup_wizard.py - Interactive configuration tool
|
||||||
|
- config_validator.py - Configuration validation
|
||||||
|
"""
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional, Tuple
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# COMPANY INFORMATION
|
||||||
|
# ============================================================================
|
||||||
|
# TODO: Update these values for your company
|
||||||
|
COMPANY_NAME = "Your Company Name" # Update this
|
||||||
|
ANALYSIS_DATE = "2026-01-12" # Update this to current date
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# DATA FILES
|
||||||
|
# ============================================================================
|
||||||
|
# TODO: Update with your actual data file name
|
||||||
|
DATA_FILE = 'sales_data.csv' # Update this to your CSV file name
|
||||||
|
OUTPUT_DIR = Path('charts')
|
||||||
|
REPORTS_DIR = Path('reports')
|
||||||
|
DATA_DIR = Path('data') # Optional: if data is in a subdirectory
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# DATA COLUMN MAPPINGS
|
||||||
|
# ============================================================================
|
||||||
|
# TODO: Map these to your actual column names
|
||||||
|
# These are the expected column names - update if your CSV uses different names
|
||||||
|
|
||||||
|
# Revenue column (REQUIRED)
|
||||||
|
REVENUE_COLUMN = 'USD' # Common alternatives: 'Amount', 'Revenue', 'Total', 'Sales'
|
||||||
|
|
||||||
|
# Date columns (at least one required)
|
||||||
|
DATE_COLUMN = 'InvoiceDate' # Primary date column
|
||||||
|
DATE_FALLBACK_COLUMNS = ['Month', 'Year'] # Fallback columns if primary is missing
|
||||||
|
|
||||||
|
# Customer/Account columns
|
||||||
|
CUSTOMER_COLUMN = 'Customer' # Common alternatives: 'Account', 'CustomerName', 'Client'
|
||||||
|
|
||||||
|
# Product/Item columns
|
||||||
|
ITEM_COLUMN = 'Item' # Common alternatives: 'Product', 'SKU', 'ItemCode'
|
||||||
|
PRODUCT_GROUP_COLUMN = 'ProductGroup' # Optional: for product categorization
|
||||||
|
QUANTITY_COLUMN = 'Quantity' # Optional: for price calculations
|
||||||
|
|
||||||
|
# Geographic columns (optional)
|
||||||
|
REGION_COLUMN = 'Region' # Optional: for geographic analysis
|
||||||
|
COUNTRY_COLUMN = 'Country' # Optional: for country-level analysis
|
||||||
|
|
||||||
|
# Segment/Category columns (optional - customize based on your data)
|
||||||
|
SEGMENT_COLUMNS = {
|
||||||
|
'Technology': 'Technology', # Optional: technology/product type
|
||||||
|
'EndMarket': 'EndMarket', # Optional: end market/industry
|
||||||
|
'ProductGroup': 'ProductGroup', # Optional: product category
|
||||||
|
}
|
||||||
|
|
||||||
|
# Invoice/Transaction columns
|
||||||
|
INVOICE_NUMBER_COLUMN = 'Invoice #' # Optional: for transaction-level analysis
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# DATE RANGE CONFIGURATION
|
||||||
|
# ============================================================================
|
||||||
|
# TODO: Update these based on your data and analysis needs
|
||||||
|
|
||||||
|
# Analysis years (years to include in analysis)
|
||||||
|
ANALYSIS_YEARS = [2021, 2022, 2023, 2024, 2025] # Update based on your data
|
||||||
|
|
||||||
|
# LTM (Last Twelve Months) Configuration
|
||||||
|
# For the most recent partial year, use LTM for apples-to-apples comparison
|
||||||
|
# Example: If latest data is through September 2025, use Oct 2024 - Sep 2025
|
||||||
|
LTM_ENABLED = True # Set to False if you have complete calendar years only
|
||||||
|
LTM_START_MONTH = 10 # Month number (1-12) for LTM start
|
||||||
|
LTM_START_YEAR = 2024 # Year for LTM start
|
||||||
|
LTM_END_MONTH = 9 # Month number (1-12) for LTM end
|
||||||
|
LTM_END_YEAR = 2025 # Year for LTM end
|
||||||
|
|
||||||
|
# Generate LTM period objects
|
||||||
|
# Derive the LTM boundaries and display label once at import time so other
# modules (e.g. config_validator) can import LTM_START/LTM_END/LTM_LABEL.
if LTM_ENABLED:
    LTM_START = pd.Period(f'{LTM_START_YEAR}-{LTM_START_MONTH:02d}', freq='M')
    LTM_END = pd.Period(f'{LTM_END_YEAR}-{LTM_END_MONTH:02d}', freq='M')
    # e.g. "2025 (LTM 9/2025)" - surfaced in chart titles via get_ltm_label()
    LTM_LABEL = f'{LTM_END_YEAR} (LTM {LTM_END_MONTH}/{LTM_END_YEAR})'
else:
    # LTM disabled: expose explicit None sentinels for all three names
    LTM_START = None
    LTM_END = None
    LTM_LABEL = None
|
||||||
|
|
||||||
|
# Data date range (filter data to this range)
|
||||||
|
MIN_YEAR = 2021 # Minimum year to include
|
||||||
|
MAX_DATE = pd.Timestamp('2025-09-30') # Maximum date to include (update based on your data)
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# CHART SETTINGS
|
||||||
|
# ============================================================================
|
||||||
|
CHART_DPI = 300
|
||||||
|
CHART_FORMAT = 'png'
|
||||||
|
CHART_BBOX = 'tight'
|
||||||
|
CHART_STYLE = 'seaborn-v0_8' # Options: 'default', 'ggplot', 'seaborn-v0_8', etc.
|
||||||
|
|
||||||
|
# Chart size presets
|
||||||
|
CHART_SIZES = {
|
||||||
|
'small': (6, 4),
|
||||||
|
'medium': (10, 6),
|
||||||
|
'large': (12, 8),
|
||||||
|
'wide': (14, 6)
|
||||||
|
}
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# DATA FILTERING
|
||||||
|
# ============================================================================
|
||||||
|
# Quantity filtering for price calculations (exclude outliers)
|
||||||
|
MIN_QUANTITY = 0 # Minimum valid quantity
|
||||||
|
MAX_QUANTITY = 1000 # Maximum valid quantity (adjust based on your data)
|
||||||
|
|
||||||
|
# Revenue filtering (optional - exclude negative values, returns, etc.)
|
||||||
|
EXCLUDE_NEGATIVE_REVENUE = False # Set to True to exclude negative revenue (returns/credits)
|
||||||
|
MIN_REVENUE = None # Optional: minimum revenue threshold
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# EXCLUSION FILTERS (Optional)
|
||||||
|
# ============================================================================
|
||||||
|
# Use this section to exclude specific segments, customers, or products
|
||||||
|
# Example: Exclude a business unit, test accounts, etc.
|
||||||
|
|
||||||
|
EXCLUSION_FILTERS = {
|
||||||
|
'enabled': False, # Set to True to enable exclusions
|
||||||
|
'exclude_by_column': None, # Column name to filter on (e.g., 'Country', 'Segment')
|
||||||
|
'exclude_values': [], # List of values to exclude (e.g., ['KVT', 'Test'])
|
||||||
|
}
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# VALIDATION THRESHOLDS (Optional)
|
||||||
|
# ============================================================================
|
||||||
|
# Expected revenue ranges for validation (update based on your company)
|
||||||
|
# These are used to validate that data loading is working correctly
|
||||||
|
VALIDATION_ENABLED = False # Set to True to enable validation
|
||||||
|
EXPECTED_REVENUE = {} # Example: {2021: 99_880_000, 2024: 89_990_000}
|
||||||
|
REVENUE_TOLERANCE_PCT = 0.01 # 1% tolerance for validation
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# HELPER FUNCTIONS
|
||||||
|
# ============================================================================
|
||||||
|
def ensure_directories() -> None:
    """
    Create output directories (charts/, reports/) if they don't exist

    Called automatically by get_chart_path() and get_report_path().

    Note: DATA_DIR is deliberately NOT created here. get_data_path()
    uses DATA_DIR.exists() to decide whether data lives in a
    subdirectory, so creating it would change data-lookup behavior.
    (The old `if DATA_DIR.exists(): DATA_DIR.mkdir(...)` branch was a
    no-op - it only ran when the directory already existed - and has
    been removed.)

    Returns:
        None: Creates directories in place
    """
    OUTPUT_DIR.mkdir(exist_ok=True)
    REPORTS_DIR.mkdir(exist_ok=True)
|
||||||
|
|
||||||
|
def get_chart_path(filename: str) -> Path:
    """Return the full path for a chart file inside OUTPUT_DIR.

    Ensures the output directories exist before returning.

    Args:
        filename: Chart filename (e.g. 'revenue_trend.png').

    Returns:
        Path: OUTPUT_DIR joined with filename.
    """
    ensure_directories()
    return OUTPUT_DIR.joinpath(filename)
|
||||||
|
|
||||||
|
def get_report_path(filename: str) -> Path:
    """Return the full path for a report file inside REPORTS_DIR.

    Ensures the output directories exist before returning.

    Args:
        filename: Report filename (e.g. 'analysis_report.pdf').

    Returns:
        Path: REPORTS_DIR joined with filename.
    """
    ensure_directories()
    return REPORTS_DIR.joinpath(filename)
|
||||||
|
|
||||||
|
def get_data_path(filename: Optional[str] = None) -> Path:
    """Return the full path to a data file.

    Resolution order: DATA_DIR when that directory exists, otherwise the
    current working directory. Falls back to config.DATA_FILE when no
    filename is supplied.

    Args:
        filename: Optional filename override (defaults to DATA_FILE).

    Returns:
        Path: Full path to the data file.

    Example:
        >>> data_path = get_data_path()
        >>> print(f"Loading from: {data_path}")
    """
    name = filename if filename is not None else DATA_FILE
    if DATA_DIR.exists():
        return DATA_DIR / name
    return Path(name)
|
||||||
|
|
||||||
|
def get_ltm_period() -> Tuple[Optional[pd.Period], Optional[pd.Period]]:
    """Return the configured LTM (Last Twelve Months) period boundaries.

    Returns:
        (LTM_START, LTM_END) when LTM is enabled and both boundaries
        are configured; otherwise (None, None).

    Example:
        >>> ltm_start, ltm_end = get_ltm_period()
        >>> if ltm_start and ltm_end:
        ...     print(f"LTM: {ltm_start} to {ltm_end}")

    See Also:
        - get_ltm_label() - formatted LTM label string
        - .cursor/rules/ltm_methodology.md - LTM explanation
    """
    if not (LTM_ENABLED and LTM_START and LTM_END):
        return None, None
    return LTM_START, LTM_END
|
||||||
|
|
||||||
|
def get_ltm_label() -> Optional[str]:
    """
    Return the LTM label string for display, or None when LTM is disabled.

    The label is the configured LTM_LABEL (e.g. "2025 (LTM 9/2025)") and is
    intended for chart titles and axis labels.

    Returns:
        Optional[str]: LTM label string, or None if LTM disabled

    Example:
        >>> from config import get_ltm_label
        >>> ltm_label = get_ltm_label()
        >>> if ltm_label:
        ...     title = f'Revenue Trend\n({ltm_label})'

    See Also:
        - get_ltm_period() - Get LTM period objects
        - .cursor/rules/ltm_methodology.md - LTM usage guide
    """
    if not LTM_ENABLED:
        return None
    return LTM_LABEL
|
||||||
214
config_validator.py
Normal file
214
config_validator.py
Normal file
@@ -0,0 +1,214 @@
|
|||||||
|
"""
|
||||||
|
Configuration validation utility
|
||||||
|
Validates configuration settings against data to catch errors early
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
from config_validator import validate_config
|
||||||
|
|
||||||
|
# Validate configuration
|
||||||
|
errors, warnings = validate_config(df)
|
||||||
|
if errors:
|
||||||
|
print("Configuration errors found:", errors)
|
||||||
|
"""
|
||||||
|
import pandas as pd
|
||||||
|
from pathlib import Path
|
||||||
|
from config import (
|
||||||
|
DATA_FILE, REVENUE_COLUMN, DATE_COLUMN, DATE_FALLBACK_COLUMNS,
|
||||||
|
CUSTOMER_COLUMN, ITEM_COLUMN, QUANTITY_COLUMN,
|
||||||
|
MIN_YEAR, MAX_DATE, ANALYSIS_YEARS,
|
||||||
|
LTM_ENABLED, LTM_START, LTM_END, LTM_START_YEAR, LTM_END_YEAR,
|
||||||
|
EXCLUSION_FILTERS, get_data_path
|
||||||
|
)
|
||||||
|
|
||||||
|
def validate_config(df=None):
    """
    Validate configuration against data.

    Runs a series of checks (required columns, date coverage, numeric
    revenue, configured date ranges, LTM settings, exclusion filters,
    optional columns, data file existence) and collects problems as
    errors (must fix) or warnings (should review).

    Args:
        df: Optional DataFrame to validate against. If None, attempts to load data.

    Returns:
        tuple: (errors list, warnings list)

    Example:
        errors, warnings = validate_config(df)
        if errors:
            for error in errors:
                print(f"ERROR: {error}")
        if warnings:
            for warning in warnings:
                print(f"WARNING: {warning}")
    """
    errors = []
    warnings = []

    # Load data if not provided; abort early if that is impossible since
    # every later check needs a DataFrame.
    if df is None:
        try:
            from data_loader import load_sales_data
            data_path = get_data_path()
            if not data_path.exists():
                errors.append(f"Data file not found: {data_path}")
                return errors, warnings
            df = load_sales_data(data_path)
        except Exception as e:
            errors.append(f"Could not load data for validation: {e}")
            return errors, warnings

    # 1. Validate required columns exist
    required_columns = [REVENUE_COLUMN, DATE_COLUMN]
    for col in required_columns:
        if col not in df.columns:
            errors.append(f"Required column '{col}' not found in data. Available columns: {list(df.columns)[:10]}")

    # 2. Validate date column has valid dates
    if DATE_COLUMN in df.columns:
        date_coverage = df[DATE_COLUMN].notna().sum() / len(df) * 100
        if date_coverage < 50:
            errors.append(f"Date coverage is very low ({date_coverage:.1f}%). Check date column configuration.")
        elif date_coverage < 90:
            warnings.append(f"Date coverage is {date_coverage:.1f}%. Consider adding fallback date columns.")

    # 3. Validate fallback date columns
    if DATE_FALLBACK_COLUMNS:
        missing_fallbacks = [col for col in DATE_FALLBACK_COLUMNS if col not in df.columns]
        if missing_fallbacks:
            warnings.append(f"Fallback date columns not found: {missing_fallbacks}")

    # 4. Validate revenue column is numeric
    if REVENUE_COLUMN in df.columns:
        try:
            # FIX: count validity on the coerced series. Previously the
            # result of to_numeric() was discarded and notna() was taken on
            # the original column, so non-numeric strings counted as valid.
            numeric_revenue = pd.to_numeric(df[REVENUE_COLUMN], errors='coerce')
            valid_revenue = numeric_revenue.notna().sum()
            if valid_revenue == 0:
                errors.append(f"Revenue column '{REVENUE_COLUMN}' has no valid numeric values")
            elif valid_revenue < len(df) * 0.9:
                warnings.append(f"Revenue column has {len(df) - valid_revenue} invalid values")
        except Exception:
            errors.append(f"Revenue column '{REVENUE_COLUMN}' cannot be converted to numeric")

    # 5. Validate date range
    if DATE_COLUMN in df.columns and df[DATE_COLUMN].notna().any():
        min_date_in_data = df[DATE_COLUMN].min()
        max_date_in_data = df[DATE_COLUMN].max()

        if MIN_YEAR and min_date_in_data.year > MIN_YEAR:
            warnings.append(f"MIN_YEAR ({MIN_YEAR}) is earlier than earliest data ({min_date_in_data.year})")

        if MAX_DATE and max_date_in_data > MAX_DATE:
            warnings.append(f"MAX_DATE ({MAX_DATE.date()}) is earlier than latest data ({max_date_in_data.date()})")

    # 6. Validate analysis years
    if 'Year' in df.columns:
        available_years = sorted(df['Year'].unique())
        missing_years = [year for year in ANALYSIS_YEARS if year not in available_years]
        if missing_years:
            warnings.append(f"ANALYSIS_YEARS includes years not in data: {missing_years}")

    # 7. Validate LTM configuration
    if LTM_ENABLED:
        if LTM_START is None or LTM_END is None:
            errors.append("LTM_ENABLED is True but LTM_START or LTM_END is None")
        else:
            if LTM_START > LTM_END:
                errors.append(f"LTM_START ({LTM_START}) is after LTM_END ({LTM_END})")

            if 'YearMonth' in df.columns:
                available_periods = df['YearMonth'].unique()
                if LTM_START not in available_periods:
                    warnings.append(f"LTM_START ({LTM_START}) not found in data")
                if LTM_END not in available_periods:
                    warnings.append(f"LTM_END ({LTM_END}) not found in data")

    # 8. Validate exclusion filters
    if EXCLUSION_FILTERS.get('enabled', False):
        exclude_col = EXCLUSION_FILTERS.get('exclude_by_column')
        if exclude_col:
            if exclude_col not in df.columns:
                errors.append(f"Exclusion filter column '{exclude_col}' not found in data")
            else:
                exclude_values = EXCLUSION_FILTERS.get('exclude_values', [])
                if exclude_values:
                    available_values = df[exclude_col].unique()
                    invalid_values = [v for v in exclude_values if v not in available_values]
                    if invalid_values:
                        warnings.append(f"Exclusion filter values not found in data: {invalid_values}")

    # 9. Validate optional columns (warnings only)
    optional_columns = {
        'Customer': CUSTOMER_COLUMN,
        'Item': ITEM_COLUMN,
        'Quantity': QUANTITY_COLUMN
    }

    for col_type, col_name in optional_columns.items():
        if col_name and col_name not in df.columns:
            warnings.append(f"Optional {col_type} column '{col_name}' not found. Some analyses may not work.")

    # 10. Validate data file exists (also when df was passed in directly)
    data_path = get_data_path()
    if not data_path.exists():
        errors.append(f"Data file not found: {data_path}")

    return errors, warnings
|
||||||
|
|
||||||
|
def print_validation_report(errors, warnings):
    """
    Print a formatted validation report.

    Args:
        errors: List of error messages
        warnings: List of warning messages

    Returns:
        bool: True when there are no errors, False otherwise
    """
    divider = "=" * 60
    print("\n" + divider)
    print("Configuration Validation Report")
    print(divider)

    if errors:
        print(f"\n❌ ERRORS ({len(errors)}):")
        for i, error in enumerate(errors, 1):
            print(f"  {i}. {error}")
    else:
        print("\n✅ No configuration errors found")

    if warnings:
        print(f"\n⚠️ WARNINGS ({len(warnings)}):")
        for i, warning in enumerate(warnings, 1):
            print(f"  {i}. {warning}")
    else:
        print("\n✅ No warnings")

    print("\n" + divider)

    # Validation passes only when the error list is empty
    return not errors
|
||||||
|
|
||||||
|
def validate_and_report(df=None):
    """
    Validate configuration and print the resulting report.

    Args:
        df: Optional DataFrame to validate against

    Returns:
        bool: True if no errors, False otherwise
    """
    found_errors, found_warnings = validate_config(df)
    return print_validation_report(found_errors, found_warnings)
|
||||||
|
|
||||||
|
# ============================================================================
# STANDALONE VALIDATION SCRIPT
# ============================================================================

if __name__ == "__main__":
    """Run configuration validation"""
    print("Validating configuration...")
    if validate_and_report():
        print("\n✅ Configuration is valid!")
        exit(0)
    print("\n❌ Configuration has errors. Please fix them before running analyses.")
    exit(1)
|
||||||
224
data_loader.py
Normal file
224
data_loader.py
Normal file
@@ -0,0 +1,224 @@
|
|||||||
|
"""
|
||||||
|
Generic data loading utility with flexible date handling
|
||||||
|
Handles various date column formats and fallback logic
|
||||||
|
|
||||||
|
This loader is designed to work with different CSV structures by:
|
||||||
|
1. Trying primary date column first
|
||||||
|
2. Falling back to alternative date columns if needed
|
||||||
|
3. Ensuring 100% date coverage
|
||||||
|
"""
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
from pathlib import Path
|
||||||
|
from config import (
|
||||||
|
REVENUE_COLUMN, DATE_COLUMN, DATE_FALLBACK_COLUMNS,
|
||||||
|
get_data_path
|
||||||
|
)
|
||||||
|
|
||||||
|
def load_sales_data(filepath=None):
    """
    Load sales data with flexible date handling.

    This function provides intelligent data loading with fallback logic:
    1. Loads the CSV file
    2. Converts the revenue column to numeric
    3. Attempts to parse dates using the primary date column
    4. Falls back to alternative date columns if needed (100% coverage)
    5. Creates Year and YearMonth columns for analysis

    CRITICAL: Always use this function instead of pd.read_csv() directly.
    This ensures proper date parsing with fallback logic.

    Args:
        filepath: Path to the CSV file (defaults to config.DATA_FILE).
            Can be str, Path, or None (uses config.get_data_path())

    Returns:
        pd.DataFrame: DataFrame with properly parsed dates and revenue.
            Includes 'Year' and 'YearMonth' columns.

    Raises:
        FileNotFoundError: If data file doesn't exist.
            Error message includes file path and suggests checking config.py
        ValueError: If required columns (REVENUE_COLUMN) are missing.
            Error message lists available columns and suggests updating config.py

    Example:
        >>> from data_loader import load_sales_data
        >>> from config import get_data_path
        >>> df = load_sales_data(get_data_path())

    See Also:
        - .cursor/rules/data_loading.md for detailed patterns
        - config.py for column name configuration
    """
    # Resolve the data file path
    if filepath is None:
        filepath = get_data_path()
    else:
        filepath = Path(filepath)

    if not filepath.exists():
        raise FileNotFoundError(
            f"Data file not found: {filepath}\n"
            f"Please update config.py with the correct DATA_FILE path."
        )

    # Load CSV
    print(f"Loading data from: {filepath}")
    df = pd.read_csv(filepath, low_memory=False)
    print(f"Loaded {len(df):,} rows")

    # Validate required columns
    if REVENUE_COLUMN not in df.columns:
        raise ValueError(
            f"Required column '{REVENUE_COLUMN}' not found in data.\n"
            f"Available columns: {list(df.columns)}\n"
            f"Please update config.py REVENUE_COLUMN to match your data."
        )

    # Convert revenue column to numeric (invalid values become NaN)
    df[REVENUE_COLUMN] = pd.to_numeric(df[REVENUE_COLUMN], errors='coerce')

    missing_revenue = df[REVENUE_COLUMN].isna().sum()
    if missing_revenue > 0:
        print(f"Warning: {missing_revenue:,} rows have missing/invalid revenue values")

    # FIX: preserve the original CSV 'Year' column BEFORE it is overwritten
    # with the year extracted from parsed dates. Previously the Year
    # backfill read the already-overwritten column, making it a no-op.
    original_year = None
    if 'Year' in df.columns:
        original_year = pd.to_numeric(df['Year'], errors='coerce')

    # Working date column accumulates the best available date per row
    df['WorkingDate'] = pd.NaT

    # 1) Try primary date column first
    if DATE_COLUMN in df.columns:
        print(f"Attempting to parse {DATE_COLUMN}...")
        df['Date_Parsed'] = pd.to_datetime(df[DATE_COLUMN], errors='coerce', format='mixed')
        parsed_count = df['Date_Parsed'].notna().sum()
        df.loc[df['Date_Parsed'].notna(), 'WorkingDate'] = df.loc[df['Date_Parsed'].notna(), 'Date_Parsed']
        print(f"  Parsed {parsed_count:,} dates from {DATE_COLUMN}")
    else:
        print(f"Warning: Primary date column '{DATE_COLUMN}' not found")

    # 2) Use fallback date columns, in configured order, for rows still missing
    if DATE_FALLBACK_COLUMNS:
        for fallback_col in DATE_FALLBACK_COLUMNS:
            if fallback_col in df.columns:
                missing_dates = df['WorkingDate'].isna()
                if missing_dates.sum() > 0:
                    print(f"Using fallback column: {fallback_col}...")
                    fallback_parsed = pd.to_datetime(
                        df.loc[missing_dates, fallback_col],
                        errors='coerce',
                        format='mixed'
                    )
                    newly_parsed = missing_dates & fallback_parsed.notna()
                    if newly_parsed.sum() > 0:
                        df.loc[newly_parsed, 'WorkingDate'] = fallback_parsed[newly_parsed]
                        print(f"  Parsed {newly_parsed.sum():,} additional dates from {fallback_col}")

    # 3) Final fallback: construct Jan-1 dates from the Year column
    if 'Year' in df.columns and df['WorkingDate'].isna().sum() > 0:
        missing_dates = df['WorkingDate'].isna()
        year_values = pd.to_numeric(df.loc[missing_dates, 'Year'], errors='coerce')
        valid_years = missing_dates & year_values.notna()
        if valid_years.sum() > 0:
            print(f"Using Year column for remaining {valid_years.sum():,} rows...")
            df.loc[valid_years, 'WorkingDate'] = pd.to_datetime(
                df.loc[valid_years, 'Year'].astype(int).astype(str) + '-01-01',
                errors='coerce'
            )

    # Promote WorkingDate to the primary date column, then drop temporaries
    df[DATE_COLUMN] = df['WorkingDate']
    df = df.drop(columns=['Date_Parsed', 'WorkingDate'], errors='ignore')

    # Extract Year from the parsed dates
    df['Year'] = df[DATE_COLUMN].dt.year

    # Backfill Year from the preserved original column where dates are missing
    if original_year is not None:
        fillable = df['Year'].isna() & original_year.notna()
        if fillable.sum() > 0:
            df.loc[fillable, 'Year'] = original_year[fillable]

    # Create YearMonth for monthly analysis
    if DATE_COLUMN in df.columns:
        df['YearMonth'] = df[DATE_COLUMN].dt.to_period('M')

    # Report date coverage
    total_rows = len(df)
    date_coverage = df[DATE_COLUMN].notna().sum()
    coverage_pct = (date_coverage / total_rows * 100) if total_rows > 0 else 0
    print(f"Date coverage: {date_coverage:,} / {total_rows:,} rows ({coverage_pct:.1f}%)")

    if coverage_pct < 100:
        print(f"Warning: {total_rows - date_coverage:,} rows have missing dates")

    # Report date range
    if df[DATE_COLUMN].notna().any():
        min_date = df[DATE_COLUMN].min()
        max_date = df[DATE_COLUMN].max()
        print(f"Date range: {min_date.strftime('%Y-%m-%d')} to {max_date.strftime('%Y-%m-%d')}")

    return df
|
||||||
|
|
||||||
|
def validate_data_structure(df: pd.DataFrame) -> tuple[bool, str]:
    """
    Validate that loaded data has the expected structure.

    Checks required columns and basic data quality, returning an
    actionable message when validation fails.

    Args:
        df: DataFrame to validate (should be result of load_sales_data())

    Returns:
        tuple[bool, str]: (is_valid, error_message)
        - is_valid: True if data structure is valid, False otherwise
        - error_message: "OK" if valid, otherwise descriptive error message

    Example:
        >>> df = load_sales_data(get_data_path())
        >>> is_valid, msg = validate_data_structure(df)
        >>> if not is_valid:
        ...     print(f"ERROR: {msg}")

    See Also:
        - load_sales_data() - Load data before validating
        - config_validator.py - Comprehensive configuration validation
    """
    from config import REVENUE_COLUMN, DATE_COLUMN

    problems = []

    # Required columns must be present
    for required in (REVENUE_COLUMN, DATE_COLUMN):
        if required not in df.columns:
            problems.append(f"Missing required column: {required}")

    # Basic data-quality checks
    if len(df) == 0:
        problems.append("DataFrame is empty")

    if REVENUE_COLUMN in df.columns:
        if df[REVENUE_COLUMN].isna().all():
            problems.append(f"All {REVENUE_COLUMN} values are NaN")
        if df[REVENUE_COLUMN].notna().sum() == 0:
            problems.append(f"No valid {REVENUE_COLUMN} values")

    if DATE_COLUMN in df.columns:
        if df[DATE_COLUMN].isna().all():
            problems.append(f"All {DATE_COLUMN} values are NaN")

    if problems:
        return False, "; ".join(problems)
    return True, "OK"
|
||||||
285
data_processing.py
Normal file
285
data_processing.py
Normal file
@@ -0,0 +1,285 @@
|
|||||||
|
"""
|
||||||
|
Data processing utilities
|
||||||
|
Common data cleaning and transformation helpers
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
from data_processing import clean_data, create_pivot_table, prepare_time_series
|
||||||
|
|
||||||
|
# Clean data
|
||||||
|
df_clean = clean_data(df)
|
||||||
|
|
||||||
|
# Create pivot table
|
||||||
|
pivot = create_pivot_table(df, index='Year', columns='Product', values='Revenue')
|
||||||
|
"""
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
from config import REVENUE_COLUMN, DATE_COLUMN, MIN_QUANTITY, MAX_QUANTITY
|
||||||
|
|
||||||
|
def clean_data(df, remove_duplicates=True, handle_missing_dates=True):
    """
    Apply common cleaning operations to a sales DataFrame.

    Args:
        df: DataFrame to clean
        remove_duplicates: Whether to remove duplicate rows
        handle_missing_dates: Whether to report rows with missing dates

    Returns:
        DataFrame: Cleaned copy of the input
    """
    cleaned = df.copy()

    # Drop exact duplicate rows and report how many were removed
    if remove_duplicates:
        before = len(cleaned)
        cleaned = cleaned.drop_duplicates()
        dropped = before - len(cleaned)
        if dropped > 0:
            print(f"Removed {dropped:,} duplicate rows")

    # Report (but do not drop) rows whose date is missing
    if handle_missing_dates and DATE_COLUMN in cleaned.columns:
        missing_dates = cleaned[DATE_COLUMN].isna().sum()
        if missing_dates > 0:
            print(f"Warning: {missing_dates:,} rows have missing dates")

    # Report negative revenue; removal is deliberately left to the caller
    if REVENUE_COLUMN in cleaned.columns:
        negative_revenue = (cleaned[REVENUE_COLUMN] < 0).sum()
        if negative_revenue > 0:
            print(f"Found {negative_revenue:,} rows with negative revenue")
            # Optionally remove: cleaned = cleaned[cleaned[REVENUE_COLUMN] >= 0]

    return cleaned
|
||||||
|
|
||||||
|
def create_pivot_table(df, index, columns=None, values=None, aggfunc='sum', fill_value=0):
    """
    Build a pivot table with common defaults for sales analysis.

    Args:
        df: DataFrame
        index: Column(s) to use as index
        columns: Column(s) to use as columns
        values: Column(s) to aggregate (defaults to the configured revenue
            column when present)
        aggfunc: Aggregation function (default: 'sum')
        fill_value: Value to fill missing cells (default: 0)

    Returns:
        DataFrame: Pivot table
    """
    target_values = values
    if target_values is None and REVENUE_COLUMN in df.columns:
        target_values = REVENUE_COLUMN

    return pd.pivot_table(
        df,
        index=index,
        columns=columns,
        values=target_values,
        aggfunc=aggfunc,
        fill_value=fill_value,
    )
|
||||||
|
|
||||||
|
def prepare_time_series(df, date_column=None, value_column=None, freq='M'):
    """
    Aggregate a DataFrame into a resampled time series.

    Args:
        df: DataFrame
        date_column: Date column name (defaults to config.DATE_COLUMN)
        value_column: Value column to aggregate (defaults to config.REVENUE_COLUMN)
        freq: Frequency for resampling ('D', 'W', 'M', 'Q', 'Y')

    Returns:
        Series: Time series of summed values at the requested frequency

    Raises:
        ValueError: If the date or value column is not present.
    """
    date_col = DATE_COLUMN if date_column is None else date_column
    value_col = REVENUE_COLUMN if value_column is None else value_column

    # Guard clauses: both columns must exist before any work is done
    if date_col not in df.columns:
        raise ValueError(f"Date column '{date_col}' not found")
    if value_col not in df.columns:
        raise ValueError(f"Value column '{value_col}' not found")

    # Work on a copy so the caller's frame is untouched
    work = df.copy()
    work[date_col] = pd.to_datetime(work[date_col], errors='coerce')

    return work.set_index(date_col)[value_col].resample(freq).sum()
|
||||||
|
|
||||||
|
def aggregate_by_period(df, period='year', date_column=None, value_column=None):
    """
    Aggregate values by calendar period.

    Args:
        df: DataFrame
        period: Period type ('year', 'month', 'quarter')
        date_column: Date column name (defaults to config.DATE_COLUMN)
        value_column: Value column to aggregate (defaults to config.REVENUE_COLUMN)

    Returns:
        DataFrame: Columns Period, Total, Count, Average

    Raises:
        ValueError: If an unknown period type is requested.
    """
    date_col = DATE_COLUMN if date_column is None else date_column
    value_col = REVENUE_COLUMN if value_column is None else value_column

    work = df.copy()
    work[date_col] = pd.to_datetime(work[date_col], errors='coerce')

    # Dispatch table instead of an if/elif chain
    extractors = {
        'year': lambda s: s.dt.year,
        'month': lambda s: s.dt.to_period('M'),
        'quarter': lambda s: s.dt.to_period('Q'),
    }
    if period not in extractors:
        raise ValueError(f"Unknown period: {period}")
    work['Period'] = extractors[period](work[date_col])

    summary = work.groupby('Period')[value_col].agg(['sum', 'count', 'mean']).reset_index()
    summary.columns = ['Period', 'Total', 'Count', 'Average']
    return summary
|
||||||
|
|
||||||
|
def filter_outliers(df, column, method='iqr', lower_bound=None, upper_bound=None):
    """
    Remove outlier rows from a DataFrame based on one column.

    Args:
        df: DataFrame
        column: Column name to filter on
        method: 'iqr' (interquartile range) or 'zscore' (mean ± 3σ)
        lower_bound: Manual lower bound (overrides the computed one)
        upper_bound: Manual upper bound (overrides the computed one)

    Returns:
        DataFrame: Rows whose value lies within [lower, upper]

    Raises:
        ValueError: If an unknown method is requested.
    """
    # Compute default bounds for the chosen method
    if method == 'iqr':
        first_q = df[column].quantile(0.25)
        third_q = df[column].quantile(0.75)
        spread = third_q - first_q
        default_lower = first_q - 1.5 * spread
        default_upper = third_q + 1.5 * spread
    elif method == 'zscore':
        center = df[column].mean()
        sigma = df[column].std()
        default_lower = center - 3 * sigma
        default_upper = center + 3 * sigma
    else:
        raise ValueError(f"Unknown method: {method}")

    # Manual bounds take precedence over computed ones
    lower = default_lower if lower_bound is None else lower_bound
    upper = default_upper if upper_bound is None else upper_bound

    initial_count = len(df)
    kept = df[(df[column] >= lower) & (df[column] <= upper)].copy()
    removed = initial_count - len(kept)

    if removed > 0:
        print(f"Removed {removed:,} outliers from {column} ({removed/initial_count*100:.1f}%)")

    return kept
|
||||||
|
|
||||||
|
def normalize_column(df, column, method='min_max'):
    """
    Normalize a column to a standard scale.

    Args:
        df: DataFrame
        column: Column name to normalize
        method: Normalization method ('min_max' scales to [0, 1];
            'zscore' standardizes to zero mean, unit std)

    Returns:
        Series: Normalized values (all zeros when the column is constant)

    Raises:
        ValueError: If an unknown method is requested.
    """
    values = df[column]

    if method == 'min_max':
        low = values.min()
        high = values.max()
        if high - low == 0:
            # Constant column: avoid division by zero
            return pd.Series([0] * len(df), index=df.index)
        return (values - low) / (high - low)

    if method == 'zscore':
        center = values.mean()
        sigma = values.std()
        if sigma == 0:
            # Constant column: avoid division by zero
            return pd.Series([0] * len(df), index=df.index)
        return (values - center) / sigma

    raise ValueError(f"Unknown method: {method}")
|
||||||
|
|
||||||
|
def create_derived_columns(df):
    """
    Add commonly used derived columns (calendar parts, unit price).

    Args:
        df: DataFrame

    Returns:
        DataFrame: Copy of the input with derived columns added
    """
    from config import QUANTITY_COLUMN

    enriched = df.copy()

    # Calendar parts derived from the date column, added only when absent
    if DATE_COLUMN in enriched.columns:
        enriched[DATE_COLUMN] = pd.to_datetime(enriched[DATE_COLUMN], errors='coerce')
        dates = enriched[DATE_COLUMN]

        if 'Year' not in enriched.columns:
            enriched['Year'] = dates.dt.year
        if 'Month' not in enriched.columns:
            enriched['Month'] = dates.dt.month
        if 'Quarter' not in enriched.columns:
            enriched['Quarter'] = dates.dt.quarter
        if 'YearMonth' not in enriched.columns:
            enriched['YearMonth'] = dates.dt.to_period('M')

    # Unit price when both quantity and revenue are available;
    # zero quantities become NaN prices rather than dividing by zero
    if QUANTITY_COLUMN in enriched.columns and REVENUE_COLUMN in enriched.columns:
        enriched['Price_Per_Unit'] = enriched[REVENUE_COLUMN] / enriched[QUANTITY_COLUMN].replace(0, np.nan)

    return enriched
|
||||||
|
|
||||||
|
# ============================================================================
# EXAMPLE USAGE
# ============================================================================

if __name__ == "__main__":
    """Example usage"""
    # Build a small synthetic dataset for demonstration
    sample = pd.DataFrame({
        'InvoiceDate': pd.date_range('2023-01-01', periods=100, freq='D'),
        'USD': np.random.normal(1000, 200, 100),
        'Quantity': np.random.randint(1, 100, 100),
    })

    cleaned = clean_data(sample)
    print(f"Cleaned data: {len(cleaned)} rows")

    cleaned['Year'] = cleaned['InvoiceDate'].dt.year
    pivot = create_pivot_table(cleaned, index='Year', values='USD')
    print("\nPivot table:")
    print(pivot)

    ts = prepare_time_series(cleaned, freq='M')
    print(f"\nTime series: {len(ts)} periods")
|
||||||
344
data_quality.py
Normal file
344
data_quality.py
Normal file
@@ -0,0 +1,344 @@
|
|||||||
|
"""
|
||||||
|
Data quality reporting utility
|
||||||
|
Generates comprehensive data quality reports
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
from data_quality import generate_data_quality_report, print_data_quality_report
|
||||||
|
|
||||||
|
# Generate and print report
|
||||||
|
report = generate_data_quality_report(df)
|
||||||
|
print_data_quality_report(report)
|
||||||
|
"""
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
from config import (
|
||||||
|
REVENUE_COLUMN, DATE_COLUMN, CUSTOMER_COLUMN, ITEM_COLUMN,
|
||||||
|
QUANTITY_COLUMN, MIN_QUANTITY, MAX_QUANTITY
|
||||||
|
)
|
||||||
|
|
||||||
|
def generate_data_quality_report(df):
    """
    Generate comprehensive data quality report

    Sections cover dataset size, missing values, duplicate rows, outliers
    (revenue/quantity), dtype breakdown, date coverage, revenue summary
    statistics, and a prioritized list of detected issues.

    Args:
        df: DataFrame to analyze

    Returns:
        dict: Dictionary containing data quality metrics under the keys
            'overview', 'missing_values', 'duplicates', 'outliers',
            'data_types', 'date_coverage', 'revenue_summary', 'issues'
    """
    report = {
        'overview': _dq_overview(df),
        'missing_values': _dq_missing_values(df),
        'duplicates': _dq_duplicates(df),
        'outliers': _dq_outliers(df),
        'data_types': _dq_data_types(df),
        'date_coverage': _dq_date_coverage(df) if DATE_COLUMN in df.columns else {},
        'revenue_summary': _dq_revenue_summary(df) if REVENUE_COLUMN in df.columns else {},
    }
    report['issues'] = _dq_detect_issues(df, report)
    return report


def _dq_overview(df):
    """Basic size and deep-memory footprint stats."""
    return {
        'total_rows': len(df),
        'total_columns': len(df.columns),
        'memory_usage_mb': df.memory_usage(deep=True).sum() / 1024**2
    }


def _dq_missing_values(df):
    """Missing-value counts and percentages for columns that have any."""
    missing = df.isnull().sum()
    missing_pct = (missing / len(df)) * 100
    has_missing = missing[missing > 0]
    return {
        'by_column': has_missing.to_dict(),
        'percentages': missing_pct[missing > 0].to_dict(),
        'total_missing': missing.sum(),
        'columns_with_missing': len(has_missing)
    }


def _dq_duplicates(df):
    """Fully-duplicated row count and its share of the dataset."""
    duplicate_rows = df.duplicated().sum()
    return {
        'duplicate_rows': int(duplicate_rows),
        'duplicate_percentage': (duplicate_rows / len(df)) * 100 if len(df) > 0 else 0
    }


def _dq_iqr_bounds(series):
    """Tukey fences: (Q1 - 1.5*IQR, Q3 + 1.5*IQR)."""
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    iqr = q3 - q1
    return q1 - 1.5 * iqr, q3 + 1.5 * iqr


def _dq_outliers(df):
    """Outlier stats for revenue (IQR) and quantity (config bounds or IQR)."""
    outliers = {}

    if REVENUE_COLUMN in df.columns:
        revenue = pd.to_numeric(df[REVENUE_COLUMN], errors='coerce')
        lower_bound, upper_bound = _dq_iqr_bounds(revenue)
        revenue_outliers = ((revenue < lower_bound) | (revenue > upper_bound)).sum()
        outliers['revenue'] = {
            'count': int(revenue_outliers),
            'percentage': (revenue_outliers / len(df)) * 100 if len(df) > 0 else 0,
            'lower_bound': float(lower_bound),
            'upper_bound': float(upper_bound),
            # Negative revenue values are reported separately so callers can
            # spot credits/returns or sign errors.
            'negative_values': int((revenue < 0).sum())
        }

    if QUANTITY_COLUMN in df.columns:
        quantity = pd.to_numeric(df[QUANTITY_COLUMN], errors='coerce')
        if MIN_QUANTITY is not None and MAX_QUANTITY is not None:
            # Explicit config thresholds take precedence over IQR fences.
            quantity_outliers = ((quantity < MIN_QUANTITY) | (quantity > MAX_QUANTITY)).sum()
            outliers['quantity'] = {
                'count': int(quantity_outliers),
                'percentage': (quantity_outliers / len(df)) * 100 if len(df) > 0 else 0,
                'below_min': int((quantity < MIN_QUANTITY).sum()),
                'above_max': int((quantity > MAX_QUANTITY).sum())
            }
        else:
            lower_bound, upper_bound = _dq_iqr_bounds(quantity)
            quantity_outliers = ((quantity < lower_bound) | (quantity > upper_bound)).sum()
            outliers['quantity'] = {
                'count': int(quantity_outliers),
                'percentage': (quantity_outliers / len(df)) * 100 if len(df) > 0 else 0,
                'lower_bound': float(lower_bound),
                'upper_bound': float(upper_bound)
            }

    return outliers


def _dq_data_types(df):
    """Column names grouped by broad dtype family, plus a dtype histogram."""
    return {
        'numeric_columns': list(df.select_dtypes(include=[np.number]).columns),
        'datetime_columns': list(df.select_dtypes(include=['datetime64']).columns),
        'object_columns': list(df.select_dtypes(include=['object']).columns),
        'type_summary': df.dtypes.value_counts().to_dict()
    }


def _dq_date_coverage(df):
    """Share of rows with a non-null date plus the observed date range."""
    date_coverage = df[DATE_COLUMN].notna().sum()
    return {
        'total_rows': len(df),
        'rows_with_dates': int(date_coverage),
        'coverage_percentage': (date_coverage / len(df)) * 100 if len(df) > 0 else 0,
        'min_date': str(df[DATE_COLUMN].min()) if date_coverage > 0 else None,
        'max_date': str(df[DATE_COLUMN].max()) if date_coverage > 0 else None
    }


def _dq_revenue_summary(df):
    """Descriptive stats for numeric revenue; empty dict if none is valid."""
    revenue = pd.to_numeric(df[REVENUE_COLUMN], errors='coerce')
    valid_revenue = revenue.dropna()
    if len(valid_revenue) == 0:
        return {}
    return {
        'total_revenue': float(valid_revenue.sum()),
        'mean_revenue': float(valid_revenue.mean()),
        'median_revenue': float(valid_revenue.median()),
        'min_revenue': float(valid_revenue.min()),
        'max_revenue': float(valid_revenue.max()),
        'std_revenue': float(valid_revenue.std()),
        'valid_rows': int(len(valid_revenue)),
        'invalid_rows': int(len(df) - len(valid_revenue))
    }


def _dq_detect_issues(df, report):
    """Derive critical/warning issues from the assembled report sections."""
    issues = []

    # Critical: columns that are mostly empty.
    if report['missing_values']['columns_with_missing'] > 0:
        high_missing = {k: v for k, v in report['missing_values']['percentages'].items() if v > 50}
        if high_missing:
            issues.append({
                'severity': 'critical',
                'issue': f"Columns with >50% missing values: {list(high_missing.keys())}",
                'impact': 'High'
            })

    # Critical: too few rows carry a usable date.
    if DATE_COLUMN in df.columns:
        if report['date_coverage']['coverage_percentage'] < 50:
            issues.append({
                'severity': 'critical',
                'issue': f"Date coverage is only {report['date_coverage']['coverage_percentage']:.1f}%",
                'impact': 'High - analyses may fail'
            })

    # Critical: more than 10% of rows have non-numeric revenue.
    if REVENUE_COLUMN in df.columns:
        if report['revenue_summary'].get('invalid_rows', 0) > len(df) * 0.1:
            issues.append({
                'severity': 'critical',
                'issue': f"{report['revenue_summary']['invalid_rows']} rows have invalid revenue values",
                'impact': 'High'
            })

    # Warning: noticeable share of fully duplicated rows.
    if report['duplicates']['duplicate_percentage'] > 5:
        issues.append({
            'severity': 'warning',
            'issue': f"{report['duplicates']['duplicate_rows']} duplicate rows ({report['duplicates']['duplicate_percentage']:.1f}%)",
            'impact': 'Medium'
        })

    # Warning: unusually heavy revenue outlier share.
    outliers = report['outliers']
    if 'revenue' in outliers:
        if outliers['revenue']['percentage'] > 10:
            issues.append({
                'severity': 'warning',
                'issue': f"{outliers['revenue']['count']} revenue outliers ({outliers['revenue']['percentage']:.1f}%)",
                'impact': 'Medium'
            })

    return issues
|
||||||
|
|
||||||
|
def print_data_quality_report(report):
    """
    Print a formatted, sectioned data quality report to stdout.

    Args:
        report: Dictionary from generate_data_quality_report()
    """
    bar = "=" * 70
    rule = "-" * 70

    print("\n" + bar)
    print("DATA QUALITY REPORT")
    print(bar)

    # --- Overview ----------------------------------------------------------
    overview = report['overview']
    print("\n📊 OVERVIEW")
    print(rule)
    print(f"Total Rows: {overview['total_rows']:,}")
    print(f"Total Columns: {overview['total_columns']}")
    print(f"Memory Usage: {overview['memory_usage_mb']:.2f} MB")

    # --- Missing values ----------------------------------------------------
    missing = report['missing_values']
    print("\n🔍 MISSING VALUES")
    print(rule)
    if missing['columns_with_missing'] > 0:
        print(f"Columns with missing values: {missing['columns_with_missing']}")
        print(f"Total missing values: {missing['total_missing']:,}")
        print("\nTop columns by missing values:")
        worst = sorted(missing['percentages'].items(), key=lambda item: item[1], reverse=True)[:10]
        for col, pct in worst:
            count = missing['by_column'][col]
            print(f"  {col:30s}: {count:8,} ({pct:5.1f}%)")
    else:
        print("✅ No missing values found")

    # --- Duplicates --------------------------------------------------------
    dups = report['duplicates']
    print("\n🔄 DUPLICATES")
    print(rule)
    if dups['duplicate_rows'] > 0:
        print(f"Duplicate Rows: {dups['duplicate_rows']:,} ({dups['duplicate_percentage']:.2f}%)")
    else:
        print("✅ No duplicate rows found")

    # --- Outliers ----------------------------------------------------------
    outliers = report['outliers']
    print("\n📈 OUTLIERS")
    print(rule)
    if 'revenue' in outliers:
        rev_out = outliers['revenue']
        print(f"Revenue Outliers: {rev_out['count']:,} ({rev_out['percentage']:.2f}%)")
        if rev_out.get('negative_values', 0) > 0:
            print(f"  Negative Revenue Values: {rev_out['negative_values']:,}")
    if 'quantity' in outliers:
        qty_out = outliers['quantity']
        print(f"Quantity Outliers: {qty_out['count']:,} ({qty_out['percentage']:.2f}%)")
    if not outliers:
        print("✅ No significant outliers detected")

    # --- Date coverage (only present when a date column exists) ------------
    coverage = report['date_coverage']
    if coverage:
        print("\n📅 DATE COVERAGE")
        print(rule)
        print(f"Rows with Dates: {coverage['rows_with_dates']:,} / {coverage['total_rows']:,} ({coverage['coverage_percentage']:.1f}%)")
        if coverage['min_date']:
            print(f"Date Range: {coverage['min_date']} to {coverage['max_date']}")

    # --- Revenue summary ---------------------------------------------------
    revenue = report['revenue_summary']
    if revenue:
        print("\n💰 REVENUE SUMMARY")
        print(rule)
        print(f"Total Revenue: ${revenue['total_revenue'] / 1e6:.2f}m")
        print(f"Valid Rows: {revenue['valid_rows']:,} / {revenue['valid_rows'] + revenue['invalid_rows']:,}")
        if revenue['invalid_rows'] > 0:
            print(f"Invalid Rows: {revenue['invalid_rows']:,}")
        print(f"Mean: ${revenue['mean_revenue']:,.2f}")
        print(f"Median: ${revenue['median_revenue']:,.2f}")
        print(f"Min: ${revenue['min_revenue']:,.2f}")
        print(f"Max: ${revenue['max_revenue']:,.2f}")

    # --- Issues, grouped by severity ---------------------------------------
    if report['issues']:
        print("\n⚠️ ISSUES DETECTED")
        print(rule)
        critical = [i for i in report['issues'] if i['severity'] == 'critical']
        warnings = [i for i in report['issues'] if i['severity'] == 'warning']

        if critical:
            print("❌ CRITICAL ISSUES:")
            for issue in critical:
                print(f"  • {issue['issue']}")
                print(f"    Impact: {issue['impact']}")

        if warnings:
            print("\n⚠️ WARNINGS:")
            for issue in warnings:
                print(f"  • {issue['issue']}")
                print(f"    Impact: {issue['impact']}")
    else:
        print("\n✅ NO ISSUES DETECTED")

    print("\n" + bar)
|
||||||
|
|
||||||
|
def generate_data_quality_report_simple(df):
    """
    Generate a simple data quality summary (quick check)

    Args:
        df: DataFrame to analyze

    Returns:
        str: Simple pipe-separated summary string, e.g.
            "Rows: 1,000 | Columns: 5 | Valid Revenue: 990 (99.0%)"
    """
    total = len(df)
    summary_parts = [f"Rows: {total:,}", f"Columns: {len(df.columns)}"]

    if REVENUE_COLUMN in df.columns:
        revenue = pd.to_numeric(df[REVENUE_COLUMN], errors='coerce')
        valid = revenue.notna().sum()
        # Guard against division by zero on an empty frame, consistent with
        # the `if len(df) > 0 else 0` guards used elsewhere in this module.
        pct = (valid / total * 100) if total > 0 else 0
        summary_parts.append(f"Valid Revenue: {valid:,} ({pct:.1f}%)")

    if DATE_COLUMN in df.columns:
        date_coverage = df[DATE_COLUMN].notna().sum()
        pct = (date_coverage / total * 100) if total > 0 else 0
        summary_parts.append(f"Date Coverage: {date_coverage:,} ({pct:.1f}%)")

    return " | ".join(summary_parts)
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# STANDALONE DATA QUALITY CHECK
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Standalone entry point: load the configured dataset and print a full
    # quality report. Any failure is reported rather than raised.
    from data_loader import load_sales_data
    from config import get_data_path

    print("Loading data for quality check...")
    try:
        sales_df = load_sales_data(get_data_path())
        print_data_quality_report(generate_data_quality_report(sales_df))
    except Exception as e:
        print(f"ERROR: {e}")
|
||||||
134
examples/annual_revenue_trend.py
Normal file
134
examples/annual_revenue_trend.py
Normal file
@@ -0,0 +1,134 @@
|
|||||||
|
"""
|
||||||
|
Example: Annual Revenue Trend Analysis
|
||||||
|
Simple example showing annual revenue with LTM support
|
||||||
|
|
||||||
|
This is a working example that demonstrates:
|
||||||
|
- Loading data using data_loader
|
||||||
|
- Calculating annual metrics with LTM
|
||||||
|
- Creating a revenue trend chart
|
||||||
|
- Following template best practices
|
||||||
|
"""
|
||||||
|
import pandas as pd
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Import utilities
|
||||||
|
from data_loader import load_sales_data, validate_data_structure
|
||||||
|
from validate_revenue import validate_revenue
|
||||||
|
from analysis_utils import (
|
||||||
|
get_ltm_period_config, calculate_annual_metrics,
|
||||||
|
setup_revenue_chart, save_chart,
|
||||||
|
format_currency, print_annual_summary, sort_mixed_years,
|
||||||
|
apply_exclusion_filters
|
||||||
|
)
|
||||||
|
from config import (
|
||||||
|
OUTPUT_DIR, ANALYSIS_YEARS, MAX_DATE,
|
||||||
|
CHART_SIZES, ensure_directories, get_data_path, COMPANY_NAME,
|
||||||
|
REVENUE_COLUMN, MIN_YEAR, DATE_COLUMN
|
||||||
|
)
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# CONFIGURATION
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
# Metadata for this example analysis; ANALYSIS_NAME is printed in the
# console banner by main().
ANALYSIS_NAME = "Annual Revenue Trend"
DESCRIPTION = "Simple annual revenue trend analysis with LTM support"
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# MAIN ANALYSIS FUNCTION
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
def main():
    """Main analysis function"""

    banner = '=' * 60
    print(f"\n{banner}")
    print(f"{ANALYSIS_NAME}")
    print(f"{banner}\n")

    # 1. Load the transaction data.
    print("Loading data...")
    try:
        sales = load_sales_data(get_data_path())
        print(f"Loaded {len(sales):,} transactions")
    except Exception as e:
        print(f"ERROR loading data: {e}")
        return

    # 2. Check the required structure before analyzing.
    is_valid, msg = validate_data_structure(sales)
    if not is_valid:
        print(f"ERROR: {msg}")
        return
    print("Data validation passed")

    # 3. Drop configured exclusions.
    sales = apply_exclusion_filters(sales)

    # 4. Restrict to the configured analysis window.
    sales = sales[sales['Year'] >= MIN_YEAR]
    if DATE_COLUMN in sales.columns:
        sales = sales[sales[DATE_COLUMN] <= MAX_DATE]

    # 5. Resolve the LTM window, if enabled in config.
    ltm_start, ltm_end = get_ltm_period_config()
    if ltm_start and ltm_end:
        print(f"LTM period: {ltm_start} to {ltm_end}")

    # 6. Aggregate revenue by year (plus LTM if configured).
    print("\nCalculating annual metrics...")

    def revenue_for_year(year_data):
        """Per-year metric: total revenue."""
        return {'Revenue': year_data[REVENUE_COLUMN].sum()}

    annual = calculate_annual_metrics(sales, revenue_for_year, ltm_start, ltm_end)

    # 7. Console summary table.
    print_annual_summary(annual, 'Revenue', 'Revenue')

    # 8. Line chart of annual revenue (plotted in $ millions).
    print("Generating chart...")
    ensure_directories()

    fig, ax = plt.subplots(figsize=CHART_SIZES['medium'])

    # Sort year labels safely even when they mix ints and strings (LTM).
    ordered = sort_mixed_years(annual.reset_index(), 'Year')
    year_labels = ordered['Year'].tolist()
    revenue_m = ordered['Revenue'].values / 1e6

    positions = range(len(year_labels))
    ax.plot(positions, revenue_m, marker='o', linewidth=2, markersize=8, color='#2E86AB')
    ax.set_xticks(positions)
    ax.set_xticklabels(year_labels, rotation=45, ha='right')
    setup_revenue_chart(ax)

    # Append the LTM label to the title when an LTM window is active.
    title = f'Annual Revenue Trend - {COMPANY_NAME}'
    if ltm_start and ltm_end:
        from config import get_ltm_label
        ltm_label = get_ltm_label()
        if ltm_label:
            title += f'\n({ltm_label})'
    ax.set_title(title, fontsize=14, fontweight='bold')

    plt.tight_layout()
    save_chart(fig, 'annual_revenue_trend.png')
    plt.close()

    # 9. Cross-check plotted totals against the source data.
    print("\nValidating revenue...")
    validate_revenue(sales, ANALYSIS_NAME)

    print(f"\n{ANALYSIS_NAME} complete!")
    print(f"Chart saved to: {OUTPUT_DIR}")
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# RUN ANALYSIS
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
# Script entry point.
if __name__ == "__main__":
    main()
|
||||||
218
examples/cohort_analysis.py
Normal file
218
examples/cohort_analysis.py
Normal file
@@ -0,0 +1,218 @@
|
|||||||
|
"""
|
||||||
|
Example: Cohort Analysis
|
||||||
|
Advanced example showing customer cohort retention analysis
|
||||||
|
|
||||||
|
This demonstrates:
|
||||||
|
- Cohort-based analysis
|
||||||
|
- Retention rate calculations
|
||||||
|
- Revenue retention metrics
|
||||||
|
- Advanced visualization
|
||||||
|
"""
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
import seaborn as sns
|
||||||
|
from pathlib import Path
|
||||||
|
from operator import attrgetter
|
||||||
|
|
||||||
|
# Import utilities
|
||||||
|
from data_loader import load_sales_data, validate_data_structure
|
||||||
|
from validate_revenue import validate_revenue
|
||||||
|
from analysis_utils import (
|
||||||
|
get_ltm_period_config, apply_exclusion_filters,
|
||||||
|
setup_revenue_chart, save_chart, format_currency
|
||||||
|
)
|
||||||
|
from config import (
|
||||||
|
OUTPUT_DIR, MAX_DATE, CHART_SIZES, ensure_directories,
|
||||||
|
get_data_path, COMPANY_NAME, REVENUE_COLUMN, CUSTOMER_COLUMN,
|
||||||
|
DATE_COLUMN, MIN_YEAR
|
||||||
|
)
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# CONFIGURATION
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
# Metadata for this example analysis; ANALYSIS_NAME is printed in the
# console banner by main().
ANALYSIS_NAME = "Cohort Analysis"
DESCRIPTION = "Customer cohort retention and revenue analysis"
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# COHORT ANALYSIS FUNCTIONS
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
def create_cohorts(df):
    """
    Create customer cohorts based on first purchase date

    Each customer is assigned to the year-month cohort of their first
    purchase; every transaction is then tagged with the number of months
    elapsed since that cohort month.

    Args:
        df: DataFrame with customer and date columns

    Returns:
        DataFrame: Original DataFrame with 'Cohort' and 'CohortPeriod' columns
    """
    # CUSTOMER_COLUMN / DATE_COLUMN come from the module-level config import;
    # the previous redundant function-local import was removed.

    # First purchase date per customer defines the cohort.
    first_purchase = df.groupby(CUSTOMER_COLUMN)[DATE_COLUMN].min().reset_index()
    first_purchase.columns = [CUSTOMER_COLUMN, 'FirstPurchaseDate']

    # Cohort label is the calendar month of the first purchase.
    first_purchase['Cohort'] = first_purchase['FirstPurchaseDate'].dt.to_period('M')

    # Tag every transaction with its customer's cohort.
    df_with_cohort = df.merge(first_purchase[[CUSTOMER_COLUMN, 'Cohort']], on=CUSTOMER_COLUMN)

    # Months elapsed between the transaction month and the cohort month
    # (.n extracts the integer month offset from the Period difference).
    df_with_cohort['Period'] = df_with_cohort[DATE_COLUMN].dt.to_period('M')
    df_with_cohort['CohortPeriod'] = (df_with_cohort['Period'] - df_with_cohort['Cohort']).apply(attrgetter('n'))

    return df_with_cohort
|
||||||
|
|
||||||
|
def calculate_cohort_metrics(df_with_cohort):
    """
    Calculate cohort retention metrics

    For every cohort, computes per-period unique customer counts and
    revenue, then expresses both as a percentage of the cohort's period-0
    (acquisition month) values.

    Args:
        df_with_cohort: DataFrame with Cohort and CohortPeriod columns
            (as produced by create_cohorts)

    Returns:
        DataFrame: Cohort metrics by period with columns
            ['Cohort', 'Period', 'Customers', 'Revenue',
             'Retention_Rate', 'Revenue_Retention']
    """
    # Aggregate unique customers and revenue per cohort-period.
    # (Uses the module-level config imports; the previous version re-imported
    # them locally and also computed an unused cohort_size series — both
    # removed.)
    cohort_revenue = df_with_cohort.groupby(['Cohort', 'CohortPeriod']).agg({
        CUSTOMER_COLUMN: 'nunique',
        REVENUE_COLUMN: 'sum'
    }).reset_index()
    cohort_revenue.columns = ['Cohort', 'Period', 'Customers', 'Revenue']

    # Normalize each cohort against its own period-0 values.
    cohort_retention = []
    for cohort in cohort_revenue['Cohort'].unique():
        cohort_data = cohort_revenue[cohort_revenue['Cohort'] == cohort].copy()
        # Period 0 always exists: a customer's first purchase lands there.
        base = cohort_data[cohort_data['Period'] == 0]
        initial_customers = base['Customers'].values[0]
        initial_revenue = base['Revenue'].values[0]

        cohort_data['Retention_Rate'] = (cohort_data['Customers'] / initial_customers) * 100
        cohort_data['Revenue_Retention'] = cohort_data['Revenue'] / initial_revenue * 100

        cohort_retention.append(cohort_data)

    return pd.concat(cohort_retention, ignore_index=True)
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# MAIN ANALYSIS FUNCTION
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
def main():
    """Main analysis function"""

    banner = '=' * 60
    print(f"\n{banner}")
    print(f"{ANALYSIS_NAME}")
    print(f"{banner}\n")

    # 1. Load the transaction data.
    print("Loading data...")
    try:
        sales = load_sales_data(get_data_path())
        print(f"Loaded {len(sales):,} transactions")
    except Exception as e:
        print(f"ERROR loading data: {e}")
        return

    # 2. Structural validation, plus the customer column this analysis needs.
    is_valid, msg = validate_data_structure(sales)
    if not is_valid:
        print(f"ERROR: {msg}")
        return

    if CUSTOMER_COLUMN not in sales.columns:
        print(f"ERROR: Customer column '{CUSTOMER_COLUMN}' not found")
        return

    # 3. Exclusions and the configured date window.
    sales = apply_exclusion_filters(sales)
    sales = sales[sales['Year'] >= MIN_YEAR]
    if DATE_COLUMN in sales.columns:
        sales = sales[sales[DATE_COLUMN] <= MAX_DATE]

    # 4. Assign each customer to a first-purchase cohort.
    print("\nCreating customer cohorts...")
    tagged = create_cohorts(sales)

    # 5. Per-cohort retention and revenue metrics.
    print("Calculating cohort metrics...")
    metrics = calculate_cohort_metrics(tagged)

    # 6. Console summary for the earliest five cohorts.
    print("\nCohort Summary:")
    print("-" * 60)
    for cohort in sorted(metrics['Cohort'].unique())[:5]:
        rows = metrics[metrics['Cohort'] == cohort]
        base = rows[rows['Period'] == 0]
        if len(base) > 0:
            initial_customers = base['Customers'].values[0]
            initial_revenue = base['Revenue'].values[0]
            print(f"\n{cohort}:")
            print(f"  Initial: {initial_customers:,} customers, {format_currency(initial_revenue)}")

            # One-year mark, when the cohort is old enough to have one.
            year_one = rows[rows['Period'] == 12]
            if len(year_one) > 0:
                retention = year_one['Retention_Rate'].values[0]
                revenue_ret = year_one['Revenue_Retention'].values[0]
                print(f"  Period 12: {retention:.1f}% customer retention, {revenue_ret:.1f}% revenue retention")

    # 7. Side-by-side retention heatmaps (customers, revenue).
    print("\nGenerating charts...")
    ensure_directories()

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=CHART_SIZES['wide'])

    retention_grid = metrics.pivot_table(
        index='Cohort',
        columns='Period',
        values='Retention_Rate',
        aggfunc='mean'
    )
    sns.heatmap(retention_grid, annot=True, fmt='.0f', cmap='YlOrRd', ax=ax1, cbar_kws={'label': 'Retention %'})
    ax1.set_title('Customer Retention by Cohort\n(Period 0 = 100%)', fontsize=12, fontweight='bold')
    ax1.set_xlabel('Months Since First Purchase')
    ax1.set_ylabel('Cohort')

    revenue_grid = metrics.pivot_table(
        index='Cohort',
        columns='Period',
        values='Revenue_Retention',
        aggfunc='mean'
    )
    sns.heatmap(revenue_grid, annot=True, fmt='.0f', cmap='YlGnBu', ax=ax2, cbar_kws={'label': 'Revenue Retention %'})
    ax2.set_title('Revenue Retention by Cohort\n(Period 0 = 100%)', fontsize=12, fontweight='bold')
    ax2.set_xlabel('Months Since First Purchase')
    ax2.set_ylabel('Cohort')

    plt.suptitle(f'Cohort Analysis - {COMPANY_NAME}', fontsize=14, fontweight='bold', y=1.02)
    plt.tight_layout()
    save_chart(fig, 'cohort_analysis.png')
    plt.close()

    # 8. Cross-check totals against the source data.
    print("\nValidating revenue...")
    validate_revenue(sales, ANALYSIS_NAME)

    print(f"\n{ANALYSIS_NAME} complete!")
    print(f"Charts saved to: {OUTPUT_DIR}")
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# RUN ANALYSIS
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
# Script entry point.
if __name__ == "__main__":
    main()
|
||||||
213
examples/customer_segmentation.py
Normal file
213
examples/customer_segmentation.py
Normal file
@@ -0,0 +1,213 @@
|
|||||||
|
"""
|
||||||
|
Example: Customer Segmentation (RFM) Analysis
|
||||||
|
Example showing customer segmentation using RFM methodology
|
||||||
|
|
||||||
|
This example demonstrates:
|
||||||
|
- Customer-level aggregation
|
||||||
|
- RFM segmentation (Recency, Frequency, Monetary)
|
||||||
|
- Segment analysis and visualization
|
||||||
|
"""
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Import utilities
|
||||||
|
from data_loader import load_sales_data, validate_data_structure
|
||||||
|
from validate_revenue import validate_revenue
|
||||||
|
from analysis_utils import (
|
||||||
|
get_ltm_period_config, apply_exclusion_filters,
|
||||||
|
setup_revenue_chart, save_chart, format_currency
|
||||||
|
)
|
||||||
|
from config import (
|
||||||
|
OUTPUT_DIR, MAX_DATE, CHART_SIZES, ensure_directories,
|
||||||
|
get_data_path, COMPANY_NAME, REVENUE_COLUMN, CUSTOMER_COLUMN,
|
||||||
|
DATE_COLUMN, MIN_YEAR
|
||||||
|
)
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# CONFIGURATION
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
# Metadata for this example analysis; ANALYSIS_NAME appears in the console
# banner printed by main().
ANALYSIS_NAME = "Customer Segmentation (RFM)"
DESCRIPTION = "Customer segmentation using RFM methodology"
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# RFM SEGMENTATION FUNCTIONS
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
def calculate_rfm_scores(df, analysis_date=None):
    """
    Calculate RFM scores for each customer

    Recency, Frequency, and Monetary value are each scored 1-5 via
    quintiles (5 = best), summed into an overall RFM score, and mapped
    to a named segment.

    Args:
        df: DataFrame with customer, date, and revenue columns
        analysis_date: Reference date for recency calculation (defaults to max date)

    Returns:
        DataFrame with RFM scores and segment assignment
    """
    reference_date = df[DATE_COLUMN].max() if analysis_date is None else analysis_date

    # Collapse transactions to one row per customer: last purchase date,
    # purchase count, and total revenue.
    per_customer = df.groupby(CUSTOMER_COLUMN).agg({
        DATE_COLUMN: ['max', 'count'],
        REVENUE_COLUMN: 'sum'
    }).reset_index()
    per_customer.columns = [CUSTOMER_COLUMN, 'LastPurchaseDate', 'Frequency', 'Monetary']

    # Days since the customer's most recent purchase.
    per_customer['Recency'] = (reference_date - per_customer['LastPurchaseDate']).dt.days

    def quintile_score(series, labels):
        # Rank first so qcut sees unique values. NOTE(review): with fewer
        # than five customers, qcut may still drop bins and then mismatch
        # the fixed label list — confirm minimum dataset size upstream.
        return pd.qcut(series.rank(method='first'), q=5, labels=labels, duplicates='drop').astype(int)

    # Low recency is good (recent buyers score 5); high F/M are good.
    per_customer['R_Score'] = quintile_score(per_customer['Recency'], [5, 4, 3, 2, 1])
    per_customer['F_Score'] = quintile_score(per_customer['Frequency'], [1, 2, 3, 4, 5])
    per_customer['M_Score'] = quintile_score(per_customer['Monetary'], [1, 2, 3, 4, 5])

    # Overall score is the simple sum of the three dimensions.
    per_customer['RFM_Score'] = (
        per_customer['R_Score'] + per_customer['F_Score'] + per_customer['M_Score']
    )

    def label_segment(row):
        # Rule order matters: the first matching rule wins.
        r, f, m = row['R_Score'], row['F_Score'], row['M_Score']
        if r >= 4 and f >= 4 and m >= 4:
            return 'Champions'
        if r >= 3 and f >= 3 and m >= 4:
            return 'Loyal Customers'
        if r >= 4 and f <= 2:
            return 'At Risk'
        if r <= 2:
            return 'Hibernating'
        if r >= 3 and f >= 3 and m <= 2:
            return 'Potential Loyalists'
        return 'Need Attention'

    per_customer['Segment'] = per_customer.apply(label_segment, axis=1)

    return per_customer
|
||||||
|
|
||||||
|
# ============================================================================
# MAIN ANALYSIS FUNCTION
# ============================================================================

def main():
    """Run the end-to-end customer segmentation (RFM) analysis.

    Pipeline: load -> validate -> filter -> RFM scoring -> segment summary
    -> charts -> revenue validation. Prints progress to stdout, writes
    charts to OUTPUT_DIR, and returns early (None) on load/validation
    failure.
    """

    print(f"\n{'='*60}")
    print(f"{ANALYSIS_NAME}")
    print(f"{'='*60}\n")

    # 1. Load data
    print("Loading data...")
    try:
        df = load_sales_data(get_data_path())
        print(f"Loaded {len(df):,} transactions")
    except Exception as e:
        print(f"ERROR loading data: {e}")
        return

    # 2. Validate data structure
    is_valid, msg = validate_data_structure(df)
    if not is_valid:
        print(f"ERROR: {msg}")
        return

    # RFM needs a customer identifier; bail out when it is missing
    if CUSTOMER_COLUMN not in df.columns:
        print(f"ERROR: Customer column '{CUSTOMER_COLUMN}' not found in data")
        return

    print("Data validation passed")

    # 3. Apply exclusion filters
    df = apply_exclusion_filters(df)

    # 4. Filter by date range (MIN_YEAR floor, MAX_DATE ceiling)
    df = df[df['Year'] >= MIN_YEAR]
    if DATE_COLUMN in df.columns:
        df = df[df[DATE_COLUMN] <= MAX_DATE]

    # 5. Calculate RFM scores
    print("\nCalculating RFM scores...")
    rfm_df = calculate_rfm_scores(df)

    # 6. Segment summary: customer count and total revenue per segment
    print("\nCustomer Segmentation Summary:")
    print("-" * 60)
    segment_summary = rfm_df.groupby('Segment').agg({
        CUSTOMER_COLUMN: 'count',
        'Monetary': 'sum'
    }).reset_index()
    segment_summary.columns = ['Segment', 'Customer Count', 'Total Revenue']
    segment_summary = segment_summary.sort_values('Total Revenue', ascending=False)

    for _, row in segment_summary.iterrows():
        pct_customers = (row['Customer Count'] / len(rfm_df)) * 100
        pct_revenue = (row['Total Revenue'] / rfm_df['Monetary'].sum()) * 100
        print(f"{row['Segment']:20s}: {row['Customer Count']:5d} customers ({pct_customers:5.1f}%), "
              f"{format_currency(row['Total Revenue'])} ({pct_revenue:5.1f}% of revenue)")

    # 7. Create visualizations
    print("\nGenerating charts...")
    ensure_directories()

    # Chart 1: Revenue by Segment (sorted ascending so the largest bar is on top)
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=CHART_SIZES['wide'])

    segment_summary_sorted = segment_summary.sort_values('Total Revenue', ascending=True)
    revenue_millions = segment_summary_sorted['Total Revenue'].values / 1e6

    ax1.barh(range(len(segment_summary_sorted)), revenue_millions, color='#2E86AB')
    ax1.set_yticks(range(len(segment_summary_sorted)))
    ax1.set_yticklabels(segment_summary_sorted['Segment'].values)
    ax1.set_xlabel('Revenue (Millions USD)')
    ax1.set_title('Revenue by Customer Segment', fontsize=12, fontweight='bold')
    setup_revenue_chart(ax1)
    ax1.set_ylabel('')

    # Chart 2: Customer Count by Segment (same order as Chart 1)
    customer_counts = segment_summary_sorted['Customer Count'].values
    ax2.barh(range(len(segment_summary_sorted)), customer_counts, color='#A23B72')
    ax2.set_yticks(range(len(segment_summary_sorted)))
    ax2.set_yticklabels(segment_summary_sorted['Segment'].values)
    ax2.set_xlabel('Number of Customers')
    ax2.set_title('Customer Count by Segment', fontsize=12, fontweight='bold')
    ax2.set_ylabel('')
    ax2.grid(True, alpha=0.3)

    plt.suptitle(f'Customer Segmentation Analysis - {COMPANY_NAME}',
                 fontsize=14, fontweight='bold', y=1.02)
    plt.tight_layout()
    save_chart(fig, 'customer_segmentation.png')
    plt.close()

    # 8. Validate revenue (sanity check against source totals)
    print("\nValidating revenue...")
    validate_revenue(df, ANALYSIS_NAME)

    print(f"\n{ANALYSIS_NAME} complete!")
    print(f"Charts saved to: {OUTPUT_DIR}")

# ============================================================================
# RUN ANALYSIS
# ============================================================================

if __name__ == "__main__":
    main()
|
||||||
203
examples/product_performance.py
Normal file
203
examples/product_performance.py
Normal file
@@ -0,0 +1,203 @@
|
|||||||
|
"""
|
||||||
|
Example: Product Performance Analysis
|
||||||
|
Example showing product mix and performance analysis
|
||||||
|
|
||||||
|
This example demonstrates:
|
||||||
|
- Product-level aggregation
|
||||||
|
- Product performance metrics
|
||||||
|
- Product mix visualization
|
||||||
|
"""
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Import utilities
|
||||||
|
from data_loader import load_sales_data, validate_data_structure
|
||||||
|
from validate_revenue import validate_revenue
|
||||||
|
from analysis_utils import (
|
||||||
|
get_ltm_period_config, calculate_annual_metrics,
|
||||||
|
apply_exclusion_filters, setup_revenue_chart, save_chart,
|
||||||
|
format_currency, sort_mixed_years
|
||||||
|
)
|
||||||
|
from config import (
|
||||||
|
OUTPUT_DIR, MAX_DATE, CHART_SIZES, ensure_directories,
|
||||||
|
get_data_path, COMPANY_NAME, REVENUE_COLUMN, ITEM_COLUMN,
|
||||||
|
DATE_COLUMN, MIN_YEAR, QUANTITY_COLUMN
|
||||||
|
)
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# CONFIGURATION
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
# Printed in console headers and passed to validate_revenue for labeling
ANALYSIS_NAME = "Product Performance Analysis"
# Short human-readable summary of what this example demonstrates
DESCRIPTION = "Product mix and performance analysis"
|
||||||
|
|
||||||
|
# ============================================================================
# MAIN ANALYSIS FUNCTION
# ============================================================================

def main():
    """Run the end-to-end product performance analysis.

    Pipeline: load -> validate -> filter -> rank products by revenue for the
    most recent period (LTM window when configured, else the latest year)
    -> charts -> revenue validation. Prints progress to stdout, writes
    charts to OUTPUT_DIR, and returns early (None) on load/validation
    failure.
    """
    print(f"\n{'='*60}")
    print(f"{ANALYSIS_NAME}")
    print(f"{'='*60}\n")

    # 1. Load data
    print("Loading data...")
    try:
        df = load_sales_data(get_data_path())
        print(f"Loaded {len(df):,} transactions")
    except Exception as e:
        print(f"ERROR loading data: {e}")
        return

    # 2. Validate data structure
    is_valid, msg = validate_data_structure(df)
    if not is_valid:
        print(f"ERROR: {msg}")
        return

    if ITEM_COLUMN not in df.columns:
        print(f"WARNING: Item column '{ITEM_COLUMN}' not found. Using transaction-level analysis.")
        # Create a dummy item column for demonstration
        df[ITEM_COLUMN] = 'All Products'

    print("Data validation passed")

    # 3. Apply exclusion filters
    df = apply_exclusion_filters(df)

    # 4. Filter by date range (MIN_YEAR floor, MAX_DATE ceiling)
    df = df[df['Year'] >= MIN_YEAR]
    if DATE_COLUMN in df.columns:
        df = df[df[DATE_COLUMN] <= MAX_DATE]

    # 5. Setup LTM period
    ltm_start, ltm_end = get_ltm_period_config()

    # 6. Product performance summary
    print("\nCalculating product performance...")

    # Most recent period: LTM window when configured, otherwise the latest year
    if ltm_start and ltm_end and 'YearMonth' in df.columns:
        recent_data = df[(df['YearMonth'] >= ltm_start) & (df['YearMonth'] <= ltm_end)]
        period_label = f"LTM {ltm_end}"
    else:
        recent_year = df['Year'].max()
        recent_data = df[df['Year'] == recent_year]
        period_label = str(recent_year)

    # Product-level metrics. Build the aggregation spec conditionally:
    # the previous `'sum' if QUANTITY_COLUMN in df.columns else 'count'`
    # still referenced the missing column inside agg() and raised KeyError.
    has_quantity = QUANTITY_COLUMN in df.columns
    agg_spec = {REVENUE_COLUMN: ['sum', 'count']}
    if has_quantity:
        agg_spec[QUANTITY_COLUMN] = 'sum'
    product_metrics = recent_data.groupby(ITEM_COLUMN).agg(agg_spec).reset_index()
    if has_quantity:
        product_metrics.columns = [ITEM_COLUMN, 'Revenue', 'Transaction_Count', 'Quantity']
    else:
        product_metrics.columns = [ITEM_COLUMN, 'Revenue', 'Transaction_Count']
        # Fall back to transaction count as a proxy when quantity is unavailable
        product_metrics['Quantity'] = product_metrics['Transaction_Count']

    # Average price per unit (per transaction when quantity is unavailable);
    # zero quantities become NaN to avoid division by zero
    if has_quantity:
        product_metrics['Avg_Price'] = product_metrics['Revenue'] / product_metrics['Quantity'].replace(0, np.nan)
    else:
        product_metrics['Avg_Price'] = product_metrics['Revenue'] / product_metrics['Transaction_Count']

    # Sort by revenue
    product_metrics = product_metrics.sort_values('Revenue', ascending=False)

    # Top products summary
    print(f"\nTop 10 Products by Revenue ({period_label}):")
    print("-" * 80)
    top_10 = product_metrics.head(10)
    total_revenue = product_metrics['Revenue'].sum()

    for idx, row in top_10.iterrows():
        pct = (row['Revenue'] / total_revenue) * 100
        print(f"{row[ITEM_COLUMN]:30s}: {format_currency(row['Revenue']):>12s} ({pct:5.1f}%)")

    # 7. Annual product trends (only meaningful with multiple years of data)
    if len(df['Year'].unique()) > 1:
        print("\nCalculating annual product trends...")

        def calculate_product_metrics(year_data):
            """Return the top-5 products by revenue for one year's data."""
            product_revenue = year_data.groupby(ITEM_COLUMN)[REVENUE_COLUMN].sum()
            top_5 = product_revenue.nlargest(5)
            return dict(top_5)

        annual_product_df = calculate_annual_metrics(df, calculate_product_metrics, ltm_start, ltm_end)

        # 8. Create visualizations
        print("\nGenerating charts...")
        ensure_directories()

        # Chart 1: Top Products Revenue (horizontal bars, long names truncated)
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=CHART_SIZES['wide'])

        top_10_revenue = top_10['Revenue'].values / 1e6
        top_10_names = top_10[ITEM_COLUMN].values

        ax1.barh(range(len(top_10)), top_10_revenue, color='#2E86AB')
        ax1.set_yticks(range(len(top_10)))
        ax1.set_yticklabels([name[:30] + '...' if len(name) > 30 else name for name in top_10_names])
        ax1.set_xlabel('Revenue (Millions USD)')
        ax1.set_title(f'Top 10 Products by Revenue\n({period_label})', fontsize=12, fontweight='bold')
        setup_revenue_chart(ax1)
        ax1.set_ylabel('')

        # Chart 2: Revenue Distribution pie; everything past the top 10 is 'Other'
        if len(product_metrics) > 10:
            other_revenue = product_metrics.iloc[10:]['Revenue'].sum()
            pie_data = list(top_10['Revenue'].values) + [other_revenue]
            pie_labels = list(top_10[ITEM_COLUMN].values) + ['Other']
        else:
            pie_data = product_metrics['Revenue'].values
            pie_labels = product_metrics[ITEM_COLUMN].values

        pie_data_millions = [x / 1e6 for x in pie_data]
        ax2.pie(pie_data_millions, labels=pie_labels, autopct='%1.1f%%', startangle=90)
        ax2.set_title('Revenue Distribution\n(Top Products)', fontsize=12, fontweight='bold')

        plt.suptitle(f'Product Performance Analysis - {COMPANY_NAME}',
                     fontsize=14, fontweight='bold', y=1.02)
        plt.tight_layout()
        save_chart(fig, 'product_performance.png')
        plt.close()
    else:
        # Single chart when only one year of data is available
        print("\nGenerating chart...")
        ensure_directories()

        fig, ax = plt.subplots(figsize=CHART_SIZES['medium'])

        top_10_revenue = top_10['Revenue'].values / 1e6
        top_10_names = top_10[ITEM_COLUMN].values

        ax.barh(range(len(top_10)), top_10_revenue, color='#2E86AB')
        ax.set_yticks(range(len(top_10)))
        ax.set_yticklabels([name[:40] + '...' if len(name) > 40 else name for name in top_10_names])
        ax.set_xlabel('Revenue (Millions USD)')
        ax.set_title(f'Top 10 Products by Revenue - {COMPANY_NAME}\n({period_label})',
                     fontsize=14, fontweight='bold')
        setup_revenue_chart(ax)
        ax.set_ylabel('')

        plt.tight_layout()
        save_chart(fig, 'product_performance.png')
        plt.close()

    # 9. Validate revenue (sanity check against source totals)
    print("\nValidating revenue...")
    validate_revenue(df, ANALYSIS_NAME)

    print(f"\n{ANALYSIS_NAME} complete!")
    print(f"Charts saved to: {OUTPUT_DIR}")

# ============================================================================
# RUN ANALYSIS
# ============================================================================

if __name__ == "__main__":
    main()
|
||||||
238
export_utils.py
Normal file
238
export_utils.py
Normal file
@@ -0,0 +1,238 @@
|
|||||||
|
"""
|
||||||
|
Export utilities for analysis results
|
||||||
|
Provides functions to export DataFrames and summary data to CSV and Excel
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
from export_utils import export_to_csv, export_to_excel, export_summary_table
|
||||||
|
|
||||||
|
# Export DataFrame to CSV
|
||||||
|
export_to_csv(df, 'results.csv')
|
||||||
|
|
||||||
|
# Export DataFrame to Excel
|
||||||
|
export_to_excel(df, 'results.xlsx', sheet_name='Data')
|
||||||
|
|
||||||
|
# Export summary table
|
||||||
|
export_summary_table({'Metric1': 100, 'Metric2': 200}, 'summary.xlsx')
|
||||||
|
"""
|
||||||
|
import pandas as pd
|
||||||
|
from pathlib import Path
|
||||||
|
from config import REPORTS_DIR, ensure_directories
|
||||||
|
|
||||||
|
def export_to_csv(df, filename, output_dir=None, index=True):
    """Write *df* to a CSV file and return the destination path.

    Args:
        df: DataFrame to export.
        filename: Output filename (e.g., 'results.csv').
        output_dir: Destination directory; falls back to config.REPORTS_DIR.
        index: Include the DataFrame index in the output (default: True).

    Returns:
        Path to the exported file.
    """
    target_dir = REPORTS_DIR if output_dir is None else Path(output_dir)

    ensure_directories()
    target_dir.mkdir(exist_ok=True)

    destination = target_dir / filename

    # utf-8-sig writes a BOM so Excel opens the file with the right encoding
    df.to_csv(destination, index=index, encoding='utf-8-sig')
    print(f"Exported to CSV: {destination}")

    return destination
|
||||||
|
|
||||||
|
def export_to_excel(df, filename, sheet_name='Data', output_dir=None, index=True):
    """
    Export DataFrame to Excel with auto-adjusted column widths.

    Args:
        df: DataFrame to export
        filename: Output filename (e.g., 'results.xlsx')
        sheet_name: Excel sheet name (default: 'Data')
        output_dir: Output directory (defaults to config.REPORTS_DIR)
        index: Whether to include index in export (default: True)

    Returns:
        Path to exported file

    Raises:
        ImportError: If openpyxl is not installed
    """
    try:
        import openpyxl
    except ImportError:
        raise ImportError(
            "openpyxl is required for Excel export. Install with: pip install openpyxl"
        )

    if output_dir is None:
        output_dir = REPORTS_DIR
    else:
        output_dir = Path(output_dir)

    ensure_directories()
    output_dir.mkdir(exist_ok=True)

    filepath = output_dir / filename

    # Create Excel writer
    with pd.ExcelWriter(filepath, engine='openpyxl') as writer:
        df.to_excel(writer, sheet_name=sheet_name, index=index)

        # Auto-adjust column widths. When the index is written it occupies
        # the leading column(s), shifting every data column right -- the
        # previous `chr(64 + idx)` ignored that offset and also produced
        # invalid letters past column 'Z' (> 26 columns).
        worksheet = writer.sheets[sheet_name]
        col_offset = df.index.nlevels if index else 0
        for pos, col in enumerate(df.columns, start=1 + col_offset):
            max_length = max(
                df[col].astype(str).map(len).max(),
                len(str(col))
            )
            # Cap at 50 characters for readability
            adjusted_width = min(max_length + 2, 50)
            col_letter = openpyxl.utils.get_column_letter(pos)
            worksheet.column_dimensions[col_letter].width = adjusted_width

    print(f"Exported to Excel: {filepath}")

    return filepath
|
||||||
|
|
||||||
|
def export_summary_table(data_dict, filename, output_dir=None, title=None):
    """
    Export summary statistics to a formatted table (Excel).

    Args:
        data_dict: Dictionary of {metric_name: value} pairs
        filename: Output filename (e.g., 'summary.xlsx')
        output_dir: Output directory (defaults to config.REPORTS_DIR)
        title: Optional title merged across the top row of the sheet

    Returns:
        Path to exported file

    Raises:
        ImportError: If openpyxl is not installed

    Example:
        export_summary_table({
            'Total Revenue': 1000000,
            'Customer Count': 500,
            'Average Order Value': 2000
        }, 'summary.xlsx')
    """
    try:
        import openpyxl
    except ImportError:
        raise ImportError(
            "openpyxl is required for Excel export. Install with: pip install openpyxl"
        )

    if output_dir is None:
        output_dir = REPORTS_DIR
    else:
        output_dir = Path(output_dir)

    ensure_directories()
    output_dir.mkdir(exist_ok=True)

    filepath = output_dir / filename

    # Two-column frame: metric name + raw value (column order becomes A/B)
    df = pd.DataFrame({
        'Metric': list(data_dict.keys()),
        'Value': list(data_dict.values())
    })

    # Human-readable value column: $x.xxm / $x.xxk / $x.xx.
    # NOTE(review): every numeric value is rendered as currency, including
    # non-monetary metrics such as counts -- confirm this is intended.
    def format_value(val):
        if isinstance(val, (int, float)):
            if abs(val) >= 1e6:
                return f"${val / 1e6:.2f}m"
            elif abs(val) >= 1e3:
                return f"${val / 1e3:.2f}k"
            else:
                return f"${val:.2f}"
        return str(val)

    df['Formatted_Value'] = df['Value'].apply(format_value)

    # Create Excel writer and lightly format the workbook
    with pd.ExcelWriter(filepath, engine='openpyxl') as writer:
        df.to_excel(writer, sheet_name='Summary', index=False)

        # Format worksheet
        worksheet = writer.sheets['Summary']

        # Fixed widths for the three columns: Metric / Value / Formatted_Value
        worksheet.column_dimensions['A'].width = 30
        worksheet.column_dimensions['B'].width = 20
        worksheet.column_dimensions['C'].width = 20

        # Optional bold, centered title merged across the (inserted) top row
        if title:
            worksheet.insert_rows(1)
            worksheet.merge_cells('A1:C1')
            worksheet['A1'] = title
            worksheet['A1'].font = openpyxl.styles.Font(bold=True, size=14)
            worksheet['A1'].alignment = openpyxl.styles.Alignment(horizontal='center')

    print(f"Exported summary table to Excel: {filepath}")

    return filepath
|
||||||
|
|
||||||
|
def export_multiple_sheets(data_dict, filename, output_dir=None):
    """Write several DataFrames into one Excel workbook, one sheet each.

    Args:
        data_dict: Dictionary of {sheet_name: DataFrame} pairs.
        filename: Output filename (e.g., 'results.xlsx').
        output_dir: Destination directory; falls back to config.REPORTS_DIR.

    Returns:
        Path to the exported workbook.

    Raises:
        ImportError: If openpyxl is not installed.

    Example:
        export_multiple_sheets({
            'Revenue': revenue_df,
            'Customers': customer_df,
            'Products': product_df
        }, 'analysis_results.xlsx')
    """
    try:
        import openpyxl
    except ImportError:
        raise ImportError(
            "openpyxl is required for Excel export. Install with: pip install openpyxl"
        )

    target_dir = REPORTS_DIR if output_dir is None else Path(output_dir)

    ensure_directories()
    target_dir.mkdir(exist_ok=True)

    destination = target_dir / filename

    with pd.ExcelWriter(destination, engine='openpyxl') as writer:
        for raw_name, frame in data_dict.items():
            # Excel caps sheet names at 31 characters
            sheet = raw_name[:31]
            frame.to_excel(writer, sheet_name=sheet, index=True)

            # Size each column to its widest cell or header, capped at 50
            ws = writer.sheets[sheet]
            for position, column in enumerate(frame.columns, 1):
                widest_cell = frame[column].astype(str).map(len).max()
                header_width = len(str(column))
                width = min(max(widest_cell, header_width) + 2, 50)
                letter = openpyxl.utils.get_column_letter(position)
                ws.column_dimensions[letter].width = width

    print(f"Exported {len(data_dict)} sheets to Excel: {destination}")

    return destination
|
||||||
184
generate_sample_data.py
Normal file
184
generate_sample_data.py
Normal file
@@ -0,0 +1,184 @@
|
|||||||
|
"""
|
||||||
|
Sample data generator for testing and demonstrations
|
||||||
|
Generates realistic sample sales data
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python generate_sample_data.py
|
||||||
|
|
||||||
|
# Or import and use programmatically:
|
||||||
|
from generate_sample_data import generate_sample_sales_data
|
||||||
|
df = generate_sample_sales_data(num_customers=100, num_products=50, years=[2021, 2022, 2023])
|
||||||
|
"""
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
from pathlib import Path
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
import random
|
||||||
|
|
||||||
|
def generate_sample_sales_data(
    num_customers=100,
    num_products=50,
    years=(2021, 2022, 2023, 2024, 2025),
    transactions_per_month=500,
    output_file='sample_sales_data.csv'
):
    """
    Generate realistic sample sales data and write it to CSV.

    Args:
        num_customers: Number of unique customers
        num_products: Number of unique products
        years: Iterable of years to generate data for (a tuple default
            replaces the previous mutable list default)
        transactions_per_month: Average transactions per month
        output_file: Output CSV filename

    Returns:
        DataFrame: Generated sales data (also written to *output_file*)
    """
    print(f"Generating sample sales data...")
    print(f"  Customers: {num_customers}")
    print(f"  Products: {num_products}")
    print(f"  Years: {years}")

    # Generate customer and product names
    customer_names = [f"Customer_{i:04d}" for i in range(1, num_customers + 1)]
    product_names = [f"Product_{i:04d}" for i in range(1, num_products + 1)]

    # Generate transactions
    transactions = []

    # Hoisted out of the loops: one consistent "now" cutoff for all months
    current_date = datetime.now()

    for year in years:
        for month in range(1, 13):
            # Skip future months
            if year > current_date.year or (year == current_date.year and month > current_date.month):
                continue

            # Transactions this month: normally distributed around the mean
            num_transactions = int(np.random.normal(transactions_per_month, transactions_per_month * 0.2))
            num_transactions = max(10, num_transactions)  # At least 10 transactions

            # Days in the month (Feb capped at 28; leap years deliberately ignored)
            if month == 2:
                max_day = 28
            elif month in [4, 6, 9, 11]:
                max_day = 30
            else:
                max_day = 31

            for _ in range(num_transactions):
                # Random date within month
                day = random.randint(1, max_day)
                invoice_date = datetime(year, month, day)

                # Random customer and product
                customer = random.choice(customer_names)
                product = random.choice(product_names)

                # Quantity: log-normal, so most transactions are small
                quantity = int(np.random.lognormal(mean=1.5, sigma=1.0))
                quantity = max(1, min(quantity, 100))  # clamp to [1, 100]

                # Revenue correlates with quantity via a random unit price
                base_price = np.random.lognormal(mean=5, sigma=1.5)
                revenue = base_price * quantity

                # Add some variation
                revenue *= np.random.uniform(0.8, 1.2)
                revenue = round(revenue, 2)

                transactions.append({
                    'InvoiceDate': invoice_date,
                    'Customer': customer,
                    'Item': product,
                    'Quantity': quantity,
                    'USD': revenue,
                    'Year': year,
                    'Month': month
                })

    # Create DataFrame, oldest transactions first
    df = pd.DataFrame(transactions)
    df = df.sort_values('InvoiceDate').reset_index(drop=True)

    # Blank out ~5% of dates to mimic a realistic data-quality issue
    missing_date_pct = 0.05
    num_missing = int(len(df) * missing_date_pct)
    missing_indices = np.random.choice(df.index, size=num_missing, replace=False)
    df.loc[missing_indices, 'InvoiceDate'] = pd.NaT

    # Save to CSV
    output_path = Path(output_file)
    df.to_csv(output_path, index=False)
    print(f"\n✅ Sample data generated: {output_path}")
    print(f"   Rows: {len(df):,}")
    print(f"   Date range: {df['InvoiceDate'].min()} to {df['InvoiceDate'].max()}")
    print(f"   Total revenue: ${df['USD'].sum() / 1e6:.2f}m")

    return df
|
||||||
|
|
||||||
|
def generate_sample_data_for_template():
    """
    Generate sample data whose columns match the template's expected
    structure, renaming columns to the names declared in config.py.
    """
    from config import (
        REVENUE_COLUMN, DATE_COLUMN, CUSTOMER_COLUMN, ITEM_COLUMN,
        QUANTITY_COLUMN, ANALYSIS_YEARS
    )

    print("Generating sample data for template...")

    frame = generate_sample_sales_data(
        num_customers=200,
        num_products=100,
        years=ANALYSIS_YEARS,
        transactions_per_month=1000,
        output_file='sample_sales_data.csv'
    )

    # Map generator column names onto the configured names (no-op when equal)
    column_mapping = {
        'USD': REVENUE_COLUMN,
        'InvoiceDate': DATE_COLUMN,
        'Customer': CUSTOMER_COLUMN,
        'Item': ITEM_COLUMN,
        'Quantity': QUANTITY_COLUMN
    }
    renames = {old: new for old, new in column_mapping.items()
               if old in frame.columns and old != new}
    frame = frame.rename(columns=renames)

    # Save with the final column names
    destination = Path('sample_sales_data.csv')
    frame.to_csv(destination, index=False)

    print(f"\n✅ Sample data saved to: {destination}")
    print(f"   Ready to use with sales_analysis_template")

    return frame
|
||||||
|
|
||||||
|
# ============================================================================
# MAIN
# ============================================================================

if __name__ == "__main__":
    # Generate sample data from the command line.
    # Usage: python generate_sample_data.py [num_customers] [num_products]
    # With no arguments, generates data matching the template's config.
    import sys

    if len(sys.argv) > 1:
        # Custom generation: argv[1] = customers, optional argv[2] = products.
        # (The previous `if len(sys.argv) > 1 else 100` guard inside this
        # branch was dead code -- the condition is already known true here.)
        num_customers = int(sys.argv[1])
        num_products = int(sys.argv[2]) if len(sys.argv) > 2 else 50
        generate_sample_sales_data(
            num_customers=num_customers,
            num_products=num_products
        )
    else:
        # Generate for template
        generate_sample_data_for_template()
|
||||||
197
logger_config.py
Normal file
197
logger_config.py
Normal file
@@ -0,0 +1,197 @@
|
|||||||
|
"""
|
||||||
|
Logging configuration for analysis scripts
|
||||||
|
Provides structured logging with file and console output
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
from logger_config import get_logger
|
||||||
|
|
||||||
|
logger = get_logger('my_analysis')
|
||||||
|
logger.info("Analysis started")
|
||||||
|
logger.warning("Low data quality detected")
|
||||||
|
logger.error("Failed to load data")
|
||||||
|
"""
|
||||||
|
import logging
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
from datetime import datetime
|
||||||
|
from config import COMPANY_NAME, OUTPUT_DIR
|
||||||
|
|
||||||
|
# Module-wide singleton logger, lazily created by get_logger()/setup_logging()
_logger = None
|
||||||
|
|
||||||
|
def setup_logging(log_level=logging.INFO, log_file=None, analysis_name=None):
    """
    Setup logging with a detailed file handler and a simpler console handler.

    Args:
        log_level: Logging level (DEBUG, INFO, WARNING, ERROR)
        log_file: Path to log file (defaults to logs/analysis_YYYYMMDD_HHMMSS.log)
        analysis_name: Name of analysis, used for the logger name and log file

    Returns:
        logging.Logger: Configured logger instance (also stored in the
        module-level _logger singleton)
    """
    global _logger

    # Create logs directory
    logs_dir = Path('logs')
    logs_dir.mkdir(exist_ok=True)

    # Default log file name: <analysis>_<timestamp>.log
    if log_file is None:
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        if analysis_name:
            safe_name = analysis_name.lower().replace(' ', '_').replace('/', '_')
            log_file = logs_dir / f"{safe_name}_{timestamp}.log"
        else:
            log_file = logs_dir / f"analysis_{timestamp}.log"
    else:
        log_file = Path(log_file)
        log_file.parent.mkdir(parents=True, exist_ok=True)

    # Create logger
    logger = logging.getLogger(analysis_name or 'analysis')
    logger.setLevel(log_level)

    # Close and remove existing handlers: re-assigning `handlers = []`
    # (the previous approach) drops them without closing, leaking open
    # log-file descriptors when setup is called more than once.
    for handler in list(logger.handlers):
        logger.removeHandler(handler)
        handler.close()

    # Don't bubble records up to the root logger (prevents duplicate output
    # when the root logger also has handlers configured)
    logger.propagate = False

    # Formatters: verbose for the file, terse for the console
    detailed_formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S'
    )
    console_formatter = logging.Formatter(
        '%(levelname)s - %(message)s'
    )

    # File handler (detailed)
    file_handler = logging.FileHandler(log_file, encoding='utf-8')
    file_handler.setLevel(log_level)
    file_handler.setFormatter(detailed_formatter)
    logger.addHandler(file_handler)

    # Console handler (simpler)
    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setLevel(log_level)
    console_handler.setFormatter(console_formatter)
    logger.addHandler(console_handler)

    # Log startup banner
    logger.info("=" * 60)
    logger.info(f"Analysis: {analysis_name or 'Unknown'}")
    logger.info(f"Company: {COMPANY_NAME}")
    logger.info(f"Log File: {log_file}")
    logger.info("=" * 60)

    _logger = logger
    return logger
|
||||||
|
|
||||||
|
def get_logger(analysis_name=None, log_level=logging.INFO):
    """
    Return the shared module-level logger, creating it on first use.

    Args:
        analysis_name: Name of analysis (only used when the logger is first created)
        log_level: Logging level (default: INFO; only used on first creation)

    Returns:
        logging.Logger: The cached logger instance
    """
    global _logger

    # Reuse the already-configured logger if one exists.
    if _logger is not None:
        return _logger

    _logger = setup_logging(log_level=log_level, analysis_name=analysis_name)
    return _logger
|
||||||
|
|
||||||
|
def log_analysis_start(analysis_name, logger=None):
    """
    Emit the standard "analysis started" messages.

    Args:
        analysis_name: Name of analysis
        logger: Logger instance (a shared one is created if None)
    """
    active = logger if logger is not None else get_logger(analysis_name)

    active.info(f"Starting analysis: {analysis_name}")
    active.info(f"Timestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
|
||||||
|
|
||||||
|
def log_analysis_end(analysis_name, success=True, logger=None):
    """
    Emit the standard "analysis finished" messages.

    Args:
        analysis_name: Name of analysis
        success: Whether the analysis completed successfully (failure logs at ERROR)
        logger: Logger instance (a shared one is created if None)
    """
    log = logger if logger is not None else get_logger(analysis_name)

    if success:
        log.info(f"Analysis completed successfully: {analysis_name}")
    else:
        log.error(f"Analysis failed: {analysis_name}")

    log.info(f"Timestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    log.info("="*60)
|
||||||
|
|
||||||
|
def log_data_loading(df, logger=None):
    """
    Log a one-look summary of a freshly loaded DataFrame.

    Logs row/column counts, plus total revenue and date coverage when the
    configured revenue/date columns are present.

    Args:
        df: Loaded DataFrame
        logger: Logger instance (a shared one is created if None)
    """
    if logger is None:
        logger = get_logger()

    logger.info(f"Data loaded: {len(df):,} rows, {len(df.columns)} columns")

    # Imported lazily so this module does not hard-depend on config at import time.
    from config import REVENUE_COLUMN, DATE_COLUMN

    if REVENUE_COLUMN in df.columns:
        logger.info(f"Total revenue: ${df[REVENUE_COLUMN].sum() / 1e6:.2f}m")

    if DATE_COLUMN in df.columns:
        coverage_pct = df[DATE_COLUMN].notna().sum() / len(df) * 100
        logger.info(f"Date coverage: {coverage_pct:.1f}%")
|
||||||
|
|
||||||
|
def log_error(error, logger=None, context=None):
    """
    Log an error (with traceback via exc_info) and optional context prefix.

    Args:
        error: Exception or error message
        logger: Logger instance (a shared one is created if None)
        context: Additional context string prepended as "context: message"
    """
    if logger is None:
        logger = get_logger()

    message = f"{context}: {error}" if context else str(error)
    logger.error(message, exc_info=True)
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# EXAMPLE USAGE
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    """Example usage"""
    logger = setup_logging(log_level=logging.DEBUG, analysis_name="Example Analysis")

    # Exercise every log level once.
    logger.debug("This is a debug message")
    logger.info("This is an info message")
    logger.warning("This is a warning message")
    logger.error("This is an error message")

    log_analysis_start("Example Analysis", logger)
    # BUGFIX: `log_analysis_end("Example Analysis", success=True, logger)` was a
    # SyntaxError (positional argument after keyword argument); the logger must
    # be passed by keyword.
    log_analysis_end("Example Analysis", success=True, logger=logger)
|
||||||
228
report_generator.py
Normal file
228
report_generator.py
Normal file
@@ -0,0 +1,228 @@
|
|||||||
|
"""
|
||||||
|
Report generation utility
|
||||||
|
Combines multiple charts and data into a PDF report
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
from report_generator import generate_pdf_report
|
||||||
|
|
||||||
|
# Generate PDF report
|
||||||
|
generate_pdf_report(
|
||||||
|
charts=['chart1.png', 'chart2.png'],
|
||||||
|
title='Sales Analysis Report',
|
||||||
|
summary_data={'Total Revenue': 1000000}
|
||||||
|
)
|
||||||
|
"""
|
||||||
|
from pathlib import Path
|
||||||
|
from datetime import datetime
|
||||||
|
from config import COMPANY_NAME, OUTPUT_DIR, REPORTS_DIR, ensure_directories
|
||||||
|
|
||||||
|
def generate_pdf_report(
    charts,
    title=None,
    summary_data=None,
    output_filename=None,
    output_dir=None
):
    """
    Generate PDF report from charts and summary data.

    Args:
        charts: List of chart file paths (PNG files); missing files are skipped
            with a warning
        title: Report title (defaults to "<COMPANY_NAME> Sales Analysis Report")
        summary_data: Dictionary of summary metrics; numeric values are rendered
            with $m / $k scaling, everything else via str()
        output_filename: Output PDF filename (defaults to report_YYYYMMDD_HHMMSS.pdf)
        output_dir: Output directory (defaults to config.REPORTS_DIR)

    Returns:
        Path: Path to generated PDF file

    Raises:
        ImportError: If reportlab is not installed
    """
    # reportlab is an optional dependency; import lazily so the module itself
    # can be imported without it.
    try:
        from reportlab.lib.pagesizes import letter, A4
        from reportlab.lib.units import inch
        from reportlab.lib import colors
        from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Image, Table, TableStyle, PageBreak
        from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
        from reportlab.lib.enums import TA_CENTER, TA_LEFT
    except ImportError:
        raise ImportError(
            "reportlab is required for PDF generation. Install with: pip install reportlab"
        )

    if output_dir is None:
        output_dir = REPORTS_DIR
    else:
        output_dir = Path(output_dir)

    ensure_directories()
    # BUGFIX: also create missing parent directories — a nested custom
    # output_dir previously raised FileNotFoundError (matches the
    # mkdir(parents=True, exist_ok=True) convention used elsewhere in this repo).
    output_dir.mkdir(parents=True, exist_ok=True)

    # Default filename
    if output_filename is None:
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        output_filename = f"report_{timestamp}.pdf"

    output_path = output_dir / output_filename

    # Create PDF document
    doc = SimpleDocTemplate(
        str(output_path),
        pagesize=letter,
        rightMargin=0.75*inch,
        leftMargin=0.75*inch,
        topMargin=0.75*inch,
        bottomMargin=0.75*inch
    )

    # Container for PDF elements
    story = []

    # Styles
    styles = getSampleStyleSheet()
    title_style = ParagraphStyle(
        'CustomTitle',
        parent=styles['Heading1'],
        fontSize=20,
        textColor=colors.HexColor('#2E86AB'),
        spaceAfter=30,
        alignment=TA_CENTER
    )

    heading_style = ParagraphStyle(
        'CustomHeading',
        parent=styles['Heading2'],
        fontSize=14,
        textColor=colors.HexColor('#2E86AB'),
        spaceAfter=12
    )

    # Title
    if title is None:
        title = f"{COMPANY_NAME} Sales Analysis Report"

    story.append(Paragraph(title, title_style))
    story.append(Spacer(1, 0.2*inch))

    # Report metadata
    metadata_text = f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
    story.append(Paragraph(metadata_text, styles['Normal']))
    story.append(Spacer(1, 0.3*inch))

    # Summary data table
    if summary_data:
        story.append(Paragraph("Summary", heading_style))

        # Create table: header row + one row per metric.
        table_data = [['Metric', 'Value']]
        for key, value in summary_data.items():
            # Format numeric values with dollar scaling; leave others as-is.
            if isinstance(value, (int, float)):
                if abs(value) >= 1e6:
                    formatted_value = f"${value / 1e6:.2f}m"
                elif abs(value) >= 1e3:
                    formatted_value = f"${value / 1e3:.2f}k"
                else:
                    formatted_value = f"${value:.2f}"
            else:
                formatted_value = str(value)

            table_data.append([key, formatted_value])

        table = Table(table_data, colWidths=[3*inch, 2*inch])
        table.setStyle(TableStyle([
            ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#2E86AB')),
            ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
            ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
            ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
            ('FONTSIZE', (0, 0), (-1, 0), 12),
            ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
            ('BACKGROUND', (0, 1), (-1, -1), colors.beige),
            ('GRID', (0, 0), (-1, -1), 1, colors.black),
            ('ROWBACKGROUNDS', (0, 1), (-1, -1), [colors.white, colors.lightgrey])
        ]))

        story.append(table)
        story.append(Spacer(1, 0.3*inch))

    # Add charts
    if charts:
        story.append(Paragraph("Charts", heading_style))

        for i, chart_path in enumerate(charts, 1):
            chart_path = Path(chart_path)

            # Skip (but report) charts that do not exist on disk.
            if not chart_path.exists():
                print(f"Warning: Chart not found: {chart_path}")
                continue

            # Add chart title derived from the file name.
            chart_title = f"Chart {i}: {chart_path.stem.replace('_', ' ').title()}"
            story.append(Paragraph(chart_title, styles['Heading3']))
            story.append(Spacer(1, 0.1*inch))

            # Add image (fixed 6x4 inch box; unreadable images become an
            # inline error paragraph instead of aborting the report).
            try:
                img = Image(str(chart_path), width=6*inch, height=4*inch)
                story.append(img)
            except Exception as e:
                error_msg = f"Error loading chart: {e}"
                story.append(Paragraph(error_msg, styles['Normal']))

            # Add page break between charts (except last one)
            if i < len(charts):
                story.append(PageBreak())

    # Build PDF
    doc.build(story)

    print(f"PDF report generated: {output_path}")

    return output_path
|
||||||
|
|
||||||
|
def generate_simple_report(charts, title=None, output_filename=None):
    """
    Convenience wrapper around generate_pdf_report with default settings
    (no summary table, default output directory).

    Args:
        charts: List of chart file paths
        title: Report title
        output_filename: Output filename

    Returns:
        Path: Path to generated PDF
    """
    report_path = generate_pdf_report(
        charts=charts,
        title=title,
        output_filename=output_filename,
    )
    return report_path
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# EXAMPLE USAGE
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Example usage: bundle any PNG charts found in OUTPUT_DIR into one PDF.
    from config import OUTPUT_DIR

    chart_files = list(OUTPUT_DIR.glob('*.png'))

    if not chart_files:
        print("No charts found in output directory")
    else:
        print(f"Found {len(chart_files)} charts")

        # Generate report (cap at the first five charts).
        report_path = generate_pdf_report(
            charts=[str(f) for f in chart_files[:5]],  # Limit to 5 charts
            title="Sales Analysis Report",
            summary_data={
                'Total Charts': len(chart_files),
                'Report Date': datetime.now().strftime('%Y-%m-%d')
            }
        )

        print(f"Report saved to: {report_path}")
|
||||||
30
requirements.txt
Normal file
30
requirements.txt
Normal file
@@ -0,0 +1,30 @@
|
|||||||
|
# Python dependencies for Sales Analysis Template
|
||||||
|
# Install with: pip install -r requirements.txt
|
||||||
|
|
||||||
|
# Core data analysis
|
||||||
|
pandas>=2.0.0
|
||||||
|
numpy>=1.24.0
|
||||||
|
|
||||||
|
# Visualization
|
||||||
|
matplotlib>=3.7.0
|
||||||
|
seaborn>=0.12.0
|
||||||
|
|
||||||
|
# Export utilities (optional - uncomment if needed)
|
||||||
|
# openpyxl>=3.1.0 # For Excel export (export_utils.py)
|
||||||
|
|
||||||
|
# Interactive visualizations (optional - uncomment if needed)
|
||||||
|
# plotly>=5.17.0 # For interactive charts (analysis_utils.py)
|
||||||
|
|
||||||
|
# Report generation (optional - uncomment if needed)
|
||||||
|
# reportlab>=4.0.0 # For PDF reports (report_generator.py)
|
||||||
|
|
||||||
|
# Statistical analysis (optional - uncomment if needed)
|
||||||
|
# scipy>=1.10.0 # For statistical analysis, product lifecycle (statistical_utils.py)
|
||||||
|
|
||||||
|
# Testing (optional - uncomment if needed)
|
||||||
|
# pytest>=7.4.0 # For unit tests
|
||||||
|
|
||||||
|
# Advanced analysis (optional - uncomment if needed)
|
||||||
|
# pmdarima>=2.0.0 # For time series forecasting
|
||||||
|
# mlxtend>=0.22.0 # For market basket analysis
|
||||||
|
# scikit-learn>=1.3.0 # For machine learning analyses
|
||||||
185
run_all_analyses.py
Normal file
185
run_all_analyses.py
Normal file
@@ -0,0 +1,185 @@
|
|||||||
|
"""
|
||||||
|
Batch runner for all analysis scripts
|
||||||
|
Runs all analyses in sequence and generates a summary report
|
||||||
|
|
||||||
|
To use:
|
||||||
|
1. Add your analysis scripts to the ANALYSIS_SCRIPTS list below
|
||||||
|
2. Run: python run_all_analyses.py
|
||||||
|
"""
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
from datetime import datetime
|
||||||
|
import time
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# CONFIGURATION
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
# List of analysis scripts to run
|
||||||
|
# TODO: Add your analysis scripts here
|
||||||
|
ANALYSIS_SCRIPTS = [
|
||||||
|
# Example structure - customize for your analyses:
|
||||||
|
# 'check_annual_revenue.py',
|
||||||
|
# 'revenue_analysis.py',
|
||||||
|
# 'geographic_analysis.py',
|
||||||
|
# 'customer_segmentation.py',
|
||||||
|
# 'product_analysis.py',
|
||||||
|
# Add your analysis scripts here...
|
||||||
|
]
|
||||||
|
|
||||||
|
# Timeout per script (in seconds)
|
||||||
|
SCRIPT_TIMEOUT = 600 # 10 minutes
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# HELPER FUNCTIONS
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
def run_script(script_path):
    """
    Run a single analysis script in a subprocess.

    Prints a banner, the outcome, and (on success) up to the last ten lines
    of the script's stdout.

    Args:
        script_path: Path to the script to execute

    Returns:
        tuple: (success: bool, elapsed_seconds: float, error: str | None)
    """
    script_name = Path(script_path).name
    print(f"\n{'='*60}")
    print(f"Running: {script_name}")
    print(f"{'='*60}")

    started = time.time()

    try:
        proc = subprocess.run(
            [sys.executable, script_path],
            capture_output=True,
            text=True,
            timeout=SCRIPT_TIMEOUT
        )
    except subprocess.TimeoutExpired:
        elapsed = time.time() - started
        print(f"⏱️ {script_name} timed out after {elapsed:.1f}s")
        return False, elapsed, "Timeout"
    except Exception as e:
        elapsed = time.time() - started
        print(f"❌ {script_name} error: {str(e)}")
        return False, elapsed, str(e)

    elapsed = time.time() - started

    if proc.returncode != 0:
        print(f"❌ {script_name} failed ({elapsed:.1f}s)")
        if proc.stderr:
            print(f" Error: {proc.stderr[:500]}")
        return False, elapsed, proc.stderr

    print(f"✅ {script_name} completed successfully ({elapsed:.1f}s)")
    if proc.stdout:
        # Show only the tail of long output.
        lines = proc.stdout.strip().split('\n')
        if len(lines) > 10:
            print(" ... (output truncated)")
            lines = lines[-10:]
        for line in lines:
            print(f" {line}")
    return True, elapsed, None
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# MAIN FUNCTION
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
def main():
    """
    Run every script in ANALYSIS_SCRIPTS, print a console summary, and write
    analysis_run_summary.txt with the results.

    Scripts listed but missing on disk are reported and skipped; the run
    aborts early if none of the configured scripts exist.
    """
    from config import COMPANY_NAME

    print(f"\n{'='*60}")
    print(f"{COMPANY_NAME} Sales Analysis - Batch Runner")
    print(f"Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"{'='*60}\n")

    # Check which scripts exist
    existing_scripts = []
    missing_scripts = []

    for script in ANALYSIS_SCRIPTS:
        script_path = Path(script)
        if script_path.exists():
            existing_scripts.append(script)
        else:
            missing_scripts.append(script)

    if missing_scripts:
        print(f"⚠️ Warning: {len(missing_scripts)} scripts not found:")
        for script in missing_scripts:
            print(f" - {script}")
        print()

    if not existing_scripts:
        print("❌ No analysis scripts found!")
        print(" Please add analysis scripts to ANALYSIS_SCRIPTS list in run_all_analyses.py")
        return

    print(f"Found {len(existing_scripts)} analysis scripts to run\n")

    # Run scripts
    results = []
    total_start = time.time()

    for script in existing_scripts:
        success, elapsed, error = run_script(script)
        results.append({
            'script': script,
            'success': success,
            'elapsed': elapsed,
            'error': error
        })

    total_elapsed = time.time() - total_start

    # Print summary
    print(f"\n{'='*60}")
    print("Batch Run Summary")
    print(f"{'='*60}\n")

    successful = [r for r in results if r['success']]
    failed = [r for r in results if not r['success']]

    print(f"Total scripts: {len(results)}")
    print(f"✅ Successful: {len(successful)}")
    print(f"❌ Failed: {len(failed)}")
    print(f"⏱️ Total time: {total_elapsed/60:.1f} minutes\n")

    if failed:
        print("Failed scripts:")
        for r in failed:
            print(f" ❌ {r['script']} ({r['elapsed']:.1f}s)")
            if r['error']:
                print(f" Error: {r['error'][:100]}")
        print()

    # Save summary to file
    summary_file = Path('analysis_run_summary.txt')
    # BUGFIX: force UTF-8 — the status markers (✅/❌) below raised
    # UnicodeEncodeError on platforms whose default encoding is not UTF-8
    # (e.g. Windows cp1252); the rest of this repo already opens files with
    # encoding='utf-8'.
    with open(summary_file, 'w', encoding='utf-8') as f:
        f.write(f"{COMPANY_NAME} Sales Analysis - Batch Run Summary\n")
        f.write(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
        f.write(f"{'='*60}\n\n")
        f.write(f"Total scripts: {len(results)}\n")
        f.write(f"Successful: {len(successful)}\n")
        f.write(f"Failed: {len(failed)}\n")
        f.write(f"Total time: {total_elapsed/60:.1f} minutes\n\n")

        if successful:
            f.write("Successful scripts:\n")
            for r in successful:
                f.write(f" ✅ {r['script']} ({r['elapsed']:.1f}s)\n")
            f.write("\n")

        if failed:
            f.write("Failed scripts:\n")
            for r in failed:
                f.write(f" ❌ {r['script']} ({r['elapsed']:.1f}s)\n")
                if r['error']:
                    f.write(f" Error: {r['error']}\n")

    print(f"Summary saved to: {summary_file}")


if __name__ == "__main__":
    main()
|
||||||
240
setup_wizard.py
Normal file
240
setup_wizard.py
Normal file
@@ -0,0 +1,240 @@
|
|||||||
|
"""
|
||||||
|
Interactive setup wizard for configuring the sales analysis template
|
||||||
|
Asks clarifying questions to configure config.py for your specific company and data
|
||||||
|
"""
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
def print_header(text):
    """Print *text* framed above and below by 70-character '=' rules."""
    bar = "=" * 70
    print("\n" + bar)
    print(f" {text}")
    print(bar + "\n")
|
||||||
|
|
||||||
|
def ask_question(prompt, default=None, validator=None):
    """
    Ask a question on stdin and return the answer.

    Args:
        prompt: Question to ask
        default: Default value if user just presses Enter
        validator: Optional function to validate/convert input; re-prompts
            until it succeeds

    Returns:
        The validator's return value when a validator is given, otherwise the
        raw answer (or default) string.
    """
    if default:
        full_prompt = f"{prompt} [{default}]: "
    else:
        full_prompt = f"{prompt}: "

    while True:
        answer = input(full_prompt).strip()
        if not answer:
            if default:
                # BUGFIX: the default used to be returned WITHOUT running the
                # validator, so e.g. default="no" with validate_yes_no came
                # back as the truthy string "no" instead of False, and numeric
                # defaults stayed strings. Route the default through the same
                # validation path as typed input.
                answer = default
            else:
                print(" Please provide an answer.")
                continue

        if validator:
            try:
                return validator(answer)
            except Exception as e:
                print(f" Invalid input: {e}")
                continue

        return answer
|
||||||
|
|
||||||
|
def validate_yes_no(answer):
    """Interpret *answer* as a boolean; raise ValueError if unrecognized."""
    normalized = answer.lower()
    if normalized in ('y', 'yes', 'true', '1'):
        return True
    if normalized in ('n', 'no', 'false', '0'):
        return False
    raise ValueError("Please answer 'yes' or 'no'")
|
||||||
|
|
||||||
|
def validate_int(answer):
    """Parse *answer* as a base-10 integer (int() raises ValueError otherwise)."""
    value = int(answer)
    return value
|
||||||
|
|
||||||
|
def validate_file_exists(answer):
    """Return *answer* unchanged if it names an existing path, else raise ValueError."""
    if Path(answer).exists():
        return answer
    raise ValueError(f"File not found: {answer}")
|
||||||
|
|
||||||
|
def main():
    """
    Run the interactive setup wizard.

    Collects company, data-file, column-mapping, date-range, LTM and
    exclusion-filter settings via stdin prompts, then rewrites config.py in
    place by substituting the template's known default literals with the
    collected values. If config.py is missing, the wizard aborts.

    NOTE(review): the replacement step only matches the exact default strings
    shipped in the template's config.py — a previously customized config.py
    will be left partially unchanged.
    """
    print_header("Sales Analysis Template - Setup Wizard")
    print("This wizard will help you configure the template for your company's data.")
    print("You can press Enter to accept defaults (shown in brackets).\n")

    # All collected answers, keyed by setting name.
    responses = {}

    # Company Information
    print_header("Company Information")
    responses['company_name'] = ask_question("Company Name", default="Your Company Name")
    responses['analysis_date'] = ask_question("Analysis Date (YYYY-MM-DD)", default="2026-01-12")

    # Data File
    print_header("Data File Configuration")
    print("Where is your sales data CSV file located?")
    data_file = ask_question("Data file name (e.g., sales_data.csv)", default="sales_data.csv")

    # Check if file exists (informational only; a missing file is not fatal)
    if Path(data_file).exists():
        print(f" ✓ Found: {data_file}")
    else:
        print(f" ⚠ Warning: {data_file} not found. Make sure to place it in the template directory.")

    responses['data_file'] = data_file

    # Column Mapping
    print_header("Column Mapping")
    print("What are the column names in your CSV file?")
    print("(Press Enter to accept defaults if your columns match common names)\n")

    responses['revenue_column'] = ask_question("Revenue/Amount column name", default="USD")
    responses['date_column'] = ask_question("Primary date column name", default="InvoiceDate")

    has_fallback = ask_question("Do you have fallback date columns (Month, Year)?", default="yes", validator=validate_yes_no)
    if has_fallback:
        fallback_str = ask_question("Fallback date columns (comma-separated)", default="Month, Year")
        responses['date_fallback'] = [col.strip() for col in fallback_str.split(',')]
    else:
        responses['date_fallback'] = []

    responses['customer_column'] = ask_question("Customer/Account column name", default="Customer")
    responses['item_column'] = ask_question("Item/Product column name", default="Item")

    has_quantity = ask_question("Do you have a Quantity column?", default="yes", validator=validate_yes_no)
    if has_quantity:
        responses['quantity_column'] = ask_question("Quantity column name", default="Quantity")
    else:
        # None signals "no quantity column" to the replacement table below.
        responses['quantity_column'] = None

    # Date Range
    print_header("Date Range Configuration")
    responses['min_year'] = ask_question("Minimum year to include in analysis", default="2021", validator=validate_int)
    responses['max_date'] = ask_question("Maximum date (YYYY-MM-DD)", default="2025-09-30")

    years_str = ask_question("Analysis years (comma-separated, e.g., 2021,2022,2023,2024,2025)", default="2021,2022,2023,2024,2025")
    responses['analysis_years'] = [int(y.strip()) for y in years_str.split(',')]

    # LTM Configuration
    print_header("LTM (Last Twelve Months) Configuration")
    print("LTM is used for the most recent partial year to enable apples-to-apples comparison.")
    print("Example: If your latest data is through September 2025, use Oct 2024 - Sep 2025.\n")

    use_ltm = ask_question("Do you need LTM for the most recent year?", default="yes", validator=validate_yes_no)
    responses['ltm_enabled'] = use_ltm

    if use_ltm:
        responses['ltm_start_month'] = ask_question("LTM start month (1-12)", default="10", validator=validate_int)
        responses['ltm_start_year'] = ask_question("LTM start year", default="2024", validator=validate_int)
        responses['ltm_end_month'] = ask_question("LTM end month (1-12)", default="9", validator=validate_int)
        responses['ltm_end_year'] = ask_question("LTM end year", default="2025", validator=validate_int)
    else:
        # Keep template defaults so the config constants stay well-formed even
        # when LTM is disabled.
        responses['ltm_start_month'] = 10
        responses['ltm_start_year'] = 2024
        responses['ltm_end_month'] = 9
        responses['ltm_end_year'] = 2025

    # Exclusion Filters
    print_header("Exclusion Filters (Optional)")
    use_exclusions = ask_question("Do you need to exclude specific segments (e.g., test accounts, business units)?", default="no", validator=validate_yes_no)
    responses['exclusions_enabled'] = use_exclusions

    if use_exclusions:
        responses['exclude_column'] = ask_question("Column name to filter on", default="Country")
        exclude_values_str = ask_question("Values to exclude (comma-separated)", default="")
        responses['exclude_values'] = [v.strip() for v in exclude_values_str.split(',') if v.strip()]
    else:
        responses['exclude_column'] = None
        responses['exclude_values'] = []

    # Generate config.py
    print_header("Generating Configuration")
    print("Updating config.py with your settings...")

    # Read current config.py
    config_path = Path('config.py')
    if not config_path.exists():
        print("ERROR: config.py not found!")
        return

    with open(config_path, 'r', encoding='utf-8') as f:
        config_content = f.read()

    # Replace values: maps the template's exact default assignment lines to
    # the user's new assignment lines.
    replacements = {
        "COMPANY_NAME = \"Your Company Name\"": f"COMPANY_NAME = \"{responses['company_name']}\"",
        "ANALYSIS_DATE = \"2026-01-12\"": f"ANALYSIS_DATE = \"{responses['analysis_date']}\"",
        "DATA_FILE = 'sales_data.csv'": f"DATA_FILE = '{responses['data_file']}'",
        "REVENUE_COLUMN = 'USD'": f"REVENUE_COLUMN = '{responses['revenue_column']}'",
        "DATE_COLUMN = 'InvoiceDate'": f"DATE_COLUMN = '{responses['date_column']}'",
        "DATE_FALLBACK_COLUMNS = ['Month', 'Year']": f"DATE_FALLBACK_COLUMNS = {responses['date_fallback']}",
        "CUSTOMER_COLUMN = 'Customer'": f"CUSTOMER_COLUMN = '{responses['customer_column']}'",
        "ITEM_COLUMN = 'Item'": f"ITEM_COLUMN = '{responses['item_column']}'",
        "QUANTITY_COLUMN = 'Quantity'": f"QUANTITY_COLUMN = '{responses['quantity_column']}'" if responses['quantity_column'] else "QUANTITY_COLUMN = None",
        "MIN_YEAR = 2021": f"MIN_YEAR = {responses['min_year']}",
        "MAX_DATE = pd.Timestamp('2025-09-30')": f"MAX_DATE = pd.Timestamp('{responses['max_date']}')",
        "ANALYSIS_YEARS = [2021, 2022, 2023, 2024, 2025]": f"ANALYSIS_YEARS = {responses['analysis_years']}",
        "LTM_ENABLED = True": f"LTM_ENABLED = {responses['ltm_enabled']}",
        "LTM_START_MONTH = 10": f"LTM_START_MONTH = {responses['ltm_start_month']}",
        "LTM_START_YEAR = 2024": f"LTM_START_YEAR = {responses['ltm_start_year']}",
        "LTM_END_MONTH = 9": f"LTM_END_MONTH = {responses['ltm_end_month']}",
        "LTM_END_YEAR = 2025": f"LTM_END_YEAR = {responses['ltm_end_year']}",
    }

    # Handle exclusions: the EXCLUSION_FILTERS dict is rebuilt wholesale via a
    # non-greedy regex over the existing dict literal.
    if responses['exclusions_enabled']:
        exclusions_config = f"""EXCLUSION_FILTERS = {{
    'enabled': True,
    'exclude_by_column': '{responses['exclude_column']}',
    'exclude_values': {responses['exclude_values']}
}}"""
        # Replace the exclusion filters section
        import re
        pattern = r"EXCLUSION_FILTERS = \{.*?\}"
        config_content = re.sub(pattern, exclusions_config, config_content, flags=re.DOTALL)
    else:
        exclusions_config = """EXCLUSION_FILTERS = {
    'enabled': False,
    'exclude_by_column': None,
    'exclude_values': []
}"""
        import re
        pattern = r"EXCLUSION_FILTERS = \{.*?\}"
        config_content = re.sub(pattern, exclusions_config, config_content, flags=re.DOTALL)

    # Apply replacements (silently skipped when the default literal is absent)
    for old, new in replacements.items():
        if old in config_content:
            config_content = config_content.replace(old, new)

    # Write updated config
    with open(config_path, 'w', encoding='utf-8') as f:
        f.write(config_content)

    print(" ✓ Configuration updated successfully!")

    # Summary
    print_header("Setup Complete")
    print("Your configuration has been saved to config.py")
    print("\nNext steps:")
    print("1. Place your data file in the template directory (if not already there)")
    print("2. Test data loading: python -c \"from data_loader import load_sales_data; from config import get_data_path; df = load_sales_data(get_data_path()); print(f'Loaded {len(df):,} rows')\"")
    print("3. Review config.py and adjust any settings as needed")
    print("4. Start creating your analysis scripts using analysis_template.py")
    print("\nFor help, see README.md")
||||||
|
if __name__ == "__main__":
    # Allow the wizard to be aborted with Ctrl+C without dumping a traceback.
    try:
        main()
    except KeyboardInterrupt:
        print("\n\nSetup cancelled by user.")
        sys.exit(0)
|
||||||
321
statistical_utils.py
Normal file
321
statistical_utils.py
Normal file
@@ -0,0 +1,321 @@
|
|||||||
|
"""
|
||||||
|
Statistical analysis utilities
|
||||||
|
Common statistical operations for sales analysis
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
from statistical_utils import calculate_yoy_growth, calculate_cagr, calculate_correlation
|
||||||
|
|
||||||
|
# Calculate year-over-year growth
|
||||||
|
growth = calculate_yoy_growth(current_value=100, previous_value=90)
|
||||||
|
|
||||||
|
# Calculate CAGR
|
||||||
|
cagr = calculate_cagr(start_value=100, end_value=150, periods=3)
|
||||||
|
"""
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
from scipy import stats
|
||||||
|
|
||||||
|
def calculate_yoy_growth(current, previous):
    """
    Calculate year-over-year growth percentage.

    Args:
        current: Current period value
        previous: Previous period value

    Returns:
        float: Growth percentage (negative for a decline). NaN when both
               values are zero, +inf when growing from a zero base.

    Example:
        calculate_yoy_growth(110, 100)  # Returns 10.0
        calculate_yoy_growth(90, 100)   # Returns -10.0
    """
    # Growth from a zero base is undefined: 0 -> 0 is NaN, 0 -> x is infinite.
    if previous == 0:
        if current == 0:
            return np.nan
        return np.inf

    delta = current - previous
    return (delta / previous) * 100
|
||||||
|
|
||||||
|
def calculate_cagr(start_value, end_value, periods):
    """
    Calculate Compound Annual Growth Rate (CAGR).

    Args:
        start_value: Starting value
        end_value: Ending value
        periods: Number of periods (years)

    Returns:
        float: CAGR as a percentage, or NaN for non-positive inputs.

    Example:
        calculate_cagr(100, 150, 3)  # Returns ~14.47%
    """
    # CAGR is only defined for positive endpoints over a positive horizon.
    if start_value <= 0 or periods <= 0 or end_value <= 0:
        return np.nan

    growth_factor = end_value / start_value
    return (growth_factor ** (1 / periods) - 1) * 100
|
||||||
|
|
||||||
|
def calculate_correlation(df, col1, col2):
    """
    Calculate correlation between two columns.

    Args:
        df: DataFrame
        col1: First column name
        col2: Second column name

    Returns:
        float: Pearson correlation coefficient (-1 to 1), or NaN when a
               column is missing or fewer than two valid pairs exist.
    """
    if col1 not in df.columns or col2 not in df.columns:
        return np.nan

    # Coerce both columns to numeric; non-numeric entries become NaN.
    x = pd.to_numeric(df[col1], errors='coerce')
    y = pd.to_numeric(df[col2], errors='coerce')

    # Keep only rows where both sides are present.
    keep = x.notna() & y.notna()
    if keep.sum() < 2:
        return np.nan

    return x[keep].corr(y[keep])
|
||||||
|
|
||||||
|
def calculate_trend_slope(y_values):
    """
    Calculate the slope of an ordinary least-squares linear trend.

    The x axis is the 0-based position of each observation, so the slope is
    expressed in value-units per period. NaN observations are dropped before
    fitting.

    Args:
        y_values: Array-like of y values (list, ndarray, or Series)

    Returns:
        float: Slope of the linear trend, or NaN when fewer than two valid
               points remain.
    """
    # Coerce to a float ndarray so boolean masking works for plain Python
    # lists too (indexing a list with a boolean array raised TypeError in
    # the previous implementation) and so np.isnan is well-defined.
    y = np.asarray(y_values, dtype=float)
    if y.size < 2:
        return np.nan

    x = np.arange(y.size)

    # Drop NaN observations before fitting.
    valid = ~np.isnan(y)
    if valid.sum() < 2:
        return np.nan

    result = stats.linregress(x[valid], y[valid])
    return result.slope
|
||||||
|
|
||||||
|
def calculate_percent_change(series, periods=1):
    """
    Calculate percent change over periods.

    Args:
        series: Pandas Series
        periods: Number of periods to shift (default: 1)

    Returns:
        Series: Percent change (the first `periods` entries are NaN).
    """
    fractional_change = series.pct_change(periods=periods)
    return fractional_change * 100
|
||||||
|
|
||||||
|
def calculate_moving_average(series, window=3):
    """
    Calculate moving average.

    Args:
        series: Pandas Series
        window: Window size for moving average

    Returns:
        Series: Trailing moving average (NaN until the window fills).
    """
    windowed = series.rolling(window=window, center=False)
    return windowed.mean()
|
||||||
|
|
||||||
|
def calculate_volatility(series, window=12):
    """
    Calculate rolling volatility (standard deviation).

    Args:
        series: Pandas Series
        window: Window size for rolling calculation

    Returns:
        Series: Rolling sample standard deviation (NaN until the window fills).
    """
    windowed = series.rolling(window=window, center=False)
    return windowed.std()
|
||||||
|
|
||||||
|
def calculate_z_score(value, mean, std):
    """
    Calculate z-score.

    Args:
        value: Value to score
        mean: Mean of distribution
        std: Standard deviation of distribution

    Returns:
        float: Z-score, or NaN for a degenerate (zero-spread) distribution.
    """
    # A zero standard deviation would divide by zero.
    if std == 0:
        return np.nan

    deviation = value - mean
    return deviation / std
|
||||||
|
|
||||||
|
def test_statistical_significance(group1, group2, alpha=0.05):
|
||||||
|
"""
|
||||||
|
Test statistical significance between two groups (t-test)
|
||||||
|
|
||||||
|
Args:
|
||||||
|
group1: First group (array-like)
|
||||||
|
group2: Second group (array-like)
|
||||||
|
alpha: Significance level (default: 0.05)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
dict: Test results with p-value, significant flag, etc.
|
||||||
|
"""
|
||||||
|
group1 = np.array(group1)
|
||||||
|
group2 = np.array(group2)
|
||||||
|
|
||||||
|
# Remove NaN values
|
||||||
|
group1 = group1[~np.isnan(group1)]
|
||||||
|
group2 = group2[~np.isnan(group2)]
|
||||||
|
|
||||||
|
if len(group1) < 2 or len(group2) < 2:
|
||||||
|
return {
|
||||||
|
'p_value': np.nan,
|
||||||
|
'significant': False,
|
||||||
|
'test_statistic': np.nan,
|
||||||
|
'error': 'Insufficient data'
|
||||||
|
}
|
||||||
|
|
||||||
|
# Perform t-test
|
||||||
|
t_statistic, p_value = stats.ttest_ind(group1, group2)
|
||||||
|
|
||||||
|
return {
|
||||||
|
'p_value': float(p_value),
|
||||||
|
'significant': p_value < alpha,
|
||||||
|
'test_statistic': float(t_statistic),
|
||||||
|
'alpha': alpha,
|
||||||
|
'group1_mean': float(np.mean(group1)),
|
||||||
|
'group2_mean': float(np.mean(group2)),
|
||||||
|
'group1_std': float(np.std(group1)),
|
||||||
|
'group2_std': float(np.std(group2))
|
||||||
|
}
|
||||||
|
|
||||||
|
def calculate_confidence_interval(series, confidence=0.95):
    """
    Calculate a normal-approximation confidence interval for a series mean.

    Args:
        series: Pandas Series
        confidence: Confidence level (default: 0.95 for 95%)

    Returns:
        dict: mean, lower, upper, confidence, and margin; NaN fields (and no
              'margin' key) when the series has no valid values.
    """
    values = series.dropna()

    if values.empty:
        return {
            'mean': np.nan,
            'lower': np.nan,
            'upper': np.nan,
            'confidence': confidence
        }

    center = values.mean()
    n = len(values)

    # Standard error of the mean (sample std / sqrt(n)).
    standard_error = values.std() / np.sqrt(n)

    # Two-sided critical value from the normal distribution.
    tail = (1 - confidence) / 2
    z_critical = stats.norm.ppf(1 - tail)
    margin = z_critical * standard_error

    return {
        'mean': float(center),
        'lower': float(center - margin),
        'upper': float(center + margin),
        'confidence': confidence,
        'margin': float(margin)
    }
|
||||||
|
|
||||||
|
def calculate_annual_growth_rates(values, years):
    """
    Calculate year-over-year growth rates for annual data.

    Args:
        values: Array-like of annual values
        years: Array-like of corresponding years

    Returns:
        DataFrame: Year, Value, YoY_Growth (percent), and YoY_Change
                   (absolute) columns; the first row's growth fields are NaN.
    """
    result = pd.DataFrame({
        'Year': years,
        'Value': values
    })

    # Percent growth vs the prior year (inlined percent-change helper).
    result['YoY_Growth'] = result['Value'].pct_change(periods=1) * 100
    # Absolute change vs the prior year.
    result['YoY_Change'] = result['Value'].diff()

    return result
|
||||||
|
|
||||||
|
def calculate_seasonality_index(monthly_series):
    """
    Calculate a per-month seasonality index for monthly data.

    Each calendar month's index is its average value divided by the overall
    average: 1.0 = average, >1.0 = above average, <1.0 = below average.

    Args:
        monthly_series: Series with datetime index (monthly frequency)

    Returns:
        Series: Seasonality index keyed by calendar month (1-12)

    Raises:
        ValueError: If the series does not have a DatetimeIndex.
    """
    if not isinstance(monthly_series.index, pd.DatetimeIndex):
        raise ValueError("Series must have DatetimeIndex")

    # Group by calendar month directly via the index. The previous
    # implementation assigned a 'Month' column onto a *Series* (invalid,
    # raises) and then grouped by the column name; grouping on
    # index.month achieves the intended behavior.
    monthly_avg = monthly_series.groupby(monthly_series.index.month).mean()
    overall_avg = monthly_series.mean()

    return monthly_avg / overall_avg
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# EXAMPLE USAGE
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    """Example usage"""
    # Smoke demo of the main helpers; output matches the documented examples.
    print(f"Year-over-year growth: {calculate_yoy_growth(110, 100):.2f}%")

    print(f"CAGR: {calculate_cagr(100, 150, 3):.2f}%")

    # Perfectly correlated sample data.
    demo = pd.DataFrame({
        'Revenue': [100, 110, 120, 130, 140],
        'Quantity': [10, 11, 12, 13, 14]
    })
    print(f"Correlation: {calculate_correlation(demo, 'Revenue', 'Quantity'):.2f}")
|
||||||
85
tests/test_analysis_utils.py
Normal file
85
tests/test_analysis_utils.py
Normal file
@@ -0,0 +1,85 @@
|
|||||||
|
"""
|
||||||
|
Unit tests for analysis_utils.py
|
||||||
|
"""
|
||||||
|
import pytest
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
from pathlib import Path
|
||||||
|
import sys
|
||||||
|
|
||||||
|
# Add parent directory to path
|
||||||
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||||
|
|
||||||
|
from analysis_utils import (
|
||||||
|
millions_formatter, thousands_formatter,
|
||||||
|
get_millions_formatter, get_thousands_formatter,
|
||||||
|
format_currency, calculate_price_per_unit,
|
||||||
|
sort_mixed_years, safe_year_labels
|
||||||
|
)
|
||||||
|
|
||||||
|
class TestFormatters:
    """Formatting helpers should render currency suffixes correctly."""

    def test_millions_formatter(self):
        """Millions formatter renders '$X.Xm'."""
        for value, expected in [(10.5, '$10.5m'), (0, '$0.0m'), (100.0, '$100.0m')]:
            assert millions_formatter(value, None) == expected

    def test_thousands_formatter(self):
        """Thousands formatter renders '$X.Xk'."""
        for value, expected in [(10.5, '$10.5k'), (0, '$0.0k')]:
            assert thousands_formatter(value, None) == expected

    def test_format_currency(self):
        """format_currency scales to millions/thousands and handles NaN."""
        assert format_currency(1000000) == '$1.00m'
        assert format_currency(1000, millions=False) == '$1.00k'
        assert format_currency(np.nan) == 'N/A'
|
||||||
|
|
||||||
|
class TestPriceCalculation:
    """Price-per-unit calculation behaviour."""

    def test_calculate_price_per_unit(self):
        """Aggregate revenue divided by aggregate quantity."""
        frame = pd.DataFrame({
            'Quantity': [10, 20, 30],
            'Revenue': [100, 200, 300]
        })

        # (100 + 200 + 300) / (10 + 20 + 30)
        assert calculate_price_per_unit(frame, 'Quantity', 'Revenue') == 10.0

    def test_calculate_price_per_unit_with_outliers(self):
        """Rows above the default quantity threshold are excluded."""
        frame = pd.DataFrame({
            'Quantity': [10, 20, 30, 2000],  # 2000 is an outlier
            'Revenue': [100, 200, 300, 10000]
        })

        # The 2000-unit row should be dropped (default cutoff > 1000),
        # leaving the same 10.0 average as the clean data.
        assert calculate_price_per_unit(frame, 'Quantity', 'Revenue') == 10.0
|
||||||
|
|
||||||
|
class TestYearHandling:
    """Mixed int/str year utilities."""

    def test_sort_mixed_years(self):
        """Numeric years sort first; LTM string labels sort last."""
        frame = pd.DataFrame({
            'Year': [2023, '2025 (LTM)', 2024, 2022],
            'Value': [100, 150, 120, 90]
        })

        ordered = sort_mixed_years(frame, 'Year')
        assert ordered['Year'].iloc[0] == 2022
        assert ordered['Year'].iloc[-1] == '2025 (LTM)'

    def test_safe_year_labels(self):
        """All year values become display strings."""
        labels = safe_year_labels([2021, 2022, '2025 (LTM)'])
        assert labels == ['2021', '2022', '2025 (LTM)']


if __name__ == "__main__":
    pytest.main([__file__, '-v'])
|
||||||
45
tests/test_config_validator.py
Normal file
45
tests/test_config_validator.py
Normal file
@@ -0,0 +1,45 @@
|
|||||||
|
"""
|
||||||
|
Unit tests for config_validator.py
|
||||||
|
"""
|
||||||
|
import pytest
|
||||||
|
import pandas as pd
|
||||||
|
from pathlib import Path
|
||||||
|
import sys
|
||||||
|
|
||||||
|
# Add parent directory to path
|
||||||
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||||
|
|
||||||
|
from config_validator import validate_config
|
||||||
|
|
||||||
|
class TestConfigValidator:
    """Test configuration validation."""

    def test_validate_config_missing_column(self):
        """Validation reports missing required columns as errors."""
        df = pd.DataFrame({
            'SomeColumn': [1, 2, 3]
        })

        errors, warnings = validate_config(df)

        # Should have errors for missing required columns.
        assert len(errors) > 0
        assert any('not found' in error.lower() for error in errors)

    def test_validate_config_valid_data(self):
        """A structurally sound frame produces no critical errors."""
        df = pd.DataFrame({
            'InvoiceDate': pd.to_datetime(['2023-01-01', '2023-02-01']),
            'USD': [100.0, 200.0],
            'Year': [2023, 2023]
        })

        errors, warnings = validate_config(df)

        # Warnings about optional columns are acceptable; only "not found"
        # errors for the required columns count as critical. The original
        # filter relied on `A and B or C`, which parses as `(A and B) or C`
        # and wrongly flagged any error mentioning 'InvoiceDate'; the
        # parentheses below express the intended `A and (B or C)`.
        critical_errors = [
            e for e in errors
            if 'not found' in e.lower() and ('USD' in e or 'InvoiceDate' in e)
        ]
        assert len(critical_errors) == 0


if __name__ == "__main__":
    pytest.main([__file__, '-v'])
|
||||||
68
tests/test_data_loader.py
Normal file
68
tests/test_data_loader.py
Normal file
@@ -0,0 +1,68 @@
|
|||||||
|
"""
|
||||||
|
Integration tests for data_loader.py
|
||||||
|
"""
|
||||||
|
import pytest
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
from pathlib import Path
|
||||||
|
import sys
|
||||||
|
import tempfile
|
||||||
|
import os
|
||||||
|
|
||||||
|
# Add parent directory to path
|
||||||
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||||
|
|
||||||
|
from data_loader import load_sales_data, validate_data_structure
|
||||||
|
|
||||||
|
class TestDataLoader:
    """Data loading functions."""

    def test_load_sales_data_basic(self):
        """Loading a minimal CSV adds the derived Year/YearMonth columns."""
        # Create a throwaway CSV fixture on disk.
        with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False) as tmp:
            tmp.write('InvoiceDate,USD,Customer\n'
                      '2023-01-01,100.0,Customer1\n'
                      '2023-02-01,200.0,Customer2\n')
            temp_path = tmp.name

        try:
            # Point the config at the temporary file for the duration of the test.
            import config
            original_data_file = config.DATA_FILE
            config.DATA_FILE = Path(temp_path).name

            frame = load_sales_data(Path(temp_path))

            assert len(frame) == 2
            assert 'Year' in frame.columns
            assert 'YearMonth' in frame.columns

            # Restore the original configuration.
            config.DATA_FILE = original_data_file
        finally:
            os.unlink(temp_path)

    def test_validate_data_structure(self):
        """Structure validation accepts required columns, rejects missing ones."""
        good = pd.DataFrame({
            'InvoiceDate': pd.to_datetime(['2023-01-01', '2023-02-01']),
            'USD': [100.0, 200.0]
        })

        ok, message = validate_data_structure(good)
        assert ok
        assert message == "OK"

        # Missing the required revenue column.
        bad = pd.DataFrame({
            'InvoiceDate': pd.to_datetime(['2023-01-01'])
        })

        ok, message = validate_data_structure(bad)
        assert not ok
        assert 'Missing required column' in message


if __name__ == "__main__":
    pytest.main([__file__, '-v'])
|
||||||
95
validate_revenue.py
Normal file
95
validate_revenue.py
Normal file
@@ -0,0 +1,95 @@
|
|||||||
|
"""
|
||||||
|
Revenue validation utility
|
||||||
|
Validates that revenue calculations are consistent across analyses
|
||||||
|
"""
|
||||||
|
import pandas as pd
|
||||||
|
from config import (
|
||||||
|
REVENUE_COLUMN, ANALYSIS_YEARS, VALIDATION_ENABLED,
|
||||||
|
EXPECTED_REVENUE, REVENUE_TOLERANCE_PCT, LTM_ENABLED,
|
||||||
|
get_ltm_period
|
||||||
|
)
|
||||||
|
from analysis_utils import get_annual_data
|
||||||
|
|
||||||
|
def validate_revenue(dataframe: pd.DataFrame, analysis_name: str = "Analysis") -> None:
    """
    Print annual revenue summary for validation.

    This function helps ensure that:
    1. Data loading is working correctly
    2. Revenue calculations are consistent
    3. Filters are not accidentally excluding too much data

    When validation targets are configured, each year's total is also
    checked against EXPECTED_REVENUE within REVENUE_TOLERANCE_PCT.

    Args:
        dataframe: DataFrame with revenue and date columns (should have
            REVENUE_COLUMN and Year)
        analysis_name: Name of the analysis (for logging/display)

    Example:
        >>> validate_revenue(df, "Revenue Analysis")
        >>> # Prints annual revenue summary by year
    """
    import re

    from config import DATE_COLUMN

    frame = dataframe.copy()

    # Normalize the date column; unparseable entries become NaT.
    if DATE_COLUMN in frame.columns:
        frame[DATE_COLUMN] = pd.to_datetime(frame[DATE_COLUMN], errors='coerce', format='mixed')

    # Restrict to the configured analysis window.
    frame = frame[frame['Year'].isin(ANALYSIS_YEARS)]

    # Sum revenue per year; the final year may be a partial LTM period.
    ltm_start, ltm_end = get_ltm_period() if LTM_ENABLED else (None, None)
    revenue_by_label = {}
    years_present = frame['Year'].unique()
    for year in sorted(ANALYSIS_YEARS):
        if year not in years_present:
            continue
        year_data, year_label = get_annual_data(frame, year, ltm_start, ltm_end)
        if len(year_data) > 0:
            revenue_by_label[year_label] = year_data[REVENUE_COLUMN].sum()

    print(f"\n{'='*60}")
    print(f"Annual Revenue Validation - {analysis_name}")
    print(f"{'='*60}")

    if not revenue_by_label:
        print(" No revenue data found for analysis years")
        print(f"{'='*60}\n")
        return

    for year_label, amount in revenue_by_label.items():
        formatted = f"${amount / 1e6:.2f}m"
        print(f" {year_label}: {formatted}")

    # Optional check against configured expected totals.
    if VALIDATION_ENABLED and EXPECTED_REVENUE:
        print(f"\nValidation Check:")
        all_valid = True
        for year_label, actual_revenue in revenue_by_label.items():
            # Map the display label back to a numeric year, e.g.
            # "2025 (LTM 9/2025)" -> 2025.
            if isinstance(year_label, str):
                year_match = re.search(r'(\d{4})', str(year_label))
                year_key = int(year_match.group(1)) if year_match else None
            else:
                year_key = year_label

            if year_key not in EXPECTED_REVENUE:
                continue

            expected = EXPECTED_REVENUE[year_key]
            tolerance = expected * REVENUE_TOLERANCE_PCT
            diff = abs(actual_revenue - expected)

            if diff <= tolerance:
                print(f" ✓ {year_label}: Within tolerance ({diff/1e6:.2f}m difference)")
            else:
                print(f" ✗ {year_label}: Outside tolerance (expected ${expected/1e6:.2f}m, got ${actual_revenue/1e6:.2f}m, diff: ${diff/1e6:.2f}m)")
                all_valid = False

        if all_valid:
            print(" All validations passed!")
        else:
            print(" WARNING: Some validations failed. Check data loading and filters.")

    print(f"{'='*60}\n")
|
||||||
Reference in New Issue
Block a user