# sales-data-analysis/validate_revenue.py

"""
Revenue validation utility
Validates that revenue calculations are consistent across analyses
"""
import re

import pandas as pd

from config import (
    REVENUE_COLUMN, ANALYSIS_YEARS, VALIDATION_ENABLED,
    EXPECTED_REVENUE, REVENUE_TOLERANCE_PCT, LTM_ENABLED,
    DATE_COLUMN, get_ltm_period
)
from analysis_utils import get_annual_data

def _format_millions(amount) -> str:
    """Format a currency amount in millions of dollars, e.g. ``$1.23m``."""
    return f"${amount / 1e6:.2f}m"


def _expected_revenue_key(year_label):
    """
    Return the key used to look up a revenue label in EXPECTED_REVENUE.

    String labels (e.g. "2025 (LTM 9/2025)") are reduced to the first
    four-digit number they contain, as an int. A string without a four-digit
    number yields None. Non-string labels are returned unchanged.
    """
    if isinstance(year_label, str):
        year_match = re.search(r'(\d{4})', year_label)
        return int(year_match.group(1)) if year_match else None
    return year_label


def validate_revenue(dataframe: pd.DataFrame, analysis_name: str = "Analysis") -> None:
    """
    Print annual revenue summary for validation.

    This function helps ensure that:
    1. Data loading is working correctly
    2. Revenue calculations are consistent
    3. Filters are not accidentally excluding too much data

    The input is copied, so the caller's DataFrame is never modified. Revenue
    is summed per year in ANALYSIS_YEARS using get_annual_data(), which also
    supplies the label each year is reported under. When VALIDATION_ENABLED is
    set and EXPECTED_REVENUE is non-empty, each total is compared against its
    expected value and reported as inside or outside tolerance.

    Args:
        dataframe: DataFrame with revenue and date columns (should have REVENUE_COLUMN and Year)
        analysis_name: Name of the analysis (for logging/display)

    Raises:
        KeyError: If the DataFrame has no 'Year' column.

    Example:
        >>> validate_revenue(df, "Revenue Analysis")
        >>> # Prints annual revenue summary by year
    """
    if 'Year' not in dataframe.columns:
        # Same exception type as the bare column lookup would raise, but with
        # a message that says which analysis and which column is the problem.
        raise KeyError(
            f"validate_revenue ({analysis_name}): DataFrame has no 'Year' column"
        )

    df = dataframe.copy()

    # Ensure date column is datetime (unparseable values become NaT)
    if DATE_COLUMN in df.columns:
        df[DATE_COLUMN] = pd.to_datetime(df[DATE_COLUMN], errors='coerce', format='mixed')

    # Filter to analysis years
    df = df[df['Year'].isin(ANALYSIS_YEARS)]

    # Calculate annual revenue, keyed by the label get_annual_data() returns
    annual_revenue = {}
    ltm_start, ltm_end = get_ltm_period() if LTM_ENABLED else (None, None)
    years_present = df['Year'].unique()  # computed once rather than per year

    for year in sorted(ANALYSIS_YEARS):
        if year not in years_present:
            continue
        year_data, year_label = get_annual_data(df, year, ltm_start, ltm_end)
        if len(year_data) > 0:
            annual_revenue[year_label] = year_data[REVENUE_COLUMN].sum()

    # Print summary
    separator = '=' * 60
    print(f"\n{separator}")
    print(f"Annual Revenue Validation - {analysis_name}")
    print(separator)

    if annual_revenue:
        for year_label, revenue in annual_revenue.items():
            print(f"  {year_label}: {_format_millions(revenue)}")

        # Validation against expected values
        if VALIDATION_ENABLED and EXPECTED_REVENUE:
            print("\nValidation Check:")
            checked = 0
            all_valid = True

            for year_label, actual_revenue in annual_revenue.items():
                year_key = _expected_revenue_key(year_label)
                if year_key not in EXPECTED_REVENUE:
                    continue

                checked += 1
                expected = EXPECTED_REVENUE[year_key]
                # abs() keeps the tolerance band non-negative even if an
                # expected value is negative.
                # NOTE(review): REVENUE_TOLERANCE_PCT is applied as a plain
                # multiplier (a fraction such as 0.05), not divided by 100 -
                # confirm this matches the value set in config.
                tolerance = abs(expected) * REVENUE_TOLERANCE_PCT
                diff = abs(actual_revenue - expected)

                if diff <= tolerance:
                    print(f"  ✓ {year_label}: Within tolerance ({_format_millions(diff)} difference)")
                else:
                    print(
                        f"  ✗ {year_label}: Outside tolerance "
                        f"(expected {_format_millions(expected)}, "
                        f"got {_format_millions(actual_revenue)}, "
                        f"diff: {_format_millions(diff)})"
                    )
                    all_valid = False

            if checked == 0:
                # Nothing was compared, so do not report success.
                print("  WARNING: No expected revenue values matched the analysis years; nothing was validated.")
            elif all_valid:
                print("  All validations passed!")
            else:
                print("  WARNING: Some validations failed. Check data loading and filters.")
    else:
        print("  No revenue data found for analysis years")

    print(f"{separator}\n")