From cf0b596449046f706cd3273d3d8d0de9f2267e6d Mon Sep 17 00:00:00 2001 From: Jonathan Pressnell Date: Fri, 6 Feb 2026 09:16:34 -0500 Subject: [PATCH] Initial commit: sales analysis template Co-authored-by: Cursor --- .cursor/rules/advanced_analysis_patterns.md | 307 ++++++++++ .cursor/rules/ai_assistant_guide.md | 316 +++++++++++ .cursor/rules/analysis_patterns.md | 161 ++++++ .cursor/rules/chart_formatting.md | 111 ++++ .cursor/rules/code_quality.md | 389 +++++++++++++ .cursor/rules/common_errors.md | 109 ++++ .cursor/rules/data_loading.md | 69 +++ .cursor/rules/error_handling.md | 276 +++++++++ .cursor/rules/ltm_methodology.md | 89 +++ EXAMPLES.md | 203 +++++++ QUICK_START.md | 175 ++++++ README.md | 589 ++++++++++++++++++++ SETUP_CHECKLIST.md | 118 ++++ TEMPLATE_OVERVIEW.md | 150 +++++ TEMPLATE_SUMMARY.md | 254 +++++++++ analysis_template.py | 147 +++++ analysis_utils.py | 510 +++++++++++++++++ config.py | 277 +++++++++ config_validator.py | 214 +++++++ data_loader.py | 224 ++++++++ data_processing.py | 285 ++++++++++ data_quality.py | 344 ++++++++++++ examples/annual_revenue_trend.py | 134 +++++ examples/cohort_analysis.py | 218 ++++++++ examples/customer_segmentation.py | 213 +++++++ examples/product_performance.py | 203 +++++++ export_utils.py | 238 ++++++++ generate_sample_data.py | 184 ++++++ logger_config.py | 197 +++++++ report_generator.py | 228 ++++++++ requirements.txt | 30 + run_all_analyses.py | 185 ++++++ setup_wizard.py | 240 ++++++++ statistical_utils.py | 321 +++++++++++ tests/test_analysis_utils.py | 85 +++ tests/test_config_validator.py | 45 ++ tests/test_data_loader.py | 68 +++ validate_revenue.py | 95 ++++ 38 files changed, 8001 insertions(+) create mode 100644 .cursor/rules/advanced_analysis_patterns.md create mode 100644 .cursor/rules/ai_assistant_guide.md create mode 100644 .cursor/rules/analysis_patterns.md create mode 100644 .cursor/rules/chart_formatting.md create mode 100644 .cursor/rules/code_quality.md create mode 100644 
.cursor/rules/common_errors.md create mode 100644 .cursor/rules/data_loading.md create mode 100644 .cursor/rules/error_handling.md create mode 100644 .cursor/rules/ltm_methodology.md create mode 100644 EXAMPLES.md create mode 100644 QUICK_START.md create mode 100644 README.md create mode 100644 SETUP_CHECKLIST.md create mode 100644 TEMPLATE_OVERVIEW.md create mode 100644 TEMPLATE_SUMMARY.md create mode 100644 analysis_template.py create mode 100644 analysis_utils.py create mode 100644 config.py create mode 100644 config_validator.py create mode 100644 data_loader.py create mode 100644 data_processing.py create mode 100644 data_quality.py create mode 100644 examples/annual_revenue_trend.py create mode 100644 examples/cohort_analysis.py create mode 100644 examples/customer_segmentation.py create mode 100644 examples/product_performance.py create mode 100644 export_utils.py create mode 100644 generate_sample_data.py create mode 100644 logger_config.py create mode 100644 report_generator.py create mode 100644 requirements.txt create mode 100644 run_all_analyses.py create mode 100644 setup_wizard.py create mode 100644 statistical_utils.py create mode 100644 tests/test_analysis_utils.py create mode 100644 tests/test_config_validator.py create mode 100644 tests/test_data_loader.py create mode 100644 validate_revenue.py diff --git a/.cursor/rules/advanced_analysis_patterns.md b/.cursor/rules/advanced_analysis_patterns.md new file mode 100644 index 0000000..ae2f504 --- /dev/null +++ b/.cursor/rules/advanced_analysis_patterns.md @@ -0,0 +1,307 @@ +# Advanced Analysis Patterns + +This document provides patterns for sophisticated, production-grade analyses that leverage the full capabilities of the template framework. 
+ +## ⭐ Using Cursor AI Effectively + +When working in Cursor, you can ask the AI to: +- "Create a cohort analysis script using the template patterns" +- "Add statistical significance testing to this analysis" +- "Generate a multi-dimensional analysis with product, customer, and geography" +- "Create a forecasting analysis with confidence intervals" + +The AI will automatically use these patterns and utilities. + +## Advanced Analysis Types + +### 1. Multi-Dimensional Analysis + +**Pattern:** Analyze across multiple dimensions simultaneously (e.g., Product Ɨ Customer Ɨ Geography) + +```python +from data_loader import load_sales_data +from analysis_utils import calculate_annual_metrics, get_ltm_period_config +from config import REVENUE_COLUMN, ITEM_COLUMN, CUSTOMER_COLUMN, REGION_COLUMN + +df = load_sales_data(get_data_path()) + +# Multi-dimensional pivot +pivot = df.pivot_table( + index=[ITEM_COLUMN, CUSTOMER_COLUMN], + columns=REGION_COLUMN, + values=REVENUE_COLUMN, + aggfunc='sum', + fill_value=0 +) + +# Or use data_processing helper +from data_processing import create_pivot_table +pivot = create_pivot_table( + df, + index=[ITEM_COLUMN, CUSTOMER_COLUMN], + columns=REGION_COLUMN, + values=REVENUE_COLUMN +) +``` + +### 2. Cohort Analysis with Retention Metrics + +**Pattern:** Track customer cohorts over time with retention and revenue metrics + +```python +from examples.cohort_analysis import create_cohorts, calculate_cohort_metrics + +df_cohort = create_cohorts(df) +cohort_metrics = calculate_cohort_metrics(df_cohort) + +# Calculate Net Revenue Retention (NRR) +nrr = cohort_metrics.groupby('Cohort').agg({ + 'Revenue_Retention': lambda x: x.iloc[-1] if len(x) > 0 else 0 +}) +``` + +### 3. 
Statistical Significance Testing + +**Pattern:** Compare segments with statistical tests + +```python +from statistical_utils import test_statistical_significance + +# Compare two groups +group1 = df[df['Segment'] == 'A'][REVENUE_COLUMN] +group2 = df[df['Segment'] == 'B'][REVENUE_COLUMN] + +result = test_statistical_significance(group1, group2) +if result['significant']: + print(f"Significant difference (p={result['p_value']:.4f})") +``` + +### 4. Price-Volume-Mix (PVM) Decomposition + +**Pattern:** Decompose revenue changes into price, volume, and mix effects + +```python +from config import QUANTITY_COLUMN, REVENUE_COLUMN + +def pvm_decomposition(df_base, df_current): + """Decompose revenue change into price, volume, mix effects""" + base_price = df_base[REVENUE_COLUMN].sum() / df_base[QUANTITY_COLUMN].sum() + current_price = df_current[REVENUE_COLUMN].sum() / df_current[QUANTITY_COLUMN].sum() + + base_volume = df_base[QUANTITY_COLUMN].sum() + current_volume = df_current[QUANTITY_COLUMN].sum() + + # Price effect + price_effect = (current_price - base_price) * base_volume + + # Volume effect + volume_effect = (current_volume - base_volume) * base_price + + # Mix effect (residual) + total_change = df_current[REVENUE_COLUMN].sum() - df_base[REVENUE_COLUMN].sum() + mix_effect = total_change - price_effect - volume_effect + + return { + 'price_effect': price_effect, + 'volume_effect': volume_effect, + 'mix_effect': mix_effect, + 'total_change': total_change + } +``` + +### 5. 
Time Series Forecasting + +**Pattern:** Forecast future revenue with confidence intervals + +```python +from data_processing import prepare_time_series +from statistical_utils import calculate_confidence_interval + +# Prepare time series +ts = prepare_time_series(df, freq='M') + +# Simple forecast (extend trend) +import numpy as np +from scipy import stats +x = np.arange(len(ts)) +slope, intercept, r_value, p_value, std_err = stats.linregress(x, ts.values) + +# Forecast next 12 months +future_x = np.arange(len(ts), len(ts) + 12) +forecast = slope * future_x + intercept + +# Calculate confidence intervals +ci = calculate_confidence_interval(ts, confidence=0.95) +``` + +### 6. Customer Lifetime Value (CLV) Analysis + +**Pattern:** Calculate CLV using historical data + +```python +from config import CUSTOMER_COLUMN, REVENUE_COLUMN, DATE_COLUMN + +def calculate_clv(df, years=3): + """Calculate customer lifetime value""" + customer_metrics = df.groupby(CUSTOMER_COLUMN).agg({ + REVENUE_COLUMN: 'sum', + DATE_COLUMN: ['min', 'max', 'count'] + }).reset_index() + + customer_metrics.columns = [CUSTOMER_COLUMN, 'Total_Revenue', 'First_Purchase', 'Last_Purchase', 'Order_Count'] + + # Calculate customer age (years) + customer_metrics['Customer_Age_Years'] = ( + (customer_metrics['Last_Purchase'] - customer_metrics['First_Purchase']).dt.days / 365.25 + ) + + # Annual revenue + customer_metrics['Annual_Revenue'] = customer_metrics['Total_Revenue'] / customer_metrics['Customer_Age_Years'].replace(0, 1) + + # Projected CLV + customer_metrics['CLV'] = customer_metrics['Annual_Revenue'] * years + + return customer_metrics +``` + +### 7. 
Market Basket Analysis + +**Pattern:** Find product associations and cross-sell opportunities + +```python +from mlxtend.frequent_patterns import apriori, association_rules +from mlxtend.preprocessing import TransactionEncoder + +# Prepare transaction data +transactions = df.groupby(INVOICE_NUMBER_COLUMN)[ITEM_COLUMN].apply(list).tolist() + +# Encode transactions +te = TransactionEncoder() +te_ary = te.fit(transactions).transform(transactions) +df_encoded = pd.DataFrame(te_ary, columns=te.columns_) + +# Find frequent itemsets +frequent_itemsets = apriori(df_encoded, min_support=0.01, use_colnames=True) + +# Generate association rules +rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5) +``` + +### 8. Segmentation with Machine Learning + +**Pattern:** Advanced customer segmentation using clustering + +```python +from sklearn.cluster import KMeans +from sklearn.preprocessing import StandardScaler + +# Prepare features +features = df.groupby(CUSTOMER_COLUMN).agg({ + REVENUE_COLUMN: ['sum', 'mean', 'count'], + DATE_COLUMN: lambda x: (x.max() - x.min()).days +}).reset_index() +features.columns = [CUSTOMER_COLUMN, 'Total_Revenue', 'Avg_Order', 'Order_Count', 'Customer_Tenure'] + +# Scale features +scaler = StandardScaler() +features_scaled = scaler.fit_transform(features[['Total_Revenue', 'Avg_Order', 'Order_Count', 'Customer_Tenure']]) + +# Cluster +kmeans = KMeans(n_clusters=5, random_state=42) +features['Segment'] = kmeans.fit_predict(features_scaled) +``` + +### 9. Anomaly Detection + +**Pattern:** Identify unusual patterns in data + +```python +from statistical_utils import calculate_z_score + +# Calculate z-scores for revenue +mean_revenue = df[REVENUE_COLUMN].mean() +std_revenue = df[REVENUE_COLUMN].std() + +df['Revenue_Z_Score'] = df[REVENUE_COLUMN].apply( + lambda x: calculate_z_score(x, mean_revenue, std_revenue) +) + +# Flag anomalies (|z| > 3) +df['Is_Anomaly'] = df['Revenue_Z_Score'].abs() > 3 +``` + +### 10. 
Competitive Analysis Framework + +**Pattern:** Compare performance across dimensions + +```python +from statistical_utils import calculate_yoy_growth, calculate_cagr + +def competitive_analysis(df, dimension_col): + """Compare performance across dimension (e.g., products, regions)""" + analysis = df.groupby(dimension_col).agg({ + REVENUE_COLUMN: ['sum', 'mean', 'count'] + }).reset_index() + analysis.columns = [dimension_col, 'Total_Revenue', 'Avg_Order', 'Order_Count'] + + # Calculate growth rates + for year in sorted(df['Year'].unique())[1:]: + prev_year = year - 1 + current = df[df['Year'] == year].groupby(dimension_col)[REVENUE_COLUMN].sum() + previous = df[df['Year'] == prev_year].groupby(dimension_col)[REVENUE_COLUMN].sum() + + growth = calculate_yoy_growth(current, previous) + analysis[f'Growth_{year}'] = growth + + return analysis +``` + +## Best Practices for Advanced Analyses + +1. **Always validate data quality first:** + ```python + from data_quality import generate_data_quality_report + report = generate_data_quality_report(df) + ``` + +2. **Use logging for complex analyses:** + ```python + from logger_config import get_logger + logger = get_logger('advanced_analysis') + logger.info("Starting complex analysis...") + ``` + +3. **Export intermediate results:** + ```python + from export_utils import export_to_excel + export_to_excel(intermediate_df, 'intermediate_results.xlsx') + ``` + +4. **Generate comprehensive reports:** + ```python + from report_generator import generate_pdf_report + generate_pdf_report(charts=['chart1.png', 'chart2.png'], summary_data=summary) + ``` + +5. 
**Test statistical significance:** + ```python + from statistical_utils import test_statistical_significance + # Always test before making conclusions + ``` + +## Cursor AI Prompts for Advanced Analyses + +When using Cursor, try these prompts: + +- **"Create a cohort retention analysis with heatmaps"** +- **"Build a price-volume-mix decomposition analysis"** +- **"Generate a customer lifetime value analysis with segmentation"** +- **"Create a forecasting model with confidence intervals"** +- **"Build a multi-dimensional analysis across product, customer, and geography"** +- **"Create an anomaly detection analysis for unusual transactions"** + +The AI will automatically use these patterns and the template utilities. + +--- + +**Last Updated:** January 2026 +**For:** Advanced users and AI-assisted development diff --git a/.cursor/rules/ai_assistant_guide.md b/.cursor/rules/ai_assistant_guide.md new file mode 100644 index 0000000..6b382ff --- /dev/null +++ b/.cursor/rules/ai_assistant_guide.md @@ -0,0 +1,316 @@ +# AI Assistant Guide for Sales Analysis Template + +This guide helps you effectively use Cursor's AI assistant to create sophisticated sales analyses. + +## šŸŽÆ Quick Start with AI + +### Basic Prompt Structure + +When asking the AI to create an analysis, use this structure: + +``` +Create a [ANALYSIS_TYPE] analysis that: +1. [Specific requirement 1] +2. [Specific requirement 2] +3. Uses the sales_analysis_template patterns +4. Includes [specific visualizations/metrics] +``` + +### Example Prompts + +**Simple Analysis:** +``` +Create an annual revenue trend analysis using the template patterns, +with LTM support and proper chart formatting. +``` + +**Advanced Analysis:** +``` +Create a customer cohort retention analysis that: +1. Groups customers by first purchase month +2. Calculates retention rates for 12 periods +3. Shows revenue retention metrics +4. Creates heatmap visualizations +5. 
Uses the template's cohort analysis patterns +``` + +**Multi-Dimensional Analysis:** +``` +Create a product performance analysis across regions that: +1. Analyzes top products by revenue +2. Shows regional distribution +3. Calculates growth rates by region +4. Creates multi-panel visualizations +5. Exports results to Excel +``` + +## šŸ“‹ Template-Aware Prompts + +The AI automatically knows about: +- `data_loader.py` - Always use this for loading data +- `analysis_utils.py` - Use utilities for formatting, LTM, etc. +- `config.py` - Use config values, never hardcode +- Template patterns - Follows best practices automatically + +### What the AI Knows + +When you mention the template, the AI will: +- āœ… Use `load_sales_data()` instead of `pd.read_csv()` +- āœ… Use `setup_revenue_chart()` for charts +- āœ… Divide revenue by 1e6 before plotting +- āœ… Use config values from `config.py` +- āœ… Apply exclusion filters if configured +- āœ… Validate data after loading +- āœ… Use LTM patterns correctly + +## šŸ”§ Common AI Tasks + +### 1. Create New Analysis Script + +**Prompt:** +``` +Create a new analysis script called [name].py that: +- Follows the template structure +- Analyzes [specific metric/dimension] +- Creates [type of visualization] +- Uses template utilities +``` + +**AI will:** +- Copy structure from `analysis_template.py` +- Use proper imports +- Follow template patterns +- Include validation + +### 2. Add Advanced Features + +**Prompt:** +``` +Add statistical significance testing to [analysis].py: +- Compare [group1] vs [group2] +- Show p-values and confidence intervals +- Use statistical_utils functions +``` + +### 3. Fix Common Issues + +**Prompt:** +``` +Fix the chart formatting in [analysis].py - it's showing scientific notation. +``` + +**AI will:** +- Add `data / 1e6` conversion +- Use `setup_revenue_chart()` +- Fix formatting issues + +### 4. 
Enhance Existing Analysis + +**Prompt:** +``` +Enhance [analysis].py to: +- Add export to Excel functionality +- Include data quality checks +- Add logging +- Generate PDF report +``` + +## šŸš€ Advanced AI Prompts + +### Multi-Step Analysis + +``` +Create a comprehensive customer analysis that: +1. Segments customers using RFM +2. Calculates CLV for each segment +3. Identifies at-risk customers +4. Creates cohort retention analysis +5. Generates PDF report with all charts +``` + +### Data Quality First + +``` +Before running the analysis, check data quality: +1. Run data quality report +2. Fix any critical issues +3. Validate configuration +4. Then proceed with analysis +``` + +### Statistical Analysis + +``` +Add statistical analysis to [analysis].py: +- Calculate year-over-year growth with significance testing +- Show confidence intervals for forecasts +- Test differences between segments +- Use statistical_utils functions +``` + +## šŸ’” Pro Tips + +### 1. Reference Existing Examples + +``` +Create an analysis similar to examples/customer_segmentation.py +but for product segmentation instead. +``` + +### 2. Use Template Utilities + +``` +Use the template's export_utils to save results to Excel, +and report_generator to create a PDF report. +``` + +### 3. Leverage Cursor Rules + +The AI automatically reads `.cursor/rules/` files, so you can say: +``` +Follow the advanced_analysis_patterns.md guide to create +a price-volume-mix decomposition analysis. +``` + +### 4. Iterative Development + +``` +Start with a basic version, then enhance it: +1. First version: Simple revenue trend +2. Add: Statistical significance +3. Add: Export functionality +4. Add: PDF report generation +``` + +## šŸŽØ Visualization Prompts + +### Create Specific Chart Types + +``` +Create a heatmap showing [metric] across [dimension1] and [dimension2], +using seaborn and following template chart formatting. 
+``` + +``` +Create an interactive Plotly chart for [analysis], +saving it as HTML using the template's interactive chart functions. +``` + +### Multi-Panel Visualizations + +``` +Create a 2x2 subplot showing: +- Top left: Revenue trend +- Top right: Customer count trend +- Bottom left: Average order value +- Bottom right: Growth rates +All using template chart formatting. +``` + +## šŸ“Š Data Analysis Prompts + +### Cohort Analysis + +``` +Create a cohort analysis that: +1. Groups customers by first purchase month +2. Tracks retention for 12 periods +3. Calculates revenue retention +4. Creates retention heatmap +5. Uses examples/cohort_analysis.py as reference +``` + +### Forecasting + +``` +Create a revenue forecasting analysis: +1. Prepare time series data +2. Fit trend model +3. Forecast next 12 months +4. Show confidence intervals +5. Use statistical_utils for calculations +``` + +### Segmentation + +``` +Create an advanced customer segmentation: +1. Calculate RFM scores +2. Apply clustering algorithm +3. Analyze segment characteristics +4. Create segment visualizations +5. Export segment data to Excel +``` + +## šŸ” Debugging with AI + +### Fix Errors + +``` +I'm getting [error message] in [file].py. +Fix it using template best practices. +``` + +### Optimize Performance + +``` +Optimize [analysis].py for large datasets: +- Use efficient pandas operations +- Add progress indicators +- Consider data sampling if needed +``` + +### Improve Code Quality + +``` +Refactor [analysis].py to: +- Use more template utilities +- Follow template patterns better +- Add proper error handling +- Include logging +``` + +## šŸ“ Documentation Prompts + +### Add Documentation + +``` +Add comprehensive docstrings to [analysis].py following +the template's documentation style. 
+``` + +### Create README + +``` +Create a README for [analysis].py explaining: +- What it does +- How to run it +- What outputs it generates +- Dependencies required +``` + +## šŸŽÆ Best Practices for AI Interaction + +1. **Be Specific:** Mention template files and utilities by name +2. **Reference Examples:** Point to existing examples when relevant +3. **Iterate:** Start simple, then add complexity +4. **Use Template Terms:** Mention "LTM", "config values", "template patterns" +5. **Ask for Validation:** Request data quality checks and validation + +## Example Full Workflow + +``` +1. "Check my configuration using config_validator.py" +2. "Run data quality report on my data" +3. "Create a revenue trend analysis using template patterns" +4. "Add statistical significance testing to the analysis" +5. "Export results to Excel and generate PDF report" +6. "Create a cohort analysis similar to the example" +``` + +The AI will guide you through each step using template best practices. + +--- + +**Last Updated:** January 2026 +**For:** Cursor AI users working with sales_analysis_template diff --git a/.cursor/rules/analysis_patterns.md b/.cursor/rules/analysis_patterns.md new file mode 100644 index 0000000..1d1cd67 --- /dev/null +++ b/.cursor/rules/analysis_patterns.md @@ -0,0 +1,161 @@ +# Common Analysis Patterns + +## ⭐ RECOMMENDED: Use Utilities + +**Always prefer `analysis_utils.py` and `config.py` over manual implementations:** +- Consistent formatting +- Fewer errors +- Easier maintenance +- Standardized output + +## Standard Script Structure (Using Utilities) + +**RECOMMENDED:** Use `analysis_utils.py` and `config.py` for consistency: + +```python +# 1. 
IMPORTS +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt +from data_loader import load_sales_data, validate_data_structure +from validate_revenue import validate_revenue +from analysis_utils import ( + get_ltm_period_config, get_annual_data, calculate_annual_metrics, + get_millions_formatter, setup_revenue_chart, save_chart, + format_currency, print_annual_summary, sort_mixed_years, + apply_exclusion_filters +) +from config import ( + DATA_FILE, OUTPUT_DIR, CHART_SIZES, ensure_directories, + get_data_path, REVENUE_COLUMN, COMPANY_NAME +) + +# 2. LOAD DATA (ALWAYS use data_loader) +df = load_sales_data(get_data_path()) + +# 3. VALIDATE DATA STRUCTURE +is_valid, msg = validate_data_structure(df) +if not is_valid: + print(f"ERROR: {msg}") + raise SystemExit(1) + +# 4. APPLY EXCLUSION FILTERS (if configured) +df = apply_exclusion_filters(df) + +# 5. SETUP LTM (if doing annual comparisons and LTM is enabled) +ltm_start, ltm_end = get_ltm_period_config() + +# 6. DATA PREPARATION +# Convert columns, filter data, create derived columns + +# 7. ANALYSIS LOGIC +# Use calculate_annual_metrics() for annual aggregations + +# 8. VISUALIZATIONS +# Use setup_revenue_chart() and save_chart() from analysis_utils + +# 9. VALIDATION +validate_revenue(df, "Analysis Name") +``` + +## Annual Aggregation Pattern + +**RECOMMENDED:** Use `calculate_annual_metrics()` from `analysis_utils.py`: + +```python +from analysis_utils import calculate_annual_metrics, get_ltm_period_config +from config import REVENUE_COLUMN + +ltm_start, ltm_end = get_ltm_period_config() + +def calculate_metrics(year_data): + """Calculate metrics for a single year""" + return { + 'Revenue': year_data[REVENUE_COLUMN].sum(), + # ... 
other metrics + } + +annual_df = calculate_annual_metrics(df, calculate_metrics, ltm_start, ltm_end) +``` + +## Chart Formatting Pattern + +**ALWAYS use this pattern for revenue charts:** + +```python +from analysis_utils import setup_revenue_chart, save_chart +from config import CHART_SIZES + +fig, ax = plt.subplots(figsize=CHART_SIZES['medium']) + +# Divide data by 1e6 BEFORE plotting +ax.plot(data / 1e6, ...) +# OR +ax.bar(x, values / 1e6, ...) + +# Apply formatter automatically +setup_revenue_chart(ax) + +# Save chart +save_chart(fig, 'chart_name.png') +plt.close() +``` + +## Mixed Type Handling + +When dealing with year columns that may contain mixed int/str types (e.g., "2025 (LTM 9/2025)"): + +```python +from analysis_utils import sort_mixed_years + +# Sort DataFrame by year +df_sorted = sort_mixed_years(df, year_col='Year') + +# For chart labels +years = df_sorted['Year'].tolist() +x_pos = range(len(years)) +ax.set_xticks(x_pos) +ax.set_xticklabels(years, rotation=45, ha='right') +``` + +## Price Calculation Pattern + +```python +from analysis_utils import calculate_price_per_unit +from config import QUANTITY_COLUMN, REVENUE_COLUMN + +# Calculate average price per unit (excludes outliers automatically) +price_per_unit = calculate_price_per_unit(df, QUANTITY_COLUMN, REVENUE_COLUMN) +``` + +## Exclusion Filters Pattern + +If you need to exclude specific segments (e.g., test accounts, business units): + +```python +from analysis_utils import apply_exclusion_filters + +# Configure in config.py: +# EXCLUSION_FILTERS = { +# 'enabled': True, +# 'exclude_by_column': 'Country', +# 'exclude_values': ['KVT', 'Test'] +# } + +df = apply_exclusion_filters(df) +``` + +## Using Configuration Values + +**ALWAYS use config values instead of hardcoding:** + +```python +from config import ( + REVENUE_COLUMN, # Use this instead of 'USD' or 'Amount' + CUSTOMER_COLUMN, # Use this instead of 'Customer' + DATE_COLUMN, # Use this instead of 'InvoiceDate' + COMPANY_NAME, # Use this 
for titles + ANALYSIS_YEARS, # Use this for year filtering + CHART_SIZES, # Use this for figure sizes +) +``` diff --git a/.cursor/rules/chart_formatting.md b/.cursor/rules/chart_formatting.md new file mode 100644 index 0000000..a6c4905 --- /dev/null +++ b/.cursor/rules/chart_formatting.md @@ -0,0 +1,111 @@ +# Chart Formatting Rules + +## ⭐ RECOMMENDED: Use analysis_utils.py + +**Prefer utility functions:** +```python +from analysis_utils import setup_revenue_chart, save_chart, get_millions_formatter +from config import CHART_SIZES, OUTPUT_DIR + +fig, ax = plt.subplots(figsize=CHART_SIZES['medium']) +ax.plot(data / 1e6, ...) +setup_revenue_chart(ax) # Applies formatter automatically +save_chart(fig, 'chart.png') # Saves to charts/ directory +``` + +## Revenue Charts: Millions Formatter + +**ALWAYS use this pattern for revenue charts:** + +```python +from analysis_utils import setup_revenue_chart + +# Divide data by 1e6 BEFORE plotting +ax.plot(data / 1e6, ...) +# OR +ax.bar(x, values / 1e6, ...) + +# Apply formatter automatically +setup_revenue_chart(ax) +``` + +**Manual approach (if not using utilities):** +```python +from matplotlib.ticker import FuncFormatter + +def millions_formatter(x, pos): + return f'${x:.1f}m' + +ax.plot(data / 1e6, ...) +ax.yaxis.set_major_formatter(FuncFormatter(millions_formatter)) +ax.set_ylabel('Revenue (Millions USD)') +``` + +## Thousands Formatter (for smaller values) + +```python +from analysis_utils import get_thousands_formatter + +ax.xaxis.set_major_formatter(get_thousands_formatter()) +ax.barh(x, values / 1e3, ...) 
+ax.set_xlabel('Value (Thousands USD)') +``` + +## Chart Labeling with LTM + +**If LTM is enabled, ALWAYS include LTM notation:** + +```python +from config import get_ltm_label, COMPANY_NAME + +title = f'Annual Revenue Trend - {COMPANY_NAME}' +ltm_label = get_ltm_label() +if ltm_label: + title += f'\n({ltm_label})' +ax.set_title(title) +``` + +## Chart Sizes + +**Use predefined sizes from config:** +```python +from config import CHART_SIZES + +fig, ax = plt.subplots(figsize=CHART_SIZES['medium']) # (10, 6) +# Options: 'small' (6, 4), 'medium' (10, 6), 'large' (12, 8), 'wide' (14, 6) +``` + +## Common Mistakes + +āŒ **WRONG:** +```python +ax.plot(revenue, ...) # Shows scientific notation (1e8) +``` + +āœ… **CORRECT:** +```python +ax.plot(revenue / 1e6, ...) # Divide first +setup_revenue_chart(ax) # Then format +``` + +## Saving Charts + +**ALWAYS use save_chart() utility:** +```python +from analysis_utils import save_chart + +save_chart(fig, 'chart_name.png') # Saves to charts/ with proper settings +plt.close() # Don't forget to close! +``` + +## Chart Styling + +**Configure style in config.py:** +```python +# In config.py: +CHART_STYLE = 'seaborn-v0_8' # Options: 'default', 'ggplot', 'seaborn-v0_8' + +# In your script: +import matplotlib.pyplot as plt +plt.style.use(CHART_STYLE) # Apply before creating figures +``` diff --git a/.cursor/rules/code_quality.md b/.cursor/rules/code_quality.md new file mode 100644 index 0000000..b37f428 --- /dev/null +++ b/.cursor/rules/code_quality.md @@ -0,0 +1,389 @@ +# Code Quality & Best Practices + +**Comprehensive guide for writing Cursor-optimized code in the sales analysis template.** + +This document combines code quality standards and Cursor best practices to ensure AI assistants can effectively understand, modify, and extend the codebase. 
+ +## Type Hints + +### When to Use Type Hints + +Use type hints for: +- Function parameters +- Return values +- Class attributes +- Complex data structures + +### Example Pattern + +```python +from typing import Dict, List, Optional, Tuple +import pandas as pd + +def calculate_annual_metrics( + df: pd.DataFrame, + metrics_func: callable, + ltm_start: Optional[pd.Period] = None, + ltm_end: Optional[pd.Period] = None +) -> pd.DataFrame: + """ + Calculate annual metrics for all years + + Args: + df: DataFrame with 'Year' and 'YearMonth' columns + metrics_func: Function that takes a DataFrame and returns a dict of metrics + ltm_start: LTM start period (defaults to config if None) + ltm_end: LTM end period (defaults to config if None) + + Returns: + DataFrame with 'Year' index and metric columns + """ + # Implementation +``` + +## Docstrings + +### Docstring Format + +All functions should use Google-style docstrings: + +```python +def function_name(param1: type, param2: type) -> return_type: + """ + Brief description of what the function does. + + More detailed explanation if needed. Can span multiple lines. + Explain any complex logic or important considerations. 
+ + Args: + param1: Description of param1 + param2: Description of param2 + + Returns: + Description of return value + + Raises: + ValueError: When and why this exception is raised + + Example: + >>> result = function_name(value1, value2) + >>> print(result) + expected_output + """ +``` + +### Required Elements + +- Brief one-line summary +- Detailed description (if needed) +- Args section (all parameters) +- Returns section (return value) +- Raises section (if exceptions raised) +- Example section (for complex functions) + +## Variable Naming + +### Conventions + +- **Descriptive names:** `customer_revenue` not `cr` +- **Consistent prefixes:** `df_` for DataFrames, `annual_` for annual metrics +- **Clear abbreviations:** `ltm` for Last Twelve Months (well-known) +- **Avoid single letters:** Except for loop variables (`i`, `j`, `k`) + +### Good Examples + +```python +# Good +customer_revenue_by_year = df.groupby(['Customer', 'Year'])[REVENUE_COLUMN].sum() +annual_metrics_df = calculate_annual_metrics(df, metrics_func) +ltm_start_period, ltm_end_period = get_ltm_period_config() + +# Bad +cr = df.groupby(['C', 'Y'])['R'].sum() +am = calc(df, mf) +s, e = get_ltm() +``` + +## Error Messages + +### Structure + +Error messages should be: +1. **Specific:** What exactly went wrong +2. **Actionable:** How to fix it +3. **Contextual:** Where it occurred +4. **Helpful:** Reference to documentation + +### Good Error Messages + +```python +# Good +raise ValueError( + f"Required column '{REVENUE_COLUMN}' not found in data.\n" + f"Available columns: {list(df.columns)}\n" + f"Please update config.py REVENUE_COLUMN to match your data.\n" + f"See .cursor/rules/data_loading.md for more help." 
+) + +# Bad +raise ValueError("Column not found") +``` + +## Code Comments + +### When to Comment + +- Complex logic that isn't immediately obvious +- Business rules or domain-specific knowledge +- Workarounds or non-obvious solutions +- Performance considerations +- TODO items with context + +### Comment Style + +```python +# Good: Explains WHY, not WHAT +# Use LTM for most recent year to enable apples-to-apples comparison +# with full calendar years (avoids partial year bias) +if year == LTM_END_YEAR and LTM_ENABLED: + year_data = get_ltm_data(df, ltm_start, ltm_end) + +# Bad: States the obvious +# Check if year equals LTM_END_YEAR +if year == LTM_END_YEAR: +``` + +## Function Design + +### Single Responsibility + +Each function should do one thing well: + +```python +# Good: Single responsibility +def calculate_revenue(df: pd.DataFrame) -> float: + """Calculate total revenue from DataFrame""" + return df[REVENUE_COLUMN].sum() + +def calculate_customer_count(df: pd.DataFrame) -> int: + """Calculate unique customer count""" + return df[CUSTOMER_COLUMN].nunique() + +# Bad: Multiple responsibilities +def calculate_metrics(df): + """Calculate revenue and customer count""" + revenue = df[REVENUE_COLUMN].sum() + customers = df[CUSTOMER_COLUMN].nunique() + return revenue, customers +``` + +### Function Length + +- Keep functions under 50 lines when possible +- Break complex functions into smaller helper functions +- Use descriptive function names that explain purpose + +## Import Organization + +### Standard Order + +1. Standard library imports +2. Third-party imports (pandas, numpy, matplotlib) +3. 
Local/template imports (data_loader, analysis_utils, config) + +### Example + +```python +# Standard library +from pathlib import Path +from typing import Dict, Optional +from datetime import datetime + +# Third-party +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt + +# Template imports +from data_loader import load_sales_data, validate_data_structure +from analysis_utils import calculate_annual_metrics, setup_revenue_chart +from config import REVENUE_COLUMN, CHART_SIZES, COMPANY_NAME +``` + +## Constants and Configuration + +### Use Config Values + +```python +# Good: From config +from config import REVENUE_COLUMN, DATE_COLUMN +revenue = df[REVENUE_COLUMN].sum() + +# Bad: Hardcoded +revenue = df['USD'].sum() +``` + +### Magic Numbers + +Avoid magic numbers - use named constants or config: + +```python +# Good: Named constant +MILLIONS_DIVISOR = 1e6 +revenue_millions = revenue / MILLIONS_DIVISOR + +# Or from config +CHART_DPI = 300 # In config.py + +# Bad: Magic number +revenue_millions = revenue / 1000000 +``` + +## Testing Considerations + +### Testable Code + +Write code that's easy to test: +- Pure functions when possible (no side effects) +- Dependency injection for external dependencies +- Clear inputs and outputs + +### Example + +```python +# Good: Testable +def calculate_metrics(year_data: pd.DataFrame, revenue_col: str) -> Dict: + """Calculate metrics - easy to test with sample data""" + return { + 'Revenue': year_data[revenue_col].sum(), + 'Count': len(year_data) + } + +# Harder to test: Depends on global config +def calculate_metrics(year_data): + """Uses global REVENUE_COLUMN - harder to test""" + return {'Revenue': year_data[REVENUE_COLUMN].sum()} +``` + +## AI-Friendly Patterns + +### Clear Intent + +Code should clearly express intent: + +```python +# Good: Intent is clear +customers_with_revenue = df[df[REVENUE_COLUMN] > 0][CUSTOMER_COLUMN].unique() + +# Less clear: Requires understanding of pandas +customers_with_revenue = 
df.loc[df[REVENUE_COLUMN] > 0, CUSTOMER_COLUMN].unique() +``` + +### Explicit Over Implicit + +```python +# Good: Explicit +if LTM_ENABLED and ltm_start is not None and ltm_end is not None: + use_ltm = True +else: + use_ltm = False + +# Less clear: Implicit truthiness +use_ltm = LTM_ENABLED and ltm_start and ltm_end +``` + +## Documentation for AI + +### Help AI Understand Context + +Add comments that help AI understand business context: + +```python +# LTM (Last Twelve Months) is used for the most recent partial year +# to enable fair comparison with full calendar years. +# Example: If latest data is through Sep 2025, use Oct 2024 - Sep 2025 +if year == LTM_END_YEAR and LTM_ENABLED: + # Use 12-month rolling period instead of partial calendar year + year_data = get_ltm_data(df, ltm_start, ltm_end) +``` + +## Cursor-Specific Optimizations + +### AI-Friendly Code Structure + +Code should be structured so Cursor AI can: +1. **Understand intent** - Clear function names and comments +2. **Generate code** - Follow established patterns +3. **Fix errors** - Actionable error messages +4. **Extend functionality** - Modular, reusable functions + +### Example: AI-Generated Code Pattern + +When AI generates code, it should automatically: +```python +# AI recognizes this pattern and replicates it +def main(): + # 1. Load data (AI knows to use data_loader) + df = load_sales_data(get_data_path()) + + # 2. Validate (AI knows to check structure) + is_valid, msg = validate_data_structure(df) + if not is_valid: + print(f"ERROR: {msg}") + return + + # 3. Apply filters (AI knows exclusion filters) + df = apply_exclusion_filters(df) + + # 4. Analysis logic (AI follows template patterns) + # ... + + # 5. Create charts (AI knows formatting rules) + # ... + + # 6. 
Validate revenue (AI knows to validate) + validate_revenue(df, ANALYSIS_NAME) +``` + +### Help AI Generate Better Code + +Add context comments that help AI: +```python +# LTM (Last Twelve Months) is used for the most recent partial year +# to enable fair comparison with full calendar years. +# Example: If latest data is through Sep 2025, use Oct 2024 - Sep 2025 +# This avoids partial-year bias in year-over-year comparisons. +if year == LTM_END_YEAR and LTM_ENABLED: + # Use 12-month rolling period instead of partial calendar year + year_data = get_ltm_data(df, ltm_start, ltm_end) + year_label = get_ltm_label() # Returns "2025 (LTM 9/2025)" +``` + +## Summary Checklist + +For Cursor-optimized code: +- āœ… Comprehensive docstrings with examples +- āœ… Type hints on functions +- āœ… Descriptive variable names +- āœ… Clear comments for business logic +- āœ… Structured error messages +- āœ… Consistent code patterns +- āœ… Use config values (never hardcode) +- āœ… Follow template utilities +- āœ… Include validation steps +- āœ… Reference documentation + +## Summary + +Follow these standards to ensure: +1. AI can understand code structure +2. AI can modify code safely +3. AI can generate new code following patterns +4. Code is maintainable and readable +5. Errors are clear and actionable +6. Cursor AI can assist effectively + +--- + +**Last Updated:** January 2026 +**For:** Cursor AI optimization and human developers diff --git a/.cursor/rules/common_errors.md b/.cursor/rules/common_errors.md new file mode 100644 index 0000000..effe0d7 --- /dev/null +++ b/.cursor/rules/common_errors.md @@ -0,0 +1,109 @@ +# Common Errors and Troubleshooting + +**Quick reference for fixing common issues. For error handling patterns when writing code, see `error_handling.md`.** + +## Data Loading Errors + +### Error: "Data file not found" +**Cause:** DATA_FILE path in config.py is incorrect +**Fix:** +1. Check that your CSV file exists +2. 
Update `DATA_FILE` in config.py with correct filename +3. If file is in a subdirectory, set `DATA_DIR` in config.py + +### Error: "Required column 'USD' not found" +**Cause:** Column name in data doesn't match config +**Fix:** +1. Check your CSV column names +2. Update `REVENUE_COLUMN` in config.py to match your data +3. Update other column mappings (DATE_COLUMN, CUSTOMER_COLUMN, etc.) + +### Error: "All InvoiceDate values are NaN" +**Cause:** Date column parsing failed +**Fix:** +1. Check date format in your CSV +2. Add fallback date columns to `DATE_FALLBACK_COLUMNS` in config.py +3. Ensure at least one date column exists (Month, Year, etc.) + +## Analysis Errors + +### Error: "DataFrame is empty" after filtering +**Cause:** Date range or year filters too restrictive +**Fix:** +1. Check `MIN_YEAR` and `MAX_DATE` in config.py +2. Check `ANALYSIS_YEARS` includes years in your data +3. Verify date parsing worked (check data_loader output) + +### Error: Charts show scientific notation (1e8) +**Cause:** Forgot to divide by 1e6 before plotting +**Fix:** +```python +# WRONG: +ax.plot(revenue, ...) + +# CORRECT: +ax.plot(revenue / 1e6, ...) +setup_revenue_chart(ax) +``` + +### Error: "Year column has mixed types" +**Cause:** LTM year is string "2025 (LTM 9/2025)" while others are int +**Fix:** +```python +from analysis_utils import sort_mixed_years +df_sorted = sort_mixed_years(df, year_col='Year') +``` + +## Configuration Errors + +### Error: LTM not working correctly +**Cause:** LTM configuration incorrect +**Fix:** +1. Check `LTM_ENABLED = True` in config.py +2. Verify `LTM_START_MONTH`, `LTM_START_YEAR`, `LTM_END_MONTH`, `LTM_END_YEAR` +3. Ensure dates are within your data range + +### Error: Exclusion filters not working +**Cause:** Filter configuration incorrect +**Fix:** +1. Check `EXCLUSION_FILTERS['enabled'] = True` +2. Verify `exclude_by_column` matches a column in your data +3. 
Check `exclude_values` list is correct + +## Import Errors + +### Error: "No module named 'config'" +**Cause:** Running script from wrong directory +**Fix:** +1. Run scripts from template root directory +2. Or add template directory to Python path + +### Error: "No module named 'data_loader'" +**Cause:** Missing import or wrong directory +**Fix:** +1. Ensure all template files are in the same directory +2. Check import statements match file names + +## Best Practices to Avoid Errors + +1. **Always use utilities:** Use `analysis_utils.py` functions instead of manual code +2. **Validate data:** Run `validate_data_structure()` after loading +3. **Check config:** Verify all column names match your data (use `config_validator.py`) +4. **Test incrementally:** Test data loading before running full analysis +5. **Read error messages:** They usually tell you exactly what's wrong +6. **Use Cursor AI:** Ask AI to fix errors - it knows template patterns + +## Using Cursor AI to Fix Errors + +When you encounter an error, ask Cursor AI: +``` +"Fix this error: [paste error message]" +``` + +The AI will: +- āœ… Understand the error context +- āœ… Reference template patterns +- āœ… Suggest specific fixes +- āœ… Use template utilities correctly + +**See also:** `.cursor/rules/error_handling.md` for how to write error messages that help AI fix issues. diff --git a/.cursor/rules/data_loading.md b/.cursor/rules/data_loading.md new file mode 100644 index 0000000..b6acc5a --- /dev/null +++ b/.cursor/rules/data_loading.md @@ -0,0 +1,69 @@ +# Data Loading Rules + +## CRITICAL: Always Use data_loader.py + +**NEVER load data directly with `pd.read_csv()`. Always use:** + +```python +from data_loader import load_sales_data +from config import get_data_path +df = load_sales_data(get_data_path()) +``` + +## Why This Matters + +The `data_loader.py` implements intelligent fallback logic to ensure 100% date coverage: + +1. **Primary:** Parse primary date column (from config.DATE_COLUMN) +2. 
**Fallback 1:** Use fallback date columns if primary is missing (from config.DATE_FALLBACK_COLUMNS) +3. **Fallback 2:** Use Year column if both missing +4. **Result:** Maximum date coverage possible + +## What data_loader.py Provides + +- **Date Column:** Properly parsed datetime with fallback logic +- **Year:** Extracted year (100% coverage via fallback) +- **YearMonth:** Period format for monthly aggregations +- **Revenue Column:** Converted to numeric (from config.REVENUE_COLUMN) + +## Column Configuration + +Before using, configure column names in `config.py`: +- `REVENUE_COLUMN`: Your revenue/amount column name +- `DATE_COLUMN`: Primary date column name +- `DATE_FALLBACK_COLUMNS`: List of fallback date columns +- `CUSTOMER_COLUMN`: Customer/account column name +- Other columns as needed + +## Common Mistakes + +❌ **WRONG:** +```python +df = pd.read_csv('sales_data.csv') +df['Date'] = pd.to_datetime(df['Date'], errors='coerce') +df = df.dropna(subset=['Date']) # May drop significant data! +``` + +✅ **CORRECT:** +```python +from data_loader import load_sales_data +from config import get_data_path +df = load_sales_data(get_data_path()) # Uses fallback logic +``` + +## Data File Location + +The data file path is configured in `config.py`: +- `DATA_FILE`: Filename (e.g., 'sales_data.csv') +- `DATA_DIR`: Optional subdirectory (defaults to current directory) +- Use `get_data_path()` to get the full path + +## Validation + +After loading, validate data structure: +```python +from data_loader import validate_data_structure +is_valid, msg = validate_data_structure(df) +if not is_valid: + print(f"ERROR: {msg}") +``` diff --git a/.cursor/rules/error_handling.md b/.cursor/rules/error_handling.md new file mode 100644 index 0000000..38d3545 --- /dev/null +++ b/.cursor/rules/error_handling.md @@ -0,0 +1,276 @@ +# Error Handling Best Practices + +This guide defines how to handle errors in a way that's helpful for both users and AI assistants. 
+ +## Error Message Structure + +### Required Elements + +Every error message should include: +1. **What went wrong** - Specific error description +2. **Where it occurred** - File/function context +3. **Why it happened** - Root cause explanation +4. **How to fix** - Actionable steps +5. **Reference** - Link to relevant documentation + +### Template + +```python +raise ErrorType( + f"[What] - [Specific description]\n" + f"\n" + f"Context: [Where/When this occurred]\n" + f"Reason: [Why this happened]\n" + f"\n" + f"Solution:\n" + f"1. [Step 1]\n" + f"2. [Step 2]\n" + f"\n" + f"For more help, see: [Documentation reference]" +) +``` + +## Common Error Patterns + +### Data Loading Errors + +```python +# Good: Comprehensive error message +if REVENUE_COLUMN not in df.columns: + available_cols = list(df.columns)[:10] # Show first 10 + raise ValueError( + f"Required column '{REVENUE_COLUMN}' not found in data.\n" + f"\n" + f"Context: Loading data from {filepath}\n" + f"Available columns: {available_cols}\n" + f"\n" + f"Solution:\n" + f"1. Check your CSV file column names\n" + f"2. Update REVENUE_COLUMN in config.py to match your data\n" + f"3. Run: python config_validator.py to validate configuration\n" + f"\n" + f"For more help, see: .cursor/rules/data_loading.md" + ) + +# Bad: Vague error +if REVENUE_COLUMN not in df.columns: + raise ValueError("Column not found") +``` + +### Configuration Errors + +```python +# Good: Actionable error +if LTM_ENABLED and (LTM_START is None or LTM_END is None): + raise ValueError( + f"LTM configuration error: LTM_ENABLED is True but LTM period is not set.\n" + f"\n" + f"Context: Configuration in config.py\n" + f"Current values: LTM_ENABLED={LTM_ENABLED}, LTM_START={LTM_START}, LTM_END={LTM_END}\n" + f"\n" + f"Solution:\n" + f"1. Set LTM_START_MONTH, LTM_START_YEAR, LTM_END_MONTH, LTM_END_YEAR in config.py\n" + f"2. Or set LTM_ENABLED = False if you don't need LTM\n" + f"3. 
Run: python config_validator.py to check configuration\n" + f"\n" + f"For more help, see: .cursor/rules/ltm_methodology.md" + ) +``` + +### Data Quality Errors + +```python +# Good: Helpful data quality error +if date_coverage < 0.5: # Less than 50% coverage + raise ValueError( + f"Data quality issue: Only {date_coverage:.1%} of rows have valid dates.\n" + f"\n" + f"Context: Date parsing in data_loader.py\n" + f"Rows with dates: {date_count:,} / {total_rows:,}\n" + f"\n" + f"Solution:\n" + f"1. Check date format in your CSV file\n" + f"2. Add fallback date columns to DATE_FALLBACK_COLUMNS in config.py\n" + f"3. Ensure at least one date column (Month, Year) exists\n" + f"4. Run: python data_quality.py to analyze data quality\n" + f"\n" + f"For more help, see: .cursor/rules/data_loading.md" + ) +``` + +## Error Handling Patterns + +### Try-Except with Context + +```python +# Good: Provides context and recovery options +try: + df = load_sales_data(get_data_path()) +except FileNotFoundError as e: + error_msg = ( + f"Data file not found: {e}\n" + f"\n" + f"Context: Attempting to load data for analysis\n" + f"Expected file: {get_data_path()}\n" + f"\n" + f"Solution:\n" + f"1. Check that your CSV file exists at the expected location\n" + f"2. Update DATA_FILE in config.py with correct filename\n" + f"3. Or update DATA_DIR if file is in a subdirectory\n" + f"4. 
Run: python setup_wizard.py to reconfigure\n" + f"\n" + f"For more help, see: .cursor/rules/common_errors.md" + ) + raise FileNotFoundError(error_msg) from e +``` + +### Validation with Helpful Messages + +```python +# Good: Validates and provides specific guidance +def validate_data_structure(df: pd.DataFrame) -> Tuple[bool, str]: + """ + Validate DataFrame has required structure + + Returns: + Tuple[bool, str]: (is_valid, error_message) + If is_valid is False, error_message contains actionable guidance + """ + errors = [] + + if REVENUE_COLUMN not in df.columns: + errors.append( + f"Missing required column '{REVENUE_COLUMN}'. " + f"Update REVENUE_COLUMN in config.py to match your data." + ) + + if DATE_COLUMN not in df.columns: + errors.append( + f"Missing required column '{DATE_COLUMN}'. " + f"Update DATE_COLUMN in config.py or add fallback columns." + ) + + if len(df) == 0: + errors.append( + f"DataFrame is empty. Check date filters (MIN_YEAR, MAX_DATE) in config.py." + ) + + if errors: + error_msg = "Data validation failed:\n" + "\n".join(f" - {e}" for e in errors) + error_msg += "\n\nRun: python config_validator.py for detailed validation" + return False, error_msg + + return True, "OK" +``` + +## Warning Messages + +### When to Use Warnings + +Use warnings (not errors) for: +- Non-critical data quality issues +- Optional features that aren't configured +- Deprecated functionality +- Performance considerations + +### Warning Format + +```python +import warnings + +# Good: Informative warning +if date_coverage < 0.9: # Less than 90% but not critical + warnings.warn( + f"Date coverage is {date_coverage:.1%} ({missing_count:,} rows missing dates).\n" + f"Consider adding fallback date columns to improve coverage.\n" + f"See .cursor/rules/data_loading.md for details.", + UserWarning + ) +``` + +## Logging Errors + +### Use Structured Logging + +```python +from logger_config import get_logger + +logger = get_logger('analysis_name') + +try: + df = 
load_sales_data(get_data_path()) +except Exception as e: + logger.error( + f"Failed to load data: {e}", + exc_info=True, # Include stack trace + extra={ + 'file_path': str(get_data_path()), + 'config_file': 'config.py', + 'suggestion': 'Run config_validator.py to check configuration' + } + ) + raise +``` + +## AI-Friendly Error Messages + +### Help AI Understand and Fix + +Error messages should help AI assistants: +1. Understand what went wrong +2. Know where to look for fixes +3. Suggest specific solutions +4. Reference relevant documentation + +```python +# Good: AI can parse and act on this +if column not in df.columns: + raise ValueError( + f"Column '{column}' not found.\n" + f"Available: {list(df.columns)}\n" + f"Fix: Update {column}_COLUMN in config.py\n" + f"See: .cursor/rules/data_loading.md" + ) + +# Bad: AI has no context +if column not in df.columns: + raise ValueError("Not found") +``` + +## Error Recovery + +### Provide Recovery Options + +```python +# Good: Offers recovery path +def load_sales_data(filepath=None): + try: + df = pd.read_csv(filepath) + except FileNotFoundError: + # Suggest alternatives + suggestions = [ + f"1. Check file path: {filepath}", + f"2. Update DATA_FILE in config.py", + f"3. Run: python setup_wizard.py", + f"4. 
Generate sample data: python generate_sample_data.py" + ] + raise FileNotFoundError( + f"Data file not found: {filepath}\n" + f"\n" + f"Options:\n" + "\n".join(suggestions) + ) +``` + +## Summary + +Good error handling: +- ✅ Specific and actionable +- ✅ Provides context +- ✅ Suggests solutions +- ✅ References documentation +- ✅ Helps both users and AI assistants + +--- + +**Last Updated:** January 2026 +**For:** Error handling in sales_analysis_template diff --git a/.cursor/rules/ltm_methodology.md b/.cursor/rules/ltm_methodology.md new file mode 100644 index 0000000..51e4d0e --- /dev/null +++ b/.cursor/rules/ltm_methodology.md @@ -0,0 +1,89 @@ +# LTM (Last Twelve Months) Methodology Rules + +## ⭐ RECOMMENDED: Use analysis_utils.py + +**Prefer utility functions:** +```python +from analysis_utils import get_ltm_period_config, get_annual_data, calculate_annual_metrics +from config import get_ltm_period, get_ltm_label + +ltm_start, ltm_end = get_ltm_period_config() +year_data, year_label = get_annual_data(df, 2025, ltm_start, ltm_end) +``` + +## What is LTM? 
 + +**LTM (Last Twelve Months)** = Rolling 12-month period for the most recent partial year +- **Purpose:** Apples-to-apples comparison with full calendar years +- **Example:** If latest data is through September 2025, use Oct 2024 - Sep 2025 (12 months) + +## When to Use LTM + +- **Full calendar years (2021-2024):** Use complete year data +- **Most recent partial year (2025):** Use LTM if you only have partial year data +- **Complete years only:** Disable LTM in config if all years are complete + +## Configuration + +**Configure in config.py:** +```python +LTM_ENABLED = True # Set to False if all years are complete +LTM_START_MONTH = 10 # Month number (1-12) +LTM_START_YEAR = 2024 +LTM_END_MONTH = 9 +LTM_END_YEAR = 2025 +``` + +## Implementation Pattern + +```python +from analysis_utils import get_ltm_period_config, get_annual_data + +ltm_start, ltm_end = get_ltm_period_config() + +for year in sorted(df['Year'].unique()): + year_data, year_label = get_annual_data(df, year, ltm_start, ltm_end) + # year_label will be "2025 (LTM 9/2025)" for LTM year, or "2025" for regular year +``` + +## Labeling Requirements + +**ALWAYS label LTM year with notation in:** +- Chart titles +- Chart x-axis labels +- Table headers +- Print statements +- Report text + +**Example:** +```python +from config import get_ltm_label + +ltm_label = get_ltm_label() # Returns "2025 (LTM 9/2025)" or None +if ltm_label: + title = f'Annual Revenue Trend\n({ltm_label})' +``` + +## Common Mistakes + +❌ **WRONG:** +```python +year_2025_data = df[df['Year'] == 2025] # Uses partial year (not comparable) +``` + +✅ **CORRECT:** +```python +from analysis_utils import get_annual_data +ltm_start, ltm_end = get_ltm_period_config() +year_2025_data, year_label = get_annual_data(df, 2025, ltm_start, ltm_end) +``` + +## Disabling LTM + +If all years in your analysis are complete calendar years: +```python +# In config.py: +LTM_ENABLED = False +``` + +Then all years will be treated as full calendar years. 
diff --git a/EXAMPLES.md b/EXAMPLES.md new file mode 100644 index 0000000..2d5ed1f --- /dev/null +++ b/EXAMPLES.md @@ -0,0 +1,203 @@ +# Example Analysis Scripts + +This directory contains working example analysis scripts that demonstrate how to use the sales analysis template framework. + +## Available Examples + +### 1. Annual Revenue Trend (`examples/annual_revenue_trend.py`) + +**Purpose:** Simple annual revenue analysis with LTM support + +**What it demonstrates:** +- Loading data using `data_loader` +- Calculating annual metrics with LTM +- Creating a revenue trend chart +- Following template best practices + +**Usage:** +```bash +python examples/annual_revenue_trend.py +``` + +**Output:** +- Chart: `charts/annual_revenue_trend.png` +- Console output with annual revenue summary + +--- + +### 2. Customer Segmentation (`examples/customer_segmentation.py`) + +**Purpose:** Customer segmentation using RFM (Recency, Frequency, Monetary) methodology + +**What it demonstrates:** +- Customer-level aggregation +- RFM scoring and segmentation +- Segment analysis and visualization +- Multiple chart generation + +**Usage:** +```bash +python examples/customer_segmentation.py +``` + +**Output:** +- Chart: `charts/customer_segmentation.png` +- Console output with segment summary + +**Segments:** +- **Champions:** High recency, frequency, and monetary value +- **Loyal Customers:** Regular customers with good value +- **At Risk:** Recent but declining frequency +- **Hibernating:** Low recency, may need reactivation +- **Potential Loyalists:** Good recency and frequency, lower value +- **Need Attention:** Mixed signals, need engagement + +--- + +### 3. 
Product Performance (`examples/product_performance.py`) + +**Purpose:** Product mix and performance analysis + +**What it demonstrates:** +- Product-level aggregation +- Product performance metrics +- Top products identification +- Product mix visualization + +**Usage:** +```bash +python examples/product_performance.py +``` + +**Output:** +- Chart: `charts/product_performance.png` +- Console output with top products summary + +--- + +## How to Use Examples + +### Step 1: Configure Template + +Before running examples, ensure your template is configured: + +```bash +python setup_wizard.py +``` + +Or manually update `config.py` with your data file and column mappings. + +### Step 2: Prepare Data + +Place your sales data CSV file in the template directory, or update `DATA_DIR` in `config.py`. + +Alternatively, generate sample data for testing: + +```bash +python generate_sample_data.py +``` + +### Step 3: Run Example + +```bash +python examples/annual_revenue_trend.py +``` + +### Step 4: Customize + +Copy an example script and modify it for your needs: + +```bash +cp examples/annual_revenue_trend.py my_analysis.py +# Edit my_analysis.py +python my_analysis.py +``` + +--- + +## Example Patterns + +### Pattern 1: Simple Annual Analysis + +```python +from data_loader import load_sales_data +from analysis_utils import calculate_annual_metrics, get_ltm_period_config +from config import REVENUE_COLUMN + +df = load_sales_data(get_data_path()) +ltm_start, ltm_end = get_ltm_period_config() + +def calculate_metrics(year_data): + return {'Revenue': year_data[REVENUE_COLUMN].sum()} + +annual_df = calculate_annual_metrics(df, calculate_metrics, ltm_start, ltm_end) +``` + +### Pattern 2: Customer-Level Analysis + +```python +from config import CUSTOMER_COLUMN, REVENUE_COLUMN + +customer_metrics = df.groupby(CUSTOMER_COLUMN).agg({ + REVENUE_COLUMN: 'sum', + DATE_COLUMN: 'count' +}).reset_index() +``` + +### Pattern 3: Product-Level Analysis + +```python +from config import 
ITEM_COLUMN, REVENUE_COLUMN + +product_metrics = df.groupby(ITEM_COLUMN)[REVENUE_COLUMN].sum().sort_values(ascending=False) +top_10 = product_metrics.head(10) +``` + +--- + +## Learning Path + +1. **Start with:** `annual_revenue_trend.py` - Simplest example +2. **Then try:** `product_performance.py` - More complex aggregation +3. **Advanced:** `customer_segmentation.py` - Multi-step analysis with custom logic + +--- + +## Troubleshooting + +**"Module not found" errors:** +- Ensure you're running from the template root directory +- Check that all template files are present + +**"Data file not found" errors:** +- Run `setup_wizard.py` to configure data file path +- Or update `DATA_FILE` in `config.py` + +**"Column not found" errors:** +- Update column mappings in `config.py` +- Run `python config_validator.py` to check configuration + +--- + +## Advanced Examples + +For more sophisticated analyses, see: +- `.cursor/rules/advanced_analysis_patterns.md` - Advanced analysis patterns +- `.cursor/rules/ai_assistant_guide.md` - How to use Cursor AI effectively + +## Next Steps + +After running examples: + +1. Review the generated charts +2. Examine the code to understand patterns +3. Copy an example and customize for your analysis +4. Check `.cursor/rules/analysis_patterns.md` for more patterns +5. Read `.cursor/rules/advanced_analysis_patterns.md` for advanced techniques +6. Use Cursor AI with prompts from `ai_assistant_guide.md` +7. Read `README.md` for comprehensive documentation + +--- + +**Last Updated:** January 2026 +**Template Version:** 1.0 diff --git a/QUICK_START.md b/QUICK_START.md new file mode 100644 index 0000000..58568b7 --- /dev/null +++ b/QUICK_START.md @@ -0,0 +1,175 @@ +# Quick Start Guide + +**For Cursor Users:** This template is optimized for Cursor AI. Just ask: *"Create a revenue analysis using the template"* and the AI will handle everything. 
+ +## šŸš€ Get Started in 5 Minutes + +### Step 1: Install Dependencies +```bash +pip install -r requirements.txt +``` + +### Step 2: Run Setup Wizard +```bash +python setup_wizard.py +``` + +The wizard will ask you: +- Company name +- Data file location +- Column names in your CSV +- Date range +- LTM configuration (if needed) + +### Step 3: Test Data Loading +```bash +python -c "from data_loader import load_sales_data; from config import get_data_path; df = load_sales_data(get_data_path()); print(f'āœ“ Loaded {len(df):,} rows')" +``` + +### Step 4: Run Example Analysis (Recommended) +```bash +# Try an example first to see how it works +python examples/annual_revenue_trend.py +``` + +### Step 5: Create Your First Analysis +```bash +cp analysis_template.py my_analysis.py +# Or copy an example +cp examples/annual_revenue_trend.py my_analysis.py +# Edit my_analysis.py +python my_analysis.py +``` + +--- + +## šŸ“‹ Essential Configuration Checklist + +Before running analyses, verify in `config.py`: + +- [ ] `COMPANY_NAME` - Your company name +- [ ] `DATA_FILE` - Your CSV filename +- [ ] `REVENUE_COLUMN` - Your revenue column name +- [ ] `DATE_COLUMN` - Your date column name +- [ ] `CUSTOMER_COLUMN` - Your customer column name +- [ ] `ANALYSIS_YEARS` - Years to include +- [ ] `MIN_YEAR` and `MAX_DATE` - Date range +- [ ] `LTM_ENABLED` - Set to False if all years complete + +--- + +## šŸ’” Common Patterns + +### Load Data +```python +from data_loader import load_sales_data +from config import get_data_path + +df = load_sales_data(get_data_path()) +``` + +### Calculate Annual Metrics +```python +from analysis_utils import calculate_annual_metrics, get_ltm_period_config +from config import REVENUE_COLUMN + +ltm_start, ltm_end = get_ltm_period_config() + +def calculate_metrics(year_data): + return {'Revenue': year_data[REVENUE_COLUMN].sum()} + +annual_df = calculate_annual_metrics(df, calculate_metrics, ltm_start, ltm_end) +``` + +### Create Chart +```python +from 
analysis_utils import setup_revenue_chart, save_chart +from config import CHART_SIZES +import matplotlib.pyplot as plt + +fig, ax = plt.subplots(figsize=CHART_SIZES['medium']) +ax.plot(data / 1e6, ...) # Divide by 1e6! +setup_revenue_chart(ax) +save_chart(fig, 'chart.png') +plt.close() +``` + +--- + +## āš ļø Critical Rules + +1. **ALWAYS use `data_loader.py`** - Never `pd.read_csv()` directly +2. **ALWAYS divide by 1e6** before plotting revenue +3. **ALWAYS use `setup_revenue_chart()`** for revenue charts +4. **ALWAYS use config values** - Never hardcode column names +5. **ALWAYS validate data** after loading + +## šŸ’” New Utilities + +### Data Quality Check +```bash +python -c "from data_quality import generate_data_quality_report, print_data_quality_report; from data_loader import load_sales_data; from config import get_data_path; df = load_sales_data(get_data_path()); report = generate_data_quality_report(df); print_data_quality_report(report)" +``` + +### Configuration Validation +```bash +python config_validator.py +``` + +### Export Results +```python +from export_utils import export_to_excel +export_to_excel(df, 'results.xlsx') +``` + +### Generate Sample Data +```bash +python generate_sample_data.py +``` + +--- + +## šŸ› Quick Troubleshooting + +**"Data file not found"** +→ Check `DATA_FILE` in config.py + +**"Column not found"** +→ Update column mappings in config.py + +**Charts show 1e8 (scientific notation)** +→ Divide by 1e6 before plotting: `ax.plot(data / 1e6, ...)` + +**"DataFrame is empty"** +→ Check `MIN_YEAR`, `MAX_DATE`, and `ANALYSIS_YEARS` in config.py + +--- + +## šŸŽÆ Using Cursor AI (Recommended) + +This template is optimized for Cursor. 
Instead of manual setup, just ask: + +``` +"Create a revenue trend analysis using template patterns" +``` + +The AI will: +- āœ… Use all template utilities automatically +- āœ… Follow best practices +- āœ… Include proper validation +- āœ… Generate production-ready code + +**See:** `.cursor/rules/ai_assistant_guide.md` for complete prompt library + +## šŸ“š Next Steps + +- **Run examples:** Try `examples/annual_revenue_trend.py` to see it in action +- **Check data quality:** Run `python data_quality.py` to analyze your data +- **Validate config:** Run `python config_validator.py` to check configuration +- **Read documentation:** See `README.md` for comprehensive guide +- **Review patterns:** Check `.cursor/rules/` for detailed patterns +- **See examples:** Check `EXAMPLES.md` for example script guide + +--- + +**Need help?** Check `.cursor/rules/common_errors.md` for detailed troubleshooting. diff --git a/README.md b/README.md new file mode 100644 index 0000000..0e13abd --- /dev/null +++ b/README.md @@ -0,0 +1,589 @@ +# Sales Analysis Template + +**A best-in-class, reusable template for sales invoice detail analysis** + +**Optimized for Cursor AI** - Just ask the AI to create analyses and it handles everything automatically. + +This template provides a complete framework for analyzing sales data from any company. It's designed to be: +- **Flexible:** Works with different column names, date formats, and data structures +- **Automated:** Interactive setup wizard configures everything for your company +- **AI-Optimized:** Fully optimized for Cursor - AI knows all patterns and generates code automatically +- **Production-Ready:** Includes error handling, validation, and best practices + +--- + +## šŸš€ Quick Start + +### 1. 
Setup (Automated) + +Run the interactive setup wizard: + +```bash +python setup_wizard.py +``` + +The wizard will ask you about: +- Company name and analysis date +- Data file location +- Column names in your CSV +- Date range and LTM configuration +- Exclusion filters (if needed) + +### 2. Manual Setup (Alternative) + +If you prefer to configure manually: + +1. **Update `config.py`** with your company-specific settings: + - `COMPANY_NAME`: Your company name + - `DATA_FILE`: Your CSV filename + - `REVENUE_COLUMN`: Your revenue/amount column name + - `DATE_COLUMN`: Your primary date column + - Column mappings for Customer, Item, etc. + - Date range and LTM settings + +2. **Place your data file** in the template directory (or update `DATA_DIR` in config.py) + +### 3. Test Data Loading + +Verify your configuration works: + +```bash +python -c "from data_loader import load_sales_data; from config import get_data_path; df = load_sales_data(get_data_path()); print(f'Loaded {len(df):,} rows')" +``` + +### 4. 
Create Your First Analysis + +Copy the template and customize: + +```bash +cp analysis_template.py my_first_analysis.py +# Edit my_first_analysis.py with your analysis logic +python my_first_analysis.py +``` + +--- + +## šŸ“ Project Structure + +``` +sales_analysis_template/ +ā”œā”€ā”€ README.md # This file +ā”œā”€ā”€ QUICK_START.md # Quick start guide +ā”œā”€ā”€ TEMPLATE_OVERVIEW.md # High-level overview +ā”œā”€ā”€ TEMPLATE_SUMMARY.md # Comprehensive template summary +ā”œā”€ā”€ EXAMPLES.md # Example scripts guide +ā”œā”€ā”€ SETUP_CHECKLIST.md # Setup verification checklist +ā”œā”€ā”€ requirements.txt # Python dependencies +ā”œā”€ā”€ setup_wizard.py # Interactive setup wizard +│ +ā”œā”€ā”€ config.py # ⭐ Configuration (customize for your company) +ā”œā”€ā”€ config_validator.py # Configuration validation utility +│ +ā”œā”€ā”€ data_loader.py # ⭐ Data loading with fallback logic +ā”œā”€ā”€ data_quality.py # Data quality reporting +ā”œā”€ā”€ data_processing.py # Data transformation utilities +│ +ā”œā”€ā”€ analysis_utils.py # ⭐ Common utilities (formatters, LTM, helpers) +ā”œā”€ā”€ statistical_utils.py # Statistical analysis utilities +ā”œā”€ā”€ validate_revenue.py # Revenue validation utility +│ +ā”œā”€ā”€ export_utils.py # Export to CSV/Excel +ā”œā”€ā”€ report_generator.py # PDF report generation +ā”œā”€ā”€ logger_config.py # Logging configuration +│ +ā”œā”€ā”€ analysis_template.py # Template for creating new analyses +ā”œā”€ā”€ run_all_analyses.py # Batch runner for all scripts +ā”œā”€ā”€ generate_sample_data.py # Generate sample data for testing +│ +ā”œā”€ā”€ examples/ # Example analysis scripts +│ ā”œā”€ā”€ annual_revenue_trend.py # Simple annual revenue analysis +│ ā”œā”€ā”€ customer_segmentation.py # RFM customer segmentation +│ ā”œā”€ā”€ cohort_analysis.py # Customer cohort analysis +│ └── product_performance.py # Product performance analysis +│ +ā”œā”€ā”€ tests/ # Unit tests +│ ā”œā”€ā”€ test_data_loader.py # Data loader tests +│ ā”œā”€ā”€ 
test_analysis_utils.py # Analysis utils tests +│ └── test_config_validator.py # Config validator tests +│ +└── .cursor/ + └── rules/ # Cursor IDE rules (auto-loaded) + ā”œā”€ā”€ ai_assistant_guide.md # Complete AI assistant guide + ā”œā”€ā”€ advanced_analysis_patterns.md # Advanced techniques + ā”œā”€ā”€ analysis_patterns.md # Common analysis patterns + ā”œā”€ā”€ chart_formatting.md # Chart formatting rules + ā”œā”€ā”€ code_quality.md # Code quality standards + ā”œā”€ā”€ common_errors.md # Error troubleshooting + ā”œā”€ā”€ data_loading.md # Data loading patterns + ā”œā”€ā”€ error_handling.md # Error handling patterns + └── ltm_methodology.md # LTM methodology +``` + +--- + +## šŸ”§ Configuration Guide + +### Required Configuration + +**In `config.py`, you MUST configure:** + +1. **Company Information:** + ```python + COMPANY_NAME = "Your Company Name" + ``` + +2. **Data File:** + ```python + DATA_FILE = 'your_sales_data.csv' + ``` + +3. **Column Mappings:** + ```python + REVENUE_COLUMN = 'USD' # Your revenue column name + DATE_COLUMN = 'InvoiceDate' # Your date column name + CUSTOMER_COLUMN = 'Customer' # Your customer column name + ``` + +4. 
**Date Range:** + ```python + MIN_YEAR = 2021 + MAX_DATE = pd.Timestamp('2025-09-30') + ANALYSIS_YEARS = [2021, 2022, 2023, 2024, 2025] + ``` + +### Optional Configuration + +**LTM (Last Twelve Months):** +```python +LTM_ENABLED = True # Set to False if all years are complete +LTM_START_MONTH = 10 +LTM_START_YEAR = 2024 +LTM_END_MONTH = 9 +LTM_END_YEAR = 2025 +``` + +**Exclusion Filters:** +```python +EXCLUSION_FILTERS = { + 'enabled': True, + 'exclude_by_column': 'Country', + 'exclude_values': ['Test', 'KVT'] +} +``` + +**See `config.py` for all available options and detailed comments.** + +--- + +## šŸ“Š Data Requirements + +### Required Columns + +Your CSV file must have: +- **Revenue column:** A numeric column with sales amounts (configured as `REVENUE_COLUMN`) +- **Date column:** At least one date column (configured as `DATE_COLUMN`) + +### Recommended Columns + +For full analysis capabilities, include: +- **Customer/Account:** For customer segmentation and analysis +- **Item/Product:** For product analysis +- **Quantity:** For price calculations +- **Geographic:** Region, Country for geographic analysis +- **Segments:** Technology, EndMarket, ProductGroup for segmentation + +### Date Column Fallback + +The data loader supports fallback logic: +1. **Primary:** Uses `DATE_COLUMN` (e.g., InvoiceDate) +2. **Fallback 1:** Uses columns in `DATE_FALLBACK_COLUMNS` (e.g., Month, Year) +3. **Fallback 2:** Constructs from Year column if available + +This ensures maximum date coverage even if some rows have missing dates. + +--- + +## šŸ’» Creating Analysis Scripts + +### Using the Template + +1. **Copy the template:** + ```bash + cp analysis_template.py my_analysis.py + ``` + +2. **Update configuration:** + ```python + ANALYSIS_NAME = "My Analysis" + DESCRIPTION = "Description of what this analysis does" + ``` + +3. 
**Implement your logic:** + - Use `calculate_annual_metrics()` for annual aggregations + - Use `setup_revenue_chart()` and `save_chart()` for visualizations + - Follow patterns from `.cursor/rules/analysis_patterns.md` + +4. **Run your analysis:** + ```bash + python my_analysis.py + ``` + +### Standard Pattern + +```python +from data_loader import load_sales_data, validate_data_structure +from analysis_utils import ( + get_ltm_period_config, calculate_annual_metrics, + setup_revenue_chart, save_chart, apply_exclusion_filters +) +from config import get_data_path, REVENUE_COLUMN, CHART_SIZES + +# Load and validate +df = load_sales_data(get_data_path()) +is_valid, msg = validate_data_structure(df) +if not is_valid: + raise SystemExit(f"ERROR: {msg}") + +# Apply filters +df = apply_exclusion_filters(df) + +# Calculate metrics +ltm_start, ltm_end = get_ltm_period_config() +annual_df = calculate_annual_metrics(df, calculate_metrics, ltm_start, ltm_end) + +# Create charts +fig, ax = plt.subplots(figsize=CHART_SIZES['medium']) +ax.plot(data / 1e6, ...) +setup_revenue_chart(ax) +save_chart(fig, 'chart.png') +``` + +--- + +## šŸŽÆ Key Features + +### 1. Flexible Data Loading + +- Handles different column names via configuration +- Fallback logic for date parsing (100% coverage) +- Automatic validation + +### 2. LTM (Last Twelve Months) Support + +- Automatic LTM calculation for partial years +- Apples-to-apples comparison with full calendar years +- Configurable LTM periods + +### 3. Standardized Chart Formatting + +- Automatic millions formatter for revenue charts +- Consistent styling and sizing +- Professional output ready for reports +- Optional interactive charts with Plotly + +### 4. Exclusion Filters + +- Easy configuration for excluding segments +- Useful for excluding test accounts, business units, etc. + +### 5. 
Revenue Validation + +- Automatic validation after each analysis +- Ensures data loading is working correctly +- Optional validation against expected values + +### 6. Example Scripts + +- Working examples for common analyses +- Demonstrates best practices +- Easy to customize and extend + +### 7. Data Export + +- Export results to CSV and Excel +- Formatted summary tables +- Multiple sheet support + +### 8. Data Quality Reporting + +- Comprehensive data quality checks +- Missing value analysis +- Outlier detection +- Data profiling + +### 9. Configuration Validation + +- Early error detection +- Validates column mappings +- Checks date ranges and LTM configuration + +### 10. Statistical Utilities + +- Year-over-year growth calculations +- CAGR (Compound Annual Growth Rate) +- Correlation analysis +- Statistical significance testing + +### 11. Report Generation + +- Combine multiple charts into PDF reports +- Professional formatting +- Summary tables and metadata + +### 12. Logging Infrastructure + +- Structured logging with file and console output +- Analysis execution tracking +- Configurable log levels + +--- + +## šŸ“š Documentation + +### For AI Agents (Cursor IDE) + +The `.cursor/rules/` directory contains comprehensive rules that are automatically loaded by Cursor: + +- **`ai_assistant_guide.md`:** Complete guide with ready-to-use prompts +- **`advanced_analysis_patterns.md`:** Advanced techniques (cohort, PVM, forecasting, etc.) 
+- **`analysis_patterns.md`:** Standard patterns for creating analyses +- **`data_loading.md`:** Always use `data_loader.py`, never `pd.read_csv()` directly +- **`chart_formatting.md`:** How to format charts correctly +- **`ltm_methodology.md`:** LTM implementation and usage +- **`common_errors.md`:** Troubleshooting guide +- **`code_quality.md`:** Code quality standards and Cursor best practices +- **`error_handling.md`:** How to write AI-friendly error messages + +### For Developers + +- **`config.py`:** Heavily commented with all configuration options +- **`analysis_template.py`:** Template with examples and comments +- **`analysis_utils.py`:** Well-documented utility functions + +--- + +## šŸ” Common Analysis Types + +This template supports all standard sales analyses: + +### Revenue Analyses +- Annual revenue trends +- Monthly revenue analysis +- Revenue by segment/product/geography + +### Customer Analyses +- Customer segmentation (RFM) +- Customer concentration +- Churn analysis +- Cohort analysis +- Customer lifetime value (CLV) + +### Product Analyses +- Product performance +- Product lifecycle +- BCG matrix +- Market basket analysis + +### Financial Analyses +- Price elasticity +- Contribution margin +- Price vs volume analysis + +### Advanced Analyses +- Seasonality analysis +- Time series forecasting +- Customer churn prediction + +**See `examples/` directory for working example scripts, or the original Dukane project for 24+ production analysis scripts.** + +--- + +## šŸ› ļø Dependencies + +Install required packages: + +```bash +pip install -r requirements.txt +``` + +**Core dependencies:** +- `pandas` - Data manipulation +- `numpy` - Numerical operations +- `matplotlib` - Charting +- `seaborn` - Enhanced visualizations + +**Optional dependencies** (uncomment in requirements.txt if needed): +- `openpyxl` - Excel export (export_utils.py) +- `plotly` - Interactive charts (analysis_utils.py) +- `reportlab` - PDF reports (report_generator.py) +- `scipy` 
- Statistical analysis (statistical_utils.py) +- `pytest` - Unit testing +- `pmdarima` - Time series forecasting +- `mlxtend` - Market basket analysis +- `scikit-learn` - Machine learning + +--- + +## āš ļø Important Notes + +### Always Use Utilities + +**āœ… DO:** +```python +from data_loader import load_sales_data +from analysis_utils import setup_revenue_chart, save_chart +from config import REVENUE_COLUMN, CHART_SIZES +``` + +**āŒ DON'T:** +```python +df = pd.read_csv('data.csv') # Use data_loader instead +ax.plot(revenue, ...) # Divide by 1e6 first, use setup_revenue_chart() +``` + +### Chart Formatting + +**ALWAYS divide revenue by 1e6 before plotting:** +```python +ax.plot(revenue / 1e6, ...) # Convert to millions +setup_revenue_chart(ax) # Apply formatter +``` + +### LTM Labeling + +**ALWAYS label LTM years correctly:** +```python +from config import get_ltm_label +ltm_label = get_ltm_label() # Returns "2025 (LTM 9/2025)" or None +if ltm_label: + title += f'\n({ltm_label})' +``` + +--- + +## šŸ› Troubleshooting + +### Data Loading Issues + +**Problem:** "Data file not found" +- **Solution:** Check `DATA_FILE` path in config.py +- **Solution:** Ensure file is in template directory or update `DATA_DIR` + +**Problem:** "Required column 'USD' not found" +- **Solution:** Update `REVENUE_COLUMN` in config.py to match your CSV +- **Solution:** Check all column mappings in config.py + +**Problem:** "All dates are NaN" +- **Solution:** Add fallback date columns to `DATE_FALLBACK_COLUMNS` +- **Solution:** Check date format in your CSV + +### Analysis Issues + +**Problem:** Charts show scientific notation (1e8) +- **Solution:** Divide by 1e6 before plotting: `ax.plot(data / 1e6, ...)` +- **Solution:** Use `setup_revenue_chart(ax)` to apply formatter + +**Problem:** "DataFrame is empty" after filtering +- **Solution:** Check `MIN_YEAR` and `MAX_DATE` in config.py +- **Solution:** Verify `ANALYSIS_YEARS` includes years in your data + +**See 
`.cursor/rules/common_errors.md` for more troubleshooting help.** + +--- + +## šŸ“ Example Workflow + +### Complete Analysis Workflow + +1. **Setup:** + ```bash + python setup_wizard.py + ``` + +2. **Test data loading:** + ```bash + python -c "from data_loader import load_sales_data; from config import get_data_path; df = load_sales_data(get_data_path()); print(f'āœ“ Loaded {len(df):,} rows')" + ``` + +3. **Create analysis:** + ```bash + cp analysis_template.py revenue_analysis.py + # Edit revenue_analysis.py + ``` + +4. **Run analysis:** + ```bash + python revenue_analysis.py + ``` + +5. **Add to batch runner:** + ```python + # In run_all_analyses.py: + ANALYSIS_SCRIPTS = [ + 'revenue_analysis.py', + # ... other analyses + ] + ``` + +6. **Run all analyses:** + ```bash + python run_all_analyses.py + ``` + +--- + +## šŸ¤ Best Practices + +1. **Always validate data** after loading: + ```python + is_valid, msg = validate_data_structure(df) + ``` + +2. **Use configuration values** instead of hardcoding: + ```python + from config import REVENUE_COLUMN # āœ… + revenue = df['USD'].sum() # āŒ Hardcoded + ``` + +3. **Apply exclusion filters** if configured: + ```python + df = apply_exclusion_filters(df) + ``` + +4. **Validate revenue** at end of each analysis: + ```python + validate_revenue(df, "Analysis Name") + ``` + +5. **Use utility functions** for consistency: + ```python + from analysis_utils import calculate_annual_metrics, setup_revenue_chart + ``` + +--- + +## šŸ“„ License + +This template is provided as-is for use in sales analysis projects. + +--- + +## šŸ™ Acknowledgments + +This template is based on best practices developed during the Dukane Corporation sales analysis project, which included 24+ production-ready analysis scripts and comprehensive documentation. + +--- + +## šŸ“ž Support + +For questions or issues: +1. Check `.cursor/rules/` for detailed patterns and troubleshooting +2. Review `config.py` comments for configuration options +3. 
See example analyses in the original Dukane project + +--- + +**Last Updated:** January 2026 +**Template Version:** 1.0 +**Status:** Production Ready diff --git a/SETUP_CHECKLIST.md b/SETUP_CHECKLIST.md new file mode 100644 index 0000000..3d956aa --- /dev/null +++ b/SETUP_CHECKLIST.md @@ -0,0 +1,118 @@ +# Setup Checklist + +Use this checklist to ensure your template is properly configured before running analyses. + +## āœ… Initial Setup + +- [ ] **Install dependencies** + ```bash + pip install -r requirements.txt + ``` + +- [ ] **Run setup wizard** + ```bash + python setup_wizard.py + ``` + +- [ ] **Place data file** in template directory (or update `DATA_DIR` in config.py) + +## āœ… Configuration Verification + +Open `config.py` and verify: + +- [ ] **Company Information** + - [ ] `COMPANY_NAME` is set + - [ ] `ANALYSIS_DATE` is current + +- [ ] **Data File** + - [ ] `DATA_FILE` matches your CSV filename + - [ ] File exists in expected location + +- [ ] **Column Mappings** + - [ ] `REVENUE_COLUMN` matches your CSV + - [ ] `DATE_COLUMN` matches your CSV + - [ ] `CUSTOMER_COLUMN` matches your CSV (if applicable) + - [ ] `ITEM_COLUMN` matches your CSV (if applicable) + - [ ] `QUANTITY_COLUMN` matches your CSV (if applicable) + +- [ ] **Date Configuration** + - [ ] `MIN_YEAR` is correct + - [ ] `MAX_DATE` is correct + - [ ] `ANALYSIS_YEARS` includes all years you want to analyze + +- [ ] **LTM Configuration** (if needed) + - [ ] `LTM_ENABLED` is set correctly + - [ ] `LTM_START_MONTH`, `LTM_START_YEAR` are correct + - [ ] `LTM_END_MONTH`, `LTM_END_YEAR` are correct + +- [ ] **Exclusion Filters** (if needed) + - [ ] `EXCLUSION_FILTERS['enabled']` is set correctly + - [ ] `exclude_by_column` matches a column in your data + - [ ] `exclude_values` list is correct + +## āœ… Data Loading Test + +- [ ] **Test data loading** + ```bash + python -c "from data_loader import load_sales_data; from config import get_data_path; df = load_sales_data(get_data_path()); print(f'āœ“ 
Loaded {len(df):,} rows')" + ``` + +- [ ] **Verify date coverage** + - Check output shows good date coverage (>95% recommended) + - Verify date range matches expectations + +- [ ] **Verify revenue column** + - Check that revenue values are numeric + - Verify no unexpected NaN values + +## āœ… First Analysis Test + +- [ ] **Copy template** + ```bash + cp analysis_template.py test_analysis.py + ``` + +- [ ] **Run test analysis** + ```bash + python test_analysis.py + ``` + +- [ ] **Verify outputs** + - [ ] Chart generated successfully + - [ ] Chart saved to `charts/` directory + - [ ] Revenue validation passed + - [ ] No errors in console output + +## āœ… Common Issues Check + +Before running full analyses, verify: + +- [ ] **Column names match** - All column mappings in config.py match your CSV +- [ ] **Date format works** - Dates are parsing correctly (check data_loader output) +- [ ] **Date range is correct** - MIN_YEAR and MAX_DATE include your data +- [ ] **LTM is configured** - If using LTM, dates are within your data range +- [ ] **Exclusions work** - If using exclusions, column and values are correct + +## āœ… Ready for Production + +Once all checks pass: + +- [ ] **Create your analyses** using `analysis_template.py` +- [ ] **Add to batch runner** in `run_all_analyses.py` +- [ ] **Run all analyses** to generate complete analysis suite + +--- + +## šŸ› Troubleshooting + +If any check fails: + +1. **Data loading issues:** See `.cursor/rules/data_loading.md` +2. **Configuration issues:** Review `config.py` comments +3. **Common errors:** See `.cursor/rules/common_errors.md` +4. 
**Pattern questions:** See `.cursor/rules/analysis_patterns.md` + +--- + +**Checklist Version:** 1.0 +**Last Updated:** January 2026 diff --git a/TEMPLATE_OVERVIEW.md b/TEMPLATE_OVERVIEW.md new file mode 100644 index 0000000..754fe92 --- /dev/null +++ b/TEMPLATE_OVERVIEW.md @@ -0,0 +1,150 @@ +# Sales Analysis Template - Overview + +**Start here for a high-level understanding of the template.** + +For detailed setup, see `QUICK_START.md`. For complete documentation, see `README.md`. + +## šŸŽÆ Purpose + +This template provides a **production-ready, reusable framework** for analyzing sales invoice detail data from any company. It's designed to be: + +- **Flexible:** Works with different column names, date formats, and data structures +- **Automated:** Interactive setup wizard configures everything +- **AI-Optimized:** Fully optimized for Cursor AI - just ask and the AI generates complete analyses +- **Best-in-Class:** Based on proven patterns from 24+ production analyses + +## šŸ“¦ What's Included + +### Core Framework +- **`config.py`** - Centralized configuration (customize for your company) +- **`data_loader.py`** - Intelligent data loading with fallback logic +- **`analysis_utils.py`** - Common utilities (formatters, LTM, helpers) +- **`validate_revenue.py`** - Revenue validation utility + +### Templates & Tools +- **`analysis_template.py`** - Template for creating new analyses +- **`run_all_analyses.py`** - Batch runner for all scripts +- **`setup_wizard.py`** - Interactive setup wizard + +### Documentation +- **`README.md`** - Comprehensive documentation +- **`QUICK_START.md`** - Quick reference guide +- **`.cursor/rules/`** - Cursor IDE rules for automation + +### Configuration +- **`requirements.txt`** - Python dependencies +- **`.gitignore`** - Git ignore patterns + +## šŸš€ Quick Start + +1. **Run setup wizard:** + ```bash + python setup_wizard.py + ``` + +2. 
**Test data loading:** + ```bash + python -c "from data_loader import load_sales_data; from config import get_data_path; df = load_sales_data(get_data_path()); print(f'āœ“ Loaded {len(df):,} rows')" + ``` + +3. **Create your first analysis:** + ```bash + cp analysis_template.py my_analysis.py + # Edit my_analysis.py + python my_analysis.py + ``` + +## šŸŽØ Key Features + +### 1. Flexible Data Loading +- Handles different column names via configuration +- Fallback logic for date parsing (100% coverage) +- Automatic validation + +### 2. LTM Support +- Automatic Last Twelve Months calculation +- Apples-to-apples comparison with full years +- Configurable periods + +### 3. Standardized Formatting +- Automatic millions formatter for revenue +- Consistent chart styling +- Professional output + +### 4. Exclusion Filters +- Easy configuration for excluding segments +- Useful for test accounts, business units, etc. + +### 5. AI Automation +- Comprehensive Cursor rules +- Automated agent assistance +- Best practices enforcement + +## šŸ“Š Analysis Types Supported + +This template supports all standard sales analyses: + +- **Revenue:** Annual trends, monthly analysis, by segment +- **Customer:** Segmentation, concentration, churn, CLV +- **Product:** Performance, lifecycle, BCG matrix +- **Financial:** Price elasticity, margins +- **Advanced:** Seasonality, forecasting, predictions + +## šŸ”§ Customization Points + +All customization happens in `config.py`: + +1. **Company Info:** Name, analysis date +2. **Data File:** Location, filename +3. **Column Mappings:** Revenue, date, customer, product, etc. +4. **Date Range:** Years, LTM configuration +5. **Filters:** Exclusion rules +6. 
**Chart Settings:** Sizes, styles, DPI + +## šŸ“š Documentation Structure + +- **`README.md`** - Complete guide (start here) +- **`QUICK_START.md`** - Quick start (includes Cursor tips) +- **`EXAMPLES.md`** - Example scripts guide +- **`TEMPLATE_SUMMARY.md`** - Comprehensive template overview +- **`.cursor/rules/`** - Detailed patterns for AI agents (auto-loaded by Cursor) +- **`config.py`** - Heavily commented configuration + +## šŸŽ“ Learning Path + +1. **Read:** `QUICK_START.md` (5 minutes) +2. **Run:** `setup_wizard.py` (2 minutes) +3. **Test:** Data loading (1 minute) +4. **Create:** First analysis using `analysis_template.py` (15 minutes) +5. **Explore:** `.cursor/rules/` for patterns (as needed) + +## šŸ’” Best Practices + +1. **Always use utilities** - Don't reinvent the wheel +2. **Use config values** - Never hardcode column names +3. **Validate data** - After loading and after analysis +4. **Follow patterns** - See `.cursor/rules/analysis_patterns.md` +5. **Test incrementally** - Test data loading before full analysis + +## šŸ” What Makes This "Best-in-Class" + +1. **Proven Patterns:** Based on 24+ production analyses +2. **Flexibility:** Works with any data structure +3. **Automation:** Setup wizard + AI-friendly rules +4. **Documentation:** Comprehensive guides and examples +5. **Error Handling:** Validation and troubleshooting built-in +6. **Consistency:** Standardized formatting and patterns + +## šŸ“ˆ Next Steps + +1. Run `setup_wizard.py` to configure for your company +2. Review `config.py` to understand all options +3. Create your first analysis using `analysis_template.py` +4. Explore `.cursor/rules/` for detailed patterns +5. 
Build your analysis suite + +--- + +**Template Version:** 1.0 +**Last Updated:** January 2026 +**Status:** Production Ready diff --git a/TEMPLATE_SUMMARY.md b/TEMPLATE_SUMMARY.md new file mode 100644 index 0000000..4b5af0b --- /dev/null +++ b/TEMPLATE_SUMMARY.md @@ -0,0 +1,254 @@ +# Sales Analysis Template - Summary + +**This document provides a comprehensive overview of the template structure and capabilities.** + +For quick start, see `QUICK_START.md`. For detailed documentation, see `README.md`. + +## šŸ“‹ What This Template Provides + +This template was created based on the comprehensive Dukane Corporation sales analysis project, which included 24+ production-ready analysis scripts. All best practices, patterns, and lessons learned have been distilled into this reusable template. + +## šŸ“ Complete File Structure + +``` +sales_analysis_template/ +ā”œā”€ā”€ README.md # Comprehensive documentation +ā”œā”€ā”€ QUICK_START.md # Quick reference guide +ā”œā”€ā”€ TEMPLATE_OVERVIEW.md # Template overview and features +ā”œā”€ā”€ TEMPLATE_SUMMARY.md # This file +ā”œā”€ā”€ EXAMPLES.md # Example scripts guide +ā”œā”€ā”€ SETUP_CHECKLIST.md # Setup verification checklist +ā”œā”€ā”€ requirements.txt # Python dependencies +ā”œā”€ā”€ .gitignore # Git ignore patterns +│ +ā”œā”€ā”€ Core Framework Files: +│ ā”œā”€ā”€ config.py # ⭐ Centralized configuration +│ ā”œā”€ā”€ config_validator.py # Configuration validation utility +│ ā”œā”€ā”€ data_loader.py # ⭐ Intelligent data loading +│ ā”œā”€ā”€ data_quality.py # Data quality reporting +│ ā”œā”€ā”€ data_processing.py # Data transformation utilities +│ ā”œā”€ā”€ analysis_utils.py # ⭐ Common utilities +│ ā”œā”€ā”€ statistical_utils.py # Statistical analysis utilities +│ └── validate_revenue.py # Revenue validation +│ +ā”œā”€ā”€ Utility Files: +│ ā”œā”€ā”€ export_utils.py # Export to CSV/Excel +│ ā”œā”€ā”€ report_generator.py # PDF report generation +│ ā”œā”€ā”€ logger_config.py # Logging configuration +│ └── generate_sample_data.py # 
Generate sample data for testing +│ +ā”œā”€ā”€ Templates & Tools: +│ ā”œā”€ā”€ analysis_template.py # Template for new analyses +│ ā”œā”€ā”€ run_all_analyses.py # Batch runner +│ └── setup_wizard.py # Interactive setup wizard +│ +ā”œā”€ā”€ examples/ # Example analysis scripts +│ ā”œā”€ā”€ annual_revenue_trend.py # Simple annual revenue analysis +│ ā”œā”€ā”€ customer_segmentation.py # RFM customer segmentation +│ ā”œā”€ā”€ cohort_analysis.py # Customer cohort analysis +│ └── product_performance.py # Product performance analysis +│ +ā”œā”€ā”€ tests/ # Unit tests +│ ā”œā”€ā”€ test_data_loader.py # Data loader tests +│ ā”œā”€ā”€ test_analysis_utils.py # Analysis utils tests +│ └── test_config_validator.py # Config validator tests +│ +└── .cursor/ + └── rules/ # Cursor IDE rules (auto-loaded) + ā”œā”€ā”€ ai_assistant_guide.md # Complete AI assistant guide + ā”œā”€ā”€ advanced_analysis_patterns.md # Advanced techniques + ā”œā”€ā”€ analysis_patterns.md # Analysis patterns + ā”œā”€ā”€ chart_formatting.md # Chart formatting rules + ā”œā”€ā”€ code_quality.md # Code quality standards + ā”œā”€ā”€ common_errors.md # Error troubleshooting + ā”œā”€ā”€ data_loading.md # Data loading patterns + ā”œā”€ā”€ error_handling.md # Error handling patterns + └── ltm_methodology.md # LTM methodology +``` + +## šŸŽÆ Key Features Implemented + +### 1. Flexible Configuration System +- **`config.py`**: Centralized configuration with extensive comments +- All column names, date ranges, and settings configurable +- No hardcoded values - everything comes from config + +### 2. Intelligent Data Loading +- **`data_loader.py`**: Fallback logic for date parsing +- Handles missing dates gracefully +- 100% date coverage via fallback columns +- Automatic validation and error reporting + +### 3. 
Comprehensive Utilities +- **`analysis_utils.py`**: All common functions in one place +- Chart formatters (millions, thousands) +- LTM calculation helpers +- Mixed type handling for years +- Price calculation utilities +- Exclusion filter helpers + +### 4. Interactive Setup +- **`setup_wizard.py`**: Asks clarifying questions +- Automatically configures `config.py` +- Validates inputs +- Provides next steps + +### 5. AI-Friendly Rules +- **`.cursor/rules/`**: Comprehensive Cursor IDE rules +- Auto-loaded by Cursor +- Enforces best practices +- Provides patterns and troubleshooting + +### 6. Production-Ready Templates +- **`analysis_template.py`**: Complete template with examples +- **`run_all_analyses.py`**: Batch runner with error handling +- Follows all best practices + +## šŸ”‘ Design Principles + +### Flexibility +- Works with any column names (configured in config.py) +- Handles different date formats +- Supports various data structures +- Optional features (LTM, exclusions) can be disabled + +### Automation +- Setup wizard asks all necessary questions +- Cursor rules guide AI agents automatically +- Batch runner handles multiple analyses +- Validation catches errors early + +### Best Practices +- Always use utilities (never reinvent the wheel) +- Consistent formatting across all analyses +- Proper error handling and validation +- Comprehensive documentation + +### Reusability +- Generic enough for any company +- Specific enough to be immediately useful +- Well-documented for future agents +- Easy to extend with new analyses + +## šŸ“Š Analysis Types Supported + +The template supports all standard sales analyses: + +### Revenue Analyses +- Annual revenue trends +- Monthly revenue analysis +- Revenue by segment/product/geography + +### Customer Analyses +- Customer segmentation (RFM) +- Customer concentration +- Churn analysis +- Cohort analysis +- Customer lifetime value (CLV) + +### Product Analyses +- Product performance +- Product lifecycle +- BCG matrix +- 
Market basket analysis + +### Financial Analyses +- Price elasticity +- Contribution margin +- Price vs volume analysis + +### Advanced Analyses +- Seasonality analysis +- Time series forecasting +- Customer churn prediction + +## šŸš€ Usage Workflow + +1. **Setup** (5 minutes) + - Run `setup_wizard.py` + - Answer questions about your data + - Configuration automatically updated + +2. **Test** (2 minutes) + - Test data loading + - Verify configuration works + +3. **Create** (15 minutes) + - Copy `analysis_template.py` + - Customize for your analysis + - Run and verify + +4. **Scale** (ongoing) + - Create multiple analyses + - Add to batch runner + - Generate complete analysis suite + +## šŸ’” What Makes This "Best-in-Class" + +1. **Proven Patterns**: Based on 24+ production analyses +2. **Comprehensive**: Covers all common analysis types +3. **Flexible**: Works with any data structure +4. **Automated**: Setup wizard + AI-friendly rules +5. **Documented**: Extensive documentation at every level +6. **Production-Ready**: Error handling, validation, best practices + +## šŸ“š Documentation Hierarchy + +1. **`QUICK_START.md`** - Start here (5-minute overview, includes Cursor tips) +2. **`README.md`** - Complete guide (comprehensive) +3. **`EXAMPLES.md`** - Example scripts guide +4. **`TEMPLATE_OVERVIEW.md`** - High-level overview +5. **`SETUP_CHECKLIST.md`** - Verification checklist +6. **`.cursor/rules/`** - Detailed patterns for AI agents (auto-loaded by Cursor) +7. 
**`config.py`** - Inline comments for all options + +## šŸŽ“ Learning Resources + +- **Quick Start**: `QUICK_START.md` - Get running in 5 minutes +- **Full Guide**: `README.md` - Complete documentation +- **Patterns**: `.cursor/rules/analysis_patterns.md` - Code patterns +- **Troubleshooting**: `.cursor/rules/common_errors.md` - Fix issues +- **Examples**: `analysis_template.py` - Working example + +## āœ… Quality Assurance + +All components include: +- āœ… Error handling +- āœ… Input validation +- āœ… Comprehensive comments +- āœ… Type hints where helpful +- āœ… Documentation strings +- āœ… Best practices enforcement + +## šŸ”„ Future Enhancements + +Potential additions (not included in v1.0): +- Example analysis scripts (can be added from Dukane project) +- Unit tests +- CI/CD configuration +- Docker containerization +- Additional visualization libraries + +## šŸ“ Notes for Users + +1. **First Time**: Start with `QUICK_START.md` and `setup_wizard.py` +2. **Configuration**: All customization in `config.py` +3. **Creating Analyses**: Use `analysis_template.py` as starting point +4. **AI Assistance**: Cursor rules are auto-loaded, just ask for help +5. **Troubleshooting**: Check `.cursor/rules/common_errors.md` first + +## šŸŽ‰ Success Criteria + +The template is ready when: +- āœ… Setup wizard runs successfully +- āœ… Data loads without errors +- āœ… First analysis generates charts +- āœ… All validations pass +- āœ… Documentation is clear + +--- + +**Template Version:** 1.0 +**Created:** January 2026 +**Based On:** Dukane Corporation Sales Analysis Project +**Status:** Production Ready āœ… diff --git a/analysis_template.py b/analysis_template.py new file mode 100644 index 0000000..429c617 --- /dev/null +++ b/analysis_template.py @@ -0,0 +1,147 @@ +""" +Template for creating new analysis scripts +Copy this file and modify for your specific analysis + +Usage: +1. Copy this file: cp analysis_template.py my_new_analysis.py +2. 
Update the ANALYSIS_NAME and DESCRIPTION
3. Implement your analysis logic in the main() function
4. Update the chart generation section
5. Run: python my_new_analysis.py
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Import utilities
from data_loader import load_sales_data, validate_data_structure
from validate_revenue import validate_revenue
from analysis_utils import (
    get_ltm_period_config, get_annual_data, calculate_annual_metrics,
    get_millions_formatter, setup_revenue_chart, save_chart,
    format_currency, print_annual_summary, sort_mixed_years,
    apply_exclusion_filters
)
from config import (
    DATA_FILE, OUTPUT_DIR, ANALYSIS_YEARS, MAX_DATE,
    CHART_SIZES, ensure_directories, get_data_path, COMPANY_NAME
)

# ============================================================================
# CONFIGURATION
# ============================================================================

ANALYSIS_NAME = "Template Analysis"
DESCRIPTION = "Template for new analyses - customize this for your specific analysis"

# ============================================================================
# MAIN ANALYSIS FUNCTION
# ============================================================================

def main():
    """Main analysis function.

    Orchestrates the standard analysis pipeline: load -> validate ->
    filter -> compute annual metrics (with LTM handling) -> chart ->
    revenue cross-check. Copy this file and customize steps 6, 7 and 9
    for a new analysis; the surrounding steps rarely need changes.
    """

    print(f"\n{'='*60}")
    print(f"{ANALYSIS_NAME}")
    print(f"{'='*60}\n")

    # 1. Load data
    print("Loading data...")
    try:
        df = load_sales_data(get_data_path())
        print(f"Loaded {len(df):,} transactions")
    except Exception as e:
        # Broad catch is deliberate here: any load failure aborts the script
        # with a readable message instead of a traceback.
        print(f"ERROR loading data: {e}")
        return

    # 2. Validate data structure
    is_valid, msg = validate_data_structure(df)
    if not is_valid:
        print(f"ERROR: {msg}")
        return
    print("Data validation passed")

    # 3. Apply exclusion filters (if configured)
    df = apply_exclusion_filters(df)

    # 4. Filter by date range
    # NOTE(review): assumes load_sales_data() added a 'Year' column — confirm
    # against data_loader before relying on it in a new project.
    from config import MIN_YEAR, DATE_COLUMN
    df = df[df['Year'] >= MIN_YEAR]
    if DATE_COLUMN in df.columns:
        df = df[df[DATE_COLUMN] <= MAX_DATE]

    # 5. Setup LTM period (if enabled)
    ltm_start, ltm_end = get_ltm_period_config()
    if ltm_start and ltm_end:
        print(f"LTM period: {ltm_start} to {ltm_end}")

    # 6. Prepare data
    print("\nPreparing data...")
    # Add your data preparation logic here
    # Example: df['CustomColumn'] = df[REVENUE_COLUMN] * df[QUANTITY_COLUMN]

    # 7. Calculate annual metrics
    print("\nCalculating annual metrics...")

    def calculate_metrics(year_data):
        """Calculate metrics for a single year"""
        from config import REVENUE_COLUMN
        return {
            'Revenue': year_data[REVENUE_COLUMN].sum(),
            # Add your custom metrics here
            # 'CustomMetric': year_data['CustomColumn'].mean(),
        }

    annual_df = calculate_annual_metrics(df, calculate_metrics, ltm_start, ltm_end)

    # 8. Print summary
    print_annual_summary(annual_df, 'Revenue', 'Revenue')

    # 9. Create visualizations
    print("Generating charts...")
    ensure_directories()

    # Example chart: Annual revenue trend
    fig, ax = plt.subplots(figsize=CHART_SIZES['medium'])

    # Prepare data for plotting (handle mixed types).
    # The 'Year' index can mix ints and the LTM label string, so sort via
    # the helper rather than relying on lexicographic order.
    annual_df_sorted = sort_mixed_years(annual_df.reset_index(), 'Year')
    years = annual_df_sorted['Year'].tolist()
    revenue = annual_df_sorted['Revenue'].values / 1e6  # Convert to millions

    # Create chart — plot against positional indices so mixed-type year
    # labels render cleanly on the x axis.
    ax.plot(range(len(years)), revenue, marker='o', linewidth=2, markersize=8)
    ax.set_xticks(range(len(years)))
    ax.set_xticklabels(years, rotation=45, ha='right')
    setup_revenue_chart(ax)

    # Add LTM notation to title if applicable
    title = f'Annual Revenue Trend - {COMPANY_NAME}'
    if ltm_start and ltm_end:
        from config import get_ltm_label
        ltm_label = get_ltm_label()
        if ltm_label:
            title += f'\n({ltm_label})'
    ax.set_title(title)

    plt.tight_layout()
    save_chart(fig, f'{ANALYSIS_NAME.lower().replace(" ", "_")}_trend.png')
    plt.close()

    # Add more charts as needed...

    # 10. Validate revenue
    print("\nValidating revenue...")
    validate_revenue(df, ANALYSIS_NAME)

    print(f"\n{ANALYSIS_NAME} complete!")
    print(f"Charts saved to: {OUTPUT_DIR}")

# ============================================================================
# RUN ANALYSIS
# ============================================================================

if __name__ == "__main__":
    main()

# diff --git a/analysis_utils.py b/analysis_utils.py (new file, index 12d7c83)
"""
Common utilities for analysis scripts
Provides formatters, LTM setup, and helper functions

This module is designed to work with any sales data structure
by using configuration from config.py
"""
import pandas as pd
import numpy as np
from matplotlib.ticker import FuncFormatter
from pathlib import Path
from config import (
    REVENUE_COLUMN, LTM_ENABLED, get_ltm_period, get_ltm_label,
    OUTPUT_DIR, CHART_DPI, CHART_BBOX
)

# ============================================================================
# CHART FORMATTERS
# ============================================================================

def millions_formatter(x: float, pos: int) -> str:
    """
    Format numbers in millions for chart display (e.g., $99.9m)

    This formatter is used with matplotlib FuncFormatter to display
    revenue values in millions on chart axes.

    Args:
        x: Numeric value (already in millions, e.g., 99.9 for $99.9m)
        pos: Position parameter (required by FuncFormatter, not used)

    Returns:
        str: Formatted string like "$99.9m"

    Example:
        >>> from matplotlib.ticker import FuncFormatter
        >>> formatter = FuncFormatter(millions_formatter)
        >>> ax.yaxis.set_major_formatter(formatter)
    """
    return f'${x:.1f}m'
def thousands_formatter(x: float, pos: int) -> str:
    """
    Format numbers in thousands for chart display (e.g., $99.9k)

    Args:
        x: Numeric value (already in thousands)
        pos: Position parameter (required by FuncFormatter, not used)

    Returns:
        str: Formatted string like "$99.9k"
    """
    return f'${x:.1f}k'

def get_millions_formatter() -> FuncFormatter:
    """
    Get FuncFormatter for millions

    Returns:
        FuncFormatter: Configured formatter for millions display
    """
    return FuncFormatter(millions_formatter)

def get_thousands_formatter() -> FuncFormatter:
    """
    Get FuncFormatter for thousands

    Returns:
        FuncFormatter: Configured formatter for thousands display
    """
    return FuncFormatter(thousands_formatter)

# ============================================================================
# LTM (Last Twelve Months) SETUP
# ============================================================================

def get_ltm_period_config():
    """
    Get LTM period boundaries from config

    Returns:
        tuple: (ltm_start, ltm_end) as pd.Period objects, or (None, None) if disabled
    """
    if LTM_ENABLED:
        return get_ltm_period()
    return None, None

def get_annual_data(df, year, ltm_start=None, ltm_end=None):
    """
    Get data for a specific year, using LTM for the most recent partial year

    Args:
        df: DataFrame with 'Year' and 'YearMonth' columns
        year: Year to extract (int)
        ltm_start: LTM start period (defaults to config if None)
        ltm_end: LTM end period (defaults to config if None)

    Returns:
        tuple: (year_data DataFrame, year_label string)
    """
    from config import LTM_END_YEAR

    # Get LTM period from config if not provided
    if ltm_start is None or ltm_end is None:
        ltm_start, ltm_end = get_ltm_period_config()

    # Use LTM for the most recent year if enabled
    if LTM_ENABLED and ltm_start and ltm_end and year == LTM_END_YEAR:
        if 'YearMonth' in df.columns:
            year_data = df[(df['YearMonth'] >= ltm_start) & (df['YearMonth'] <= ltm_end)]
            year_label = get_ltm_label() or str(year)
        else:
            # Fallback if YearMonth not available
            year_data = df[df['Year'] == year]
            year_label = str(year)
    else:
        # Use full calendar year
        year_data = df[df['Year'] == year]
        year_label = str(year)

    return year_data, year_label

def calculate_annual_metrics(df, metrics_func, ltm_start=None, ltm_end=None):
    """
    Calculate annual metrics for all years, using LTM for most recent year

    Args:
        df: DataFrame with 'Year' and 'YearMonth' columns
        metrics_func: Function that takes a DataFrame and returns a dict of metrics
        ltm_start: LTM start period (defaults to config if None)
        ltm_end: LTM end period (defaults to config if None)

    Returns:
        DataFrame with 'Year' index and metric columns
    """
    from config import ANALYSIS_YEARS

    if ltm_start is None or ltm_end is None:
        ltm_start, ltm_end = get_ltm_period_config()

    annual_data = []
    for year in sorted(ANALYSIS_YEARS):
        if year in df['Year'].unique():
            year_data, year_label = get_annual_data(df, year, ltm_start, ltm_end)

            if len(year_data) > 0:
                metrics = metrics_func(year_data)
                metrics['Year'] = year_label
                annual_data.append(metrics)

    if not annual_data:
        return pd.DataFrame()

    return pd.DataFrame(annual_data).set_index('Year')

# ============================================================================
# MIXED TYPE HANDLING
# ============================================================================

def create_year_sort_column(df, year_col='Year'):
    """
    Create a numeric sort column for mixed int/str year columns

    The LTM label (a string containing the LTM end year) sorts just after
    the numeric LTM end year; unrecognized strings sort last (9999).

    Args:
        df: DataFrame
        year_col: Name of year column

    Returns:
        Series with numeric sort values
    """
    from config import LTM_END_YEAR

    def sort_value(x):
        if isinstance(x, str) and str(LTM_END_YEAR) in x:
            return float(LTM_END_YEAR) + 0.5
        elif isinstance(x, (int, float)):
            return float(x)
        else:
            return 9999

    return df[year_col].apply(sort_value)

def sort_mixed_years(df, year_col='Year'):
    """
    Sort DataFrame by year column that may contain mixed int/str types

    Args:
        df: DataFrame
        year_col: Name of year column

    Returns:
        Sorted DataFrame
    """
    df = df.copy()
    df['_Year_Sort'] = create_year_sort_column(df, year_col)
    df = df.sort_values('_Year_Sort').drop(columns=['_Year_Sort'])
    return df

def safe_year_labels(years):
    """
    Convert year values to safe string labels for chart axes

    Args:
        years: Iterable of year values (int or str)

    Returns:
        List of string labels
    """
    return [str(year) for year in years]

# ============================================================================
# CHART HELPERS
# ============================================================================

def setup_revenue_chart(ax, ylabel: str = 'Revenue (Millions USD)') -> None:
    """
    Setup a chart axis for revenue display (millions)

    CRITICAL: Always use this function for revenue charts. It applies
    the millions formatter and standard styling.

    IMPORTANT: Data must be divided by 1e6 BEFORE plotting:
        ax.plot(revenue / 1e6, ...)  # correct
        ax.plot(revenue, ...)        # wrong - will show scientific notation

    Args:
        ax: Matplotlib axis object to configure
        ylabel: Y-axis label (default: 'Revenue (Millions USD)')

    Returns:
        None: Modifies ax in place

    See Also:
        - .cursor/rules/chart_formatting.md for detailed patterns
        - save_chart() for saving charts
    """
    ax.yaxis.set_major_formatter(get_millions_formatter())
    ax.set_ylabel(ylabel)
    ax.grid(True, alpha=0.3)

def save_chart(fig, filename, output_dir=None):
    """
    Save chart to file with organized directory structure

    Args:
        fig: Matplotlib figure object
        filename: Output filename (e.g., 'revenue_trend.png')
        output_dir: Output directory (defaults to config.OUTPUT_DIR)
    """
    if output_dir is None:
        output_dir = OUTPUT_DIR
    else:
        output_dir = Path(output_dir)

    # parents=True so nested output directories (e.g. charts/2025) work too.
    output_dir.mkdir(parents=True, exist_ok=True)

    filepath = output_dir / filename
    # NOTE(review): format is hardcoded to 'png' even though config defines
    # CHART_FORMAT — changing it would require importing CHART_FORMAT here.
    fig.savefig(filepath, dpi=CHART_DPI, bbox_inches=CHART_BBOX, format='png')
    print(f"Chart saved: {filepath}")

# ============================================================================
# DATA VALIDATION
# ============================================================================

def validate_dataframe(df, required_columns=None):
    """
    Validate DataFrame has required columns and basic data quality

    Args:
        df: DataFrame to validate
        required_columns: List of required column names (defaults to
            [REVENUE_COLUMN, 'Year'])

    Returns:
        tuple: (is_valid bool, error_message str)
    """
    if required_columns is None:
        # 'YearMonth' is optional and is never treated as required: the old
        # code appended it only when already present, which was a no-op.
        required_columns = [REVENUE_COLUMN, 'Year']

    missing_cols = [col for col in required_columns if col not in df.columns]
    if missing_cols:
        return False, f"Missing required columns: {missing_cols}"

    if len(df) == 0:
        return False, "DataFrame is empty"

    if REVENUE_COLUMN in df.columns:
        if df[REVENUE_COLUMN].isna().all():
            return False, f"All {REVENUE_COLUMN} values are NaN"

    return True, "OK"

# ============================================================================
# PRICE CALCULATION
# ============================================================================

def calculate_price_per_unit(df, quantity_col=None, revenue_col=None):
    """
    Calculate average price per unit, excluding invalid quantities

    Rows with quantity <= MIN_QUANTITY or > MAX_QUANTITY are excluded
    before the revenue / quantity division.

    Args:
        df: DataFrame with quantity and revenue columns
        quantity_col: Name of quantity column (defaults to config)
        revenue_col: Name of revenue column (defaults to config)

    Returns:
        float: Average price per unit, or NaN if it cannot be computed
    """
    from config import QUANTITY_COLUMN, REVENUE_COLUMN, MIN_QUANTITY, MAX_QUANTITY

    if quantity_col is None:
        quantity_col = QUANTITY_COLUMN
    if revenue_col is None:
        revenue_col = REVENUE_COLUMN

    # Check if quantity column exists
    if quantity_col not in df.columns:
        return np.nan

    # Filter for valid quantity transactions
    df_valid = df[(df[quantity_col] > MIN_QUANTITY) & (df[quantity_col] <= MAX_QUANTITY)].copy()

    if len(df_valid) == 0:
        return np.nan

    total_revenue = df_valid[revenue_col].sum()
    total_quantity = df_valid[quantity_col].sum()

    if total_quantity == 0:
        return np.nan

    return total_revenue / total_quantity

# ============================================================================
# OUTPUT FORMATTING
# ============================================================================

def format_currency(value: float, millions: bool = True) -> str:
    """
    Format currency value for console output

    Args:
        value: Numeric value to format
        millions: If True, format as millions ($X.Xm), else thousands ($X.Xk)

    Returns:
        str: Formatted string like "$99.9m" or "$99.9k" or "N/A" if NaN

    Example:
        >>> format_currency(1000000)
        '$1.00m'
        >>> format_currency(1000, millions=False)
        '$1.00k'
    """
    if pd.isna(value):
        return "N/A"

    if millions:
        return f"${value / 1e6:.2f}m"
    else:
        return f"${value / 1e3:.2f}k"

def print_annual_summary(annual_df, metric_col='Revenue', label='Revenue'):
    """
    Print formatted annual summary to console

    Args:
        annual_df: DataFrame with annual metrics (indexed by Year)
        metric_col: Column name to print
        label: Label for the metric
    """
    print(f"\n{label} by Year:")
    print("-" * 40)
    for year in annual_df.index:
        value = annual_df.loc[year, metric_col]
        formatted = format_currency(value)
        print(f"  {year}: {formatted}")
    print()

# ============================================================================
# DATA FILTERING HELPERS
# ============================================================================

def apply_exclusion_filters(df):
    """
    Apply exclusion filters from config

    Args:
        df: DataFrame to filter

    Returns:
        Filtered DataFrame (unchanged if filters are disabled or not matched)
    """
    from config import EXCLUSION_FILTERS

    if not EXCLUSION_FILTERS.get('enabled', False):
        return df

    exclude_col = EXCLUSION_FILTERS.get('exclude_by_column')
    exclude_values = EXCLUSION_FILTERS.get('exclude_values', [])

    if exclude_col and exclude_col in df.columns and exclude_values:
        original_count = len(df)
        df_filtered = df[~df[exclude_col].isin(exclude_values)]
        excluded_count = original_count - len(df_filtered)
        if excluded_count > 0:
            print(f"Excluded {excluded_count:,} rows based on {exclude_col} filter")
        return df_filtered

    return df

# ============================================================================
# INTERACTIVE VISUALIZATIONS (OPTIONAL - PLOTLY)
# ============================================================================

def create_interactive_chart(data, chart_type='line', title=None, xlabel=None, ylabel=None):
    """
    Create interactive chart using Plotly (optional dependency)

    Args:
        data: dict with 'x' and 'y' keys holding the chart data
        chart_type: Type of chart ('line', 'bar', 'scatter')
        title: Chart title
        xlabel: X-axis label
        ylabel: Y-axis label

    Returns:
        plotly.graph_objects.Figure: Plotly figure object

    Raises:
        ImportError: If plotly is not installed

    Example:
        fig = create_interactive_chart(
            {'x': [1, 2, 3], 'y': [10, 20, 30]},
            chart_type='line',
            title='Revenue Trend'
        )
        fig.show()
    """
    try:
        import plotly.graph_objects as go
    except ImportError:
        raise ImportError(
            "plotly is required for interactive charts. Install with: pip install plotly"
        )

    fig = go.Figure()

    if isinstance(data, dict) and 'x' in data and 'y' in data:
        if chart_type == 'line':
            fig.add_trace(go.Scatter(
                x=data['x'],
                y=data['y'],
                mode='lines+markers',
                name='Data'
            ))
        elif chart_type == 'bar':
            fig.add_trace(go.Bar(
                x=data['x'],
                y=data['y'],
                name='Data'
            ))
        elif chart_type == 'scatter':
            # Previously documented but unimplemented: markers-only trace.
            fig.add_trace(go.Scatter(
                x=data['x'],
                y=data['y'],
                mode='markers',
                name='Data'
            ))

    if title:
        fig.update_layout(title=title)
    if xlabel:
        fig.update_xaxes(title_text=xlabel)
    if ylabel:
        fig.update_yaxes(title_text=ylabel)

    fig.update_layout(
        template='plotly_white',
        hovermode='x unified'
    )

    return fig

def save_interactive_chart(fig, filename, output_dir=None):
    """
    Save interactive Plotly chart to HTML file

    Args:
        fig: Plotly figure object
        filename: Output filename (e.g., 'chart.html')
        output_dir: Output directory (defaults to config.OUTPUT_DIR)

    Returns:
        Path: Full path of the written HTML file
    """
    if output_dir is None:
        output_dir = OUTPUT_DIR
    else:
        output_dir = Path(output_dir)

    # parents=True so nested output directories work (matches save_chart).
    output_dir.mkdir(parents=True, exist_ok=True)
    filepath = output_dir / filename

    fig.write_html(str(filepath))
    print(f"Interactive chart saved: {filepath}")

    return filepath
# Usage:
#     from config import REVENUE_COLUMN, DATE_COLUMN, get_data_path
#     revenue = df[REVENUE_COLUMN].sum()  # correct
#     revenue = df['USD'].sum()           # wrong - hardcoded
#
# Quick Setup:
#     1. Run: python setup_wizard.py (interactive configuration)
#     2. Or manually edit this file following the TODO comments
#     3. Validate: python config_validator.py
#
# See Also:
#     - .cursor/rules/analysis_patterns.md - How to use config values
#     - setup_wizard.py - Interactive configuration tool
#     - config_validator.py - Configuration validation
from pathlib import Path
from typing import Optional, Tuple
import pandas as pd

# ============================================================================
# COMPANY INFORMATION
# ============================================================================
# TODO: Update these values for your company
COMPANY_NAME = "Your Company Name"  # Update this
ANALYSIS_DATE = "2026-01-12"  # Update this to current date

# ============================================================================
# DATA FILES
# ============================================================================
# TODO: Update with your actual data file name
DATA_FILE = 'sales_data.csv'  # Update this to your CSV file name
OUTPUT_DIR = Path('charts')
REPORTS_DIR = Path('reports')
DATA_DIR = Path('data')  # Optional: if data is in a subdirectory

# ============================================================================
# DATA COLUMN MAPPINGS
# ============================================================================
# TODO: Map these to your actual column names
# These are the expected column names - update if your CSV uses different names

# Revenue column (REQUIRED)
REVENUE_COLUMN = 'USD'  # Common alternatives: 'Amount', 'Revenue', 'Total', 'Sales'

# Date columns (at least one required)
DATE_COLUMN = 'InvoiceDate'  # Primary date column
DATE_FALLBACK_COLUMNS = ['Month', 'Year']  # Fallback columns if primary is missing

# Customer/Account columns
CUSTOMER_COLUMN = 'Customer'  # Common alternatives: 'Account', 'CustomerName', 'Client'

# Product/Item columns
ITEM_COLUMN = 'Item'  # Common alternatives: 'Product', 'SKU', 'ItemCode'
PRODUCT_GROUP_COLUMN = 'ProductGroup'  # Optional: for product categorization
QUANTITY_COLUMN = 'Quantity'  # Optional: for price calculations

# Geographic columns (optional)
REGION_COLUMN = 'Region'  # Optional: for geographic analysis
COUNTRY_COLUMN = 'Country'  # Optional: for country-level analysis

# Segment/Category columns (optional - customize based on your data)
SEGMENT_COLUMNS = {
    'Technology': 'Technology',      # Optional: technology/product type
    'EndMarket': 'EndMarket',        # Optional: end market/industry
    'ProductGroup': 'ProductGroup',  # Optional: product category
}

# Invoice/Transaction columns
INVOICE_NUMBER_COLUMN = 'Invoice #'  # Optional: for transaction-level analysis

# ============================================================================
# DATE RANGE CONFIGURATION
# ============================================================================
# TODO: Update these based on your data and analysis needs

# Analysis years (years to include in analysis)
ANALYSIS_YEARS = [2021, 2022, 2023, 2024, 2025]  # Update based on your data

# LTM (Last Twelve Months) Configuration
# For the most recent partial year, use LTM for apples-to-apples comparison
# Example: If latest data is through September 2025, use Oct 2024 - Sep 2025
LTM_ENABLED = True  # Set to False if you have complete calendar years only
LTM_START_MONTH = 10  # Month number (1-12) for LTM start
LTM_START_YEAR = 2024  # Year for LTM start
LTM_END_MONTH = 9  # Month number (1-12) for LTM end
LTM_END_YEAR = 2025  # Year for LTM end

# Generate LTM period objects
if LTM_ENABLED:
    LTM_START = pd.Period(f'{LTM_START_YEAR}-{LTM_START_MONTH:02d}', freq='M')
    LTM_END = pd.Period(f'{LTM_END_YEAR}-{LTM_END_MONTH:02d}', freq='M')
    LTM_LABEL = f'{LTM_END_YEAR} (LTM {LTM_END_MONTH}/{LTM_END_YEAR})'
else:
    LTM_START = None
    LTM_END = None
    LTM_LABEL = None

# Data date range (filter data to this range)
MIN_YEAR = 2021  # Minimum year to include
MAX_DATE = pd.Timestamp('2025-09-30')  # Maximum date to include (update based on your data)

# ============================================================================
# CHART SETTINGS
# ============================================================================
CHART_DPI = 300
CHART_FORMAT = 'png'
CHART_BBOX = 'tight'
CHART_STYLE = 'seaborn-v0_8'  # Options: 'default', 'ggplot', 'seaborn-v0_8', etc.

# Chart size presets
CHART_SIZES = {
    'small': (6, 4),
    'medium': (10, 6),
    'large': (12, 8),
    'wide': (14, 6)
}

# ============================================================================
# DATA FILTERING
# ============================================================================
# Quantity filtering for price calculations (exclude outliers)
MIN_QUANTITY = 0  # Minimum valid quantity
MAX_QUANTITY = 1000  # Maximum valid quantity (adjust based on your data)

# Revenue filtering (optional - exclude negative values, returns, etc.)
EXCLUDE_NEGATIVE_REVENUE = False  # Set to True to exclude negative revenue (returns/credits)
MIN_REVENUE = None  # Optional: minimum revenue threshold

# ============================================================================
# EXCLUSION FILTERS (Optional)
# ============================================================================
# Use this section to exclude specific segments, customers, or products
# Example: Exclude a business unit, test accounts, etc.

EXCLUSION_FILTERS = {
    'enabled': False,           # Set to True to enable exclusions
    'exclude_by_column': None,  # Column name to filter on (e.g., 'Country', 'Segment')
    'exclude_values': [],       # List of values to exclude (e.g., ['KVT', 'Test'])
}

# ============================================================================
# VALIDATION THRESHOLDS (Optional)
# ============================================================================
# Expected revenue ranges for validation (update based on your company)
# These are used to validate that data loading is working correctly
VALIDATION_ENABLED = False  # Set to True to enable validation
EXPECTED_REVENUE = {}  # Example: {2021: 99_880_000, 2024: 89_990_000}
REVENUE_TOLERANCE_PCT = 0.01  # 1% tolerance for validation

# ============================================================================
# HELPER FUNCTIONS
# ============================================================================
def ensure_directories() -> None:
    """
    Create output directories if they don't exist

    Creates charts/ and reports/ directories for saving analysis outputs.
    Called automatically by get_chart_path() and get_report_path().

    DATA_DIR is deliberately NOT created here: get_data_path() treats the
    *existence* of DATA_DIR as the signal that data lives in a subdirectory,
    so auto-creating it would silently redirect data loading.
    (The previous `if DATA_DIR.exists(): DATA_DIR.mkdir(...)` was a no-op.)

    Returns:
        None: Creates directories in place
    """
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    REPORTS_DIR.mkdir(parents=True, exist_ok=True)

def get_chart_path(filename: str) -> Path:
    """
    Get full path for chart file

    Args:
        filename: Chart filename (e.g., 'revenue_trend.png')

    Returns:
        Path: Full path to chart file in OUTPUT_DIR
    """
    ensure_directories()
    return OUTPUT_DIR / filename

def get_report_path(filename: str) -> Path:
    """
    Get full path for report file

    Args:
        filename: Report filename (e.g., 'analysis_report.pdf')

    Returns:
        Path: Full path to report file in REPORTS_DIR
    """
    ensure_directories()
    return REPORTS_DIR / filename

def get_data_path(filename: Optional[str] = None) -> Path:
    """
    Get full path for data file

    This function handles data file location logic:
    - If DATA_DIR exists, looks there first
    - Otherwise uses current directory
    - Defaults to DATA_FILE from config if filename not provided

    Args:
        filename: Optional filename override (defaults to config.DATA_FILE)

    Returns:
        Path: Full path to data file

    Example:
        >>> from config import get_data_path
        >>> data_path = get_data_path()
        >>> print(f"Loading from: {data_path}")
    """
    if filename is None:
        filename = DATA_FILE
    if DATA_DIR.exists():
        return DATA_DIR / filename
    return Path(filename)

def get_ltm_period() -> Tuple[Optional[pd.Period], Optional[pd.Period]]:
    """
    Get LTM (Last Twelve Months) period boundaries from config

    Returns LTM start and end periods if LTM is enabled and configured,
    otherwise returns (None, None).

    Returns:
        Tuple[Optional[pd.Period], Optional[pd.Period]]:
            (ltm_start, ltm_end) or (None, None) if disabled

    See Also:
        - get_ltm_label() - Get formatted LTM label string
        - .cursor/rules/ltm_methodology.md - LTM explanation
    """
    if LTM_ENABLED and LTM_START and LTM_END:
        return LTM_START, LTM_END
    return None, None
print(f"LTM: {ltm_start} to {ltm_end}") + + See Also: + - get_ltm_label() - Get formatted LTM label string + - .cursor/rules/ltm_methodology.md - LTM explanation + """ + if LTM_ENABLED and LTM_START and LTM_END: + return LTM_START, LTM_END + return None, None + +def get_ltm_label() -> Optional[str]: + """ + Get LTM label string for display + + Returns formatted label like "2025 (LTM 9/2025)" if LTM is enabled, + otherwise None. Use this in chart titles and labels. + + Returns: + Optional[str]: LTM label string or None if LTM disabled + + Example: + >>> from config import get_ltm_label + >>> ltm_label = get_ltm_label() + >>> if ltm_label: + ... title = f'Revenue Trend\n({ltm_label})' + + See Also: + - get_ltm_period() - Get LTM period objects + - .cursor/rules/ltm_methodology.md - LTM usage guide + """ + return LTM_LABEL if LTM_ENABLED else None diff --git a/config_validator.py b/config_validator.py new file mode 100644 index 0000000..87597fd --- /dev/null +++ b/config_validator.py @@ -0,0 +1,214 @@ +""" +Configuration validation utility +Validates configuration settings against data to catch errors early + +Usage: + from config_validator import validate_config + + # Validate configuration + errors, warnings = validate_config(df) + if errors: + print("Configuration errors found:", errors) +""" +import pandas as pd +from pathlib import Path +from config import ( + DATA_FILE, REVENUE_COLUMN, DATE_COLUMN, DATE_FALLBACK_COLUMNS, + CUSTOMER_COLUMN, ITEM_COLUMN, QUANTITY_COLUMN, + MIN_YEAR, MAX_DATE, ANALYSIS_YEARS, + LTM_ENABLED, LTM_START, LTM_END, LTM_START_YEAR, LTM_END_YEAR, + EXCLUSION_FILTERS, get_data_path +) + +def validate_config(df=None): + """ + Validate configuration against data + + Args: + df: Optional DataFrame to validate against. If None, attempts to load data. 
+ + Returns: + tuple: (errors list, warnings list) + + Example: + errors, warnings = validate_config(df) + if errors: + for error in errors: + print(f"ERROR: {error}") + if warnings: + for warning in warnings: + print(f"WARNING: {warning}") + """ + errors = [] + warnings = [] + + # Load data if not provided + if df is None: + try: + from data_loader import load_sales_data + data_path = get_data_path() + if not data_path.exists(): + errors.append(f"Data file not found: {data_path}") + return errors, warnings + df = load_sales_data(data_path) + except Exception as e: + errors.append(f"Could not load data for validation: {e}") + return errors, warnings + + # 1. Validate required columns exist + required_columns = [REVENUE_COLUMN, DATE_COLUMN] + for col in required_columns: + if col not in df.columns: + errors.append(f"Required column '{col}' not found in data. Available columns: {list(df.columns)[:10]}") + + # 2. Validate date column has valid dates + if DATE_COLUMN in df.columns: + date_coverage = df[DATE_COLUMN].notna().sum() / len(df) * 100 + if date_coverage < 50: + errors.append(f"Date coverage is very low ({date_coverage:.1f}%). Check date column configuration.") + elif date_coverage < 90: + warnings.append(f"Date coverage is {date_coverage:.1f}%. Consider adding fallback date columns.") + + # 3. Validate fallback date columns + if DATE_FALLBACK_COLUMNS: + missing_fallbacks = [col for col in DATE_FALLBACK_COLUMNS if col not in df.columns] + if missing_fallbacks: + warnings.append(f"Fallback date columns not found: {missing_fallbacks}") + + # 4. 
Validate revenue column is numeric + if REVENUE_COLUMN in df.columns: + try: + pd.to_numeric(df[REVENUE_COLUMN], errors='coerce') + valid_revenue = df[REVENUE_COLUMN].notna().sum() + if valid_revenue == 0: + errors.append(f"Revenue column '{REVENUE_COLUMN}' has no valid numeric values") + elif valid_revenue < len(df) * 0.9: + warnings.append(f"Revenue column has {len(df) - valid_revenue} invalid values") + except Exception: + errors.append(f"Revenue column '{REVENUE_COLUMN}' cannot be converted to numeric") + + # 5. Validate date range + if DATE_COLUMN in df.columns and df[DATE_COLUMN].notna().any(): + min_date_in_data = df[DATE_COLUMN].min() + max_date_in_data = df[DATE_COLUMN].max() + + if MIN_YEAR and min_date_in_data.year > MIN_YEAR: + warnings.append(f"MIN_YEAR ({MIN_YEAR}) is earlier than earliest data ({min_date_in_data.year})") + + if MAX_DATE and max_date_in_data > MAX_DATE: + warnings.append(f"MAX_DATE ({MAX_DATE.date()}) is earlier than latest data ({max_date_in_data.date()})") + + # 6. Validate analysis years + if 'Year' in df.columns: + available_years = sorted(df['Year'].unique()) + missing_years = [year for year in ANALYSIS_YEARS if year not in available_years] + if missing_years: + warnings.append(f"ANALYSIS_YEARS includes years not in data: {missing_years}") + + # 7. Validate LTM configuration + if LTM_ENABLED: + if LTM_START is None or LTM_END is None: + errors.append("LTM_ENABLED is True but LTM_START or LTM_END is None") + else: + if LTM_START > LTM_END: + errors.append(f"LTM_START ({LTM_START}) is after LTM_END ({LTM_END})") + + if 'YearMonth' in df.columns: + available_periods = df['YearMonth'].unique() + if LTM_START not in available_periods: + warnings.append(f"LTM_START ({LTM_START}) not found in data") + if LTM_END not in available_periods: + warnings.append(f"LTM_END ({LTM_END}) not found in data") + + # 8. 
Validate exclusion filters + if EXCLUSION_FILTERS.get('enabled', False): + exclude_col = EXCLUSION_FILTERS.get('exclude_by_column') + if exclude_col: + if exclude_col not in df.columns: + errors.append(f"Exclusion filter column '{exclude_col}' not found in data") + else: + exclude_values = EXCLUSION_FILTERS.get('exclude_values', []) + if exclude_values: + available_values = df[exclude_col].unique() + invalid_values = [v for v in exclude_values if v not in available_values] + if invalid_values: + warnings.append(f"Exclusion filter values not found in data: {invalid_values}") + + # 9. Validate optional columns (warnings only) + optional_columns = { + 'Customer': CUSTOMER_COLUMN, + 'Item': ITEM_COLUMN, + 'Quantity': QUANTITY_COLUMN + } + + for col_type, col_name in optional_columns.items(): + if col_name and col_name not in df.columns: + warnings.append(f"Optional {col_type} column '{col_name}' not found. Some analyses may not work.") + + # 10. Validate data file exists + data_path = get_data_path() + if not data_path.exists(): + errors.append(f"Data file not found: {data_path}") + + return errors, warnings + +def print_validation_report(errors, warnings): + """ + Print a formatted validation report + + Args: + errors: List of error messages + warnings: List of warning messages + """ + print("\n" + "="*60) + print("Configuration Validation Report") + print("="*60) + + if errors: + print(f"\nāŒ ERRORS ({len(errors)}):") + for i, error in enumerate(errors, 1): + print(f" {i}. {error}") + else: + print("\nāœ… No configuration errors found") + + if warnings: + print(f"\nāš ļø WARNINGS ({len(warnings)}):") + for i, warning in enumerate(warnings, 1): + print(f" {i}. 
{warning}") + else: + print("\nāœ… No warnings") + + print("\n" + "="*60) + + if errors: + return False + return True + +def validate_and_report(df=None): + """ + Validate configuration and print report + + Args: + df: Optional DataFrame to validate against + + Returns: + bool: True if no errors, False otherwise + """ + errors, warnings = validate_config(df) + return print_validation_report(errors, warnings) + +# ============================================================================ +# STANDALONE VALIDATION SCRIPT +# ============================================================================ + +if __name__ == "__main__": + """Run configuration validation""" + print("Validating configuration...") + is_valid = validate_and_report() + + if is_valid: + print("\nāœ… Configuration is valid!") + exit(0) + else: + print("\nāŒ Configuration has errors. Please fix them before running analyses.") + exit(1) diff --git a/data_loader.py b/data_loader.py new file mode 100644 index 0000000..255bad5 --- /dev/null +++ b/data_loader.py @@ -0,0 +1,224 @@ +""" +Generic data loading utility with flexible date handling +Handles various date column formats and fallback logic + +This loader is designed to work with different CSV structures by: +1. Trying primary date column first +2. Falling back to alternative date columns if needed +3. Ensuring 100% date coverage +""" +import pandas as pd +import numpy as np +from pathlib import Path +from config import ( + REVENUE_COLUMN, DATE_COLUMN, DATE_FALLBACK_COLUMNS, + get_data_path +) + +def load_sales_data(filepath=None): + """ + Load sales data with flexible date handling + + This function provides intelligent data loading with fallback logic: + 1. Loads the CSV file + 2. Converts revenue column to numeric + 3. Attempts to parse dates using primary date column + 4. Falls back to alternative date columns if needed (100% coverage) + 5. 
Creates Year and YearMonth columns for analysis
+
+    CRITICAL: Always use this function instead of pd.read_csv() directly.
+    This ensures proper date parsing with fallback logic.
+
+    Args:
+        filepath: Path to the CSV file (defaults to config.DATA_FILE).
+                  Can be str, Path, or None (uses config.get_data_path())
+
+    Returns:
+        pd.DataFrame: DataFrame with properly parsed dates and revenue.
+                      Includes 'Year' and 'YearMonth' columns.
+
+    Raises:
+        FileNotFoundError: If data file doesn't exist.
+                           Error message includes file path and suggests checking config.py
+        ValueError: If required columns (REVENUE_COLUMN) are missing.
+                    Error message lists available columns and suggests updating config.py
+
+    Example:
+        >>> from data_loader import load_sales_data
+        >>> from config import get_data_path
+        >>> df = load_sales_data(get_data_path())
+        >>> print(f"Loaded {len(df):,} rows, {df['Year'].notna().sum():,} of which have dates")
+
+    See Also:
+        - .cursor/rules/data_loading.md for detailed patterns
+        - config.py for column name configuration
+    """
+    # Get data file path
+    if filepath is None:
+        filepath = get_data_path()
+    else:
+        filepath = Path(filepath)
+
+    # Check if file exists
+    if not filepath.exists():
+        raise FileNotFoundError(
+            f"Data file not found: {filepath}\n"
+            f"Please update config.py with the correct DATA_FILE path."
+        )
+
+    # Load CSV
+    print(f"Loading data from: {filepath}")
+    df = pd.read_csv(filepath, low_memory=False)
+    print(f"Loaded {len(df):,} rows")
+
+    # Validate required columns
+    if REVENUE_COLUMN not in df.columns:
+        raise ValueError(
+            f"Required column '{REVENUE_COLUMN}' not found in data.\n"
+            f"Available columns: {list(df.columns)}\n"
+            f"Please update config.py REVENUE_COLUMN to match your data."
+ ) + + # Convert revenue column to numeric + df[REVENUE_COLUMN] = pd.to_numeric(df[REVENUE_COLUMN], errors='coerce') + + # Count missing revenue values + missing_revenue = df[REVENUE_COLUMN].isna().sum() + if missing_revenue > 0: + print(f"Warning: {missing_revenue:,} rows have missing/invalid revenue values") + + # Create working date column + df['WorkingDate'] = pd.NaT + + # Try primary date column first + if DATE_COLUMN in df.columns: + print(f"Attempting to parse {DATE_COLUMN}...") + df['Date_Parsed'] = pd.to_datetime(df[DATE_COLUMN], errors='coerce', format='mixed') + parsed_count = df['Date_Parsed'].notna().sum() + df.loc[df['Date_Parsed'].notna(), 'WorkingDate'] = df.loc[df['Date_Parsed'].notna(), 'Date_Parsed'] + print(f" Parsed {parsed_count:,} dates from {DATE_COLUMN}") + else: + print(f"Warning: Primary date column '{DATE_COLUMN}' not found") + + # Use fallback date columns + if DATE_FALLBACK_COLUMNS: + for fallback_col in DATE_FALLBACK_COLUMNS: + if fallback_col in df.columns: + missing_dates = df['WorkingDate'].isna() + if missing_dates.sum() > 0: + print(f"Using fallback column: {fallback_col}...") + fallback_parsed = pd.to_datetime( + df.loc[missing_dates, fallback_col], + errors='coerce', + format='mixed' + ) + newly_parsed = missing_dates & fallback_parsed.notna() + if newly_parsed.sum() > 0: + df.loc[newly_parsed, 'WorkingDate'] = fallback_parsed[newly_parsed] + print(f" Parsed {newly_parsed.sum():,} additional dates from {fallback_col}") + + # Final fallback: try to construct from Year column if available + if 'Year' in df.columns and df['WorkingDate'].isna().sum() > 0: + missing_dates = df['WorkingDate'].isna() + year_values = pd.to_numeric(df.loc[missing_dates, 'Year'], errors='coerce') + valid_years = missing_dates & year_values.notna() + if valid_years.sum() > 0: + print(f"Using Year column for remaining {valid_years.sum():,} rows...") + df.loc[valid_years, 'WorkingDate'] = pd.to_datetime( + df.loc[valid_years, 
'Year'].astype(int).astype(str) + '-01-01', + errors='coerce' + ) + + # Set WorkingDate as the primary date column + df[DATE_COLUMN] = df['WorkingDate'] + + # Clean up temporary columns + df = df.drop(columns=['Date_Parsed', 'WorkingDate'], errors='ignore') + + # Extract Year from date column + df['Year'] = df[DATE_COLUMN].dt.year + + # Fill missing Year from Year column if it exists and date is missing + if 'Year' in df.columns: + year_orig = pd.to_numeric(df['Year'], errors='coerce') + missing_year = df['Year'].isna() + if missing_year.sum() > 0 and 'Year' in df.columns: + year_fallback = pd.to_numeric(df.loc[missing_year, 'Year'], errors='coerce') + df.loc[missing_year & year_fallback.notna(), 'Year'] = year_fallback[missing_year & year_fallback.notna()] + + # Create YearMonth for monthly analysis + if DATE_COLUMN in df.columns: + df['YearMonth'] = df[DATE_COLUMN].dt.to_period('M') + + # Report date coverage + total_rows = len(df) + date_coverage = df[DATE_COLUMN].notna().sum() + coverage_pct = (date_coverage / total_rows * 100) if total_rows > 0 else 0 + print(f"Date coverage: {date_coverage:,} / {total_rows:,} rows ({coverage_pct:.1f}%)") + + if coverage_pct < 100: + print(f"Warning: {total_rows - date_coverage:,} rows have missing dates") + + # Report date range + if df[DATE_COLUMN].notna().any(): + min_date = df[DATE_COLUMN].min() + max_date = df[DATE_COLUMN].max() + print(f"Date range: {min_date.strftime('%Y-%m-%d')} to {max_date.strftime('%Y-%m-%d')}") + + return df + +def validate_data_structure(df: pd.DataFrame) -> tuple[bool, str]: + """ + Validate that loaded data has expected structure. + + Checks for required columns, data quality, and basic validity. + Returns actionable error messages if validation fails. 
+ + Args: + df: DataFrame to validate (should be result of load_sales_data()) + + Returns: + tuple[bool, str]: (is_valid, error_message) + - is_valid: True if data structure is valid, False otherwise + - error_message: "OK" if valid, otherwise descriptive error message + + Example: + >>> df = load_sales_data(get_data_path()) + >>> is_valid, msg = validate_data_structure(df) + >>> if not is_valid: + ... print(f"ERROR: {msg}") + + See Also: + - load_sales_data() - Load data before validating + - config_validator.py - Comprehensive configuration validation + """ + from config import REVENUE_COLUMN, DATE_COLUMN + + errors = [] + + # Check required columns + if REVENUE_COLUMN not in df.columns: + errors.append(f"Missing required column: {REVENUE_COLUMN}") + + if DATE_COLUMN not in df.columns: + errors.append(f"Missing required column: {DATE_COLUMN}") + + # Check data quality + if len(df) == 0: + errors.append("DataFrame is empty") + + if REVENUE_COLUMN in df.columns: + if df[REVENUE_COLUMN].isna().all(): + errors.append(f"All {REVENUE_COLUMN} values are NaN") + + if df[REVENUE_COLUMN].notna().sum() == 0: + errors.append(f"No valid {REVENUE_COLUMN} values") + + if DATE_COLUMN in df.columns: + if df[DATE_COLUMN].isna().all(): + errors.append(f"All {DATE_COLUMN} values are NaN") + + if errors: + return False, "; ".join(errors) + + return True, "OK" diff --git a/data_processing.py b/data_processing.py new file mode 100644 index 0000000..f1bbe9b --- /dev/null +++ b/data_processing.py @@ -0,0 +1,285 @@ +""" +Data processing utilities +Common data cleaning and transformation helpers + +Usage: + from data_processing import clean_data, create_pivot_table, prepare_time_series + + # Clean data + df_clean = clean_data(df) + + # Create pivot table + pivot = create_pivot_table(df, index='Year', columns='Product', values='Revenue') +""" +import pandas as pd +import numpy as np +from config import REVENUE_COLUMN, DATE_COLUMN, MIN_QUANTITY, MAX_QUANTITY + +def clean_data(df, 
remove_duplicates=True, handle_missing_dates=True): + """ + Clean data with common operations + + Args: + df: DataFrame to clean + remove_duplicates: Whether to remove duplicate rows + handle_missing_dates: Whether to handle missing dates + + Returns: + DataFrame: Cleaned DataFrame + """ + df_clean = df.copy() + + # Remove duplicates + if remove_duplicates: + initial_count = len(df_clean) + df_clean = df_clean.drop_duplicates() + removed = initial_count - len(df_clean) + if removed > 0: + print(f"Removed {removed:,} duplicate rows") + + # Handle missing dates + if handle_missing_dates and DATE_COLUMN in df_clean.columns: + missing_dates = df_clean[DATE_COLUMN].isna().sum() + if missing_dates > 0: + print(f"Warning: {missing_dates:,} rows have missing dates") + + # Remove rows with negative revenue (if configured) + if REVENUE_COLUMN in df_clean.columns: + negative_revenue = (df_clean[REVENUE_COLUMN] < 0).sum() + if negative_revenue > 0: + print(f"Found {negative_revenue:,} rows with negative revenue") + # Optionally remove: df_clean = df_clean[df_clean[REVENUE_COLUMN] >= 0] + + return df_clean + +def create_pivot_table(df, index, columns=None, values=None, aggfunc='sum', fill_value=0): + """ + Create pivot table with common defaults + + Args: + df: DataFrame + index: Column(s) to use as index + columns: Column(s) to use as columns + values: Column(s) to aggregate + aggfunc: Aggregation function (default: 'sum') + fill_value: Value to fill missing cells (default: 0) + + Returns: + DataFrame: Pivot table + """ + if values is None and REVENUE_COLUMN in df.columns: + values = REVENUE_COLUMN + + pivot = pd.pivot_table( + df, + index=index, + columns=columns, + values=values, + aggfunc=aggfunc, + fill_value=fill_value + ) + + return pivot + +def prepare_time_series(df, date_column=None, value_column=None, freq='M'): + """ + Prepare time series data + + Args: + df: DataFrame + date_column: Date column name (defaults to config.DATE_COLUMN) + value_column: Value column to 
aggregate (defaults to config.REVENUE_COLUMN) + freq: Frequency for resampling ('D', 'W', 'M', 'Q', 'Y') + + Returns: + Series: Time series data + """ + if date_column is None: + date_column = DATE_COLUMN + + if value_column is None: + value_column = REVENUE_COLUMN + + if date_column not in df.columns: + raise ValueError(f"Date column '{date_column}' not found") + + if value_column not in df.columns: + raise ValueError(f"Value column '{value_column}' not found") + + # Ensure date column is datetime + df = df.copy() + df[date_column] = pd.to_datetime(df[date_column], errors='coerce') + + # Set date as index + df_indexed = df.set_index(date_column) + + # Resample and aggregate + time_series = df_indexed[value_column].resample(freq).sum() + + return time_series + +def aggregate_by_period(df, period='year', date_column=None, value_column=None): + """ + Aggregate data by time period + + Args: + df: DataFrame + period: Period type ('year', 'month', 'quarter') + date_column: Date column name + value_column: Value column to aggregate + + Returns: + DataFrame: Aggregated data + """ + if date_column is None: + date_column = DATE_COLUMN + + if value_column is None: + value_column = REVENUE_COLUMN + + df = df.copy() + df[date_column] = pd.to_datetime(df[date_column], errors='coerce') + + # Extract period + if period == 'year': + df['Period'] = df[date_column].dt.year + elif period == 'month': + df['Period'] = df[date_column].dt.to_period('M') + elif period == 'quarter': + df['Period'] = df[date_column].dt.to_period('Q') + else: + raise ValueError(f"Unknown period: {period}") + + # Aggregate + aggregated = df.groupby('Period')[value_column].agg(['sum', 'count', 'mean']).reset_index() + aggregated.columns = ['Period', 'Total', 'Count', 'Average'] + + return aggregated + +def filter_outliers(df, column, method='iqr', lower_bound=None, upper_bound=None): + """ + Filter outliers from DataFrame + + Args: + df: DataFrame + column: Column name to filter on + method: Method ('iqr' for 
interquartile range, 'zscore' for z-score) + lower_bound: Manual lower bound + upper_bound: Manual upper bound + + Returns: + DataFrame: Filtered DataFrame + """ + df_filtered = df.copy() + + if method == 'iqr': + q1 = df[column].quantile(0.25) + q3 = df[column].quantile(0.75) + iqr = q3 - q1 + lower = lower_bound if lower_bound is not None else q1 - 1.5 * iqr + upper = upper_bound if upper_bound is not None else q3 + 1.5 * iqr + elif method == 'zscore': + mean = df[column].mean() + std = df[column].std() + lower = lower_bound if lower_bound is not None else mean - 3 * std + upper = upper_bound if upper_bound is not None else mean + 3 * std + else: + raise ValueError(f"Unknown method: {method}") + + initial_count = len(df_filtered) + df_filtered = df_filtered[(df_filtered[column] >= lower) & (df_filtered[column] <= upper)] + removed = initial_count - len(df_filtered) + + if removed > 0: + print(f"Removed {removed:,} outliers from {column} ({removed/initial_count*100:.1f}%)") + + return df_filtered + +def normalize_column(df, column, method='min_max'): + """ + Normalize a column + + Args: + df: DataFrame + column: Column name to normalize + method: Normalization method ('min_max', 'zscore') + + Returns: + Series: Normalized values + """ + if method == 'min_max': + min_val = df[column].min() + max_val = df[column].max() + if max_val - min_val == 0: + return pd.Series([0] * len(df), index=df.index) + return (df[column] - min_val) / (max_val - min_val) + elif method == 'zscore': + mean = df[column].mean() + std = df[column].std() + if std == 0: + return pd.Series([0] * len(df), index=df.index) + return (df[column] - mean) / std + else: + raise ValueError(f"Unknown method: {method}") + +def create_derived_columns(df): + """ + Create common derived columns + + Args: + df: DataFrame + + Returns: + DataFrame: DataFrame with derived columns + """ + df_derived = df.copy() + + # Extract year, month, quarter if date column exists + if DATE_COLUMN in df_derived.columns: + 
df_derived[DATE_COLUMN] = pd.to_datetime(df_derived[DATE_COLUMN], errors='coerce') + + if 'Year' not in df_derived.columns: + df_derived['Year'] = df_derived[DATE_COLUMN].dt.year + + if 'Month' not in df_derived.columns: + df_derived['Month'] = df_derived[DATE_COLUMN].dt.month + + if 'Quarter' not in df_derived.columns: + df_derived['Quarter'] = df_derived[DATE_COLUMN].dt.quarter + + if 'YearMonth' not in df_derived.columns: + df_derived['YearMonth'] = df_derived[DATE_COLUMN].dt.to_period('M') + + # Calculate price per unit if quantity and revenue exist + from config import QUANTITY_COLUMN + if QUANTITY_COLUMN in df_derived.columns and REVENUE_COLUMN in df_derived.columns: + df_derived['Price_Per_Unit'] = df_derived[REVENUE_COLUMN] / df_derived[QUANTITY_COLUMN].replace(0, np.nan) + + return df_derived + +# ============================================================================ +# EXAMPLE USAGE +# ============================================================================ + +if __name__ == "__main__": + """Example usage""" + # Create sample data + df = pd.DataFrame({ + 'InvoiceDate': pd.date_range('2023-01-01', periods=100, freq='D'), + 'USD': np.random.normal(1000, 200, 100), + 'Quantity': np.random.randint(1, 100, 100) + }) + + # Clean data + df_clean = clean_data(df) + print(f"Cleaned data: {len(df_clean)} rows") + + # Create pivot table + df_clean['Year'] = df_clean['InvoiceDate'].dt.year + pivot = create_pivot_table(df_clean, index='Year', values='USD') + print("\nPivot table:") + print(pivot) + + # Prepare time series + ts = prepare_time_series(df_clean, freq='M') + print(f"\nTime series: {len(ts)} periods") diff --git a/data_quality.py b/data_quality.py new file mode 100644 index 0000000..e41d01f --- /dev/null +++ b/data_quality.py @@ -0,0 +1,344 @@ +""" +Data quality reporting utility +Generates comprehensive data quality reports + +Usage: + from data_quality import generate_data_quality_report, print_data_quality_report + + # Generate and print report 
+ report = generate_data_quality_report(df) + print_data_quality_report(report) +""" +import pandas as pd +import numpy as np +from config import ( + REVENUE_COLUMN, DATE_COLUMN, CUSTOMER_COLUMN, ITEM_COLUMN, + QUANTITY_COLUMN, MIN_QUANTITY, MAX_QUANTITY +) + +def generate_data_quality_report(df): + """ + Generate comprehensive data quality report + + Args: + df: DataFrame to analyze + + Returns: + dict: Dictionary containing data quality metrics + """ + report = { + 'overview': {}, + 'missing_values': {}, + 'duplicates': {}, + 'outliers': {}, + 'data_types': {}, + 'date_coverage': {}, + 'revenue_summary': {}, + 'issues': [] + } + + # Overview + report['overview'] = { + 'total_rows': len(df), + 'total_columns': len(df.columns), + 'memory_usage_mb': df.memory_usage(deep=True).sum() / 1024**2 + } + + # Missing values + missing = df.isnull().sum() + missing_pct = (missing / len(df)) * 100 + report['missing_values'] = { + 'by_column': missing[missing > 0].to_dict(), + 'percentages': missing_pct[missing > 0].to_dict(), + 'total_missing': missing.sum(), + 'columns_with_missing': len(missing[missing > 0]) + } + + # Duplicates + duplicate_rows = df.duplicated().sum() + report['duplicates'] = { + 'duplicate_rows': int(duplicate_rows), + 'duplicate_percentage': (duplicate_rows / len(df)) * 100 if len(df) > 0 else 0 + } + + # Outliers (revenue and quantity) + outliers = {} + + if REVENUE_COLUMN in df.columns: + revenue = pd.to_numeric(df[REVENUE_COLUMN], errors='coerce') + q1 = revenue.quantile(0.25) + q3 = revenue.quantile(0.75) + iqr = q3 - q1 + lower_bound = q1 - 1.5 * iqr + upper_bound = q3 + 1.5 * iqr + + revenue_outliers = ((revenue < lower_bound) | (revenue > upper_bound)).sum() + outliers['revenue'] = { + 'count': int(revenue_outliers), + 'percentage': (revenue_outliers / len(df)) * 100 if len(df) > 0 else 0, + 'lower_bound': float(lower_bound), + 'upper_bound': float(upper_bound), + 'negative_values': int((revenue < 0).sum()) + } + + if QUANTITY_COLUMN in df.columns: 
+ quantity = pd.to_numeric(df[QUANTITY_COLUMN], errors='coerce') + # Use config thresholds if available + if MIN_QUANTITY is not None and MAX_QUANTITY is not None: + quantity_outliers = ((quantity < MIN_QUANTITY) | (quantity > MAX_QUANTITY)).sum() + outliers['quantity'] = { + 'count': int(quantity_outliers), + 'percentage': (quantity_outliers / len(df)) * 100 if len(df) > 0 else 0, + 'below_min': int((quantity < MIN_QUANTITY).sum()), + 'above_max': int((quantity > MAX_QUANTITY).sum()) + } + else: + q1 = quantity.quantile(0.25) + q3 = quantity.quantile(0.75) + iqr = q3 - q1 + lower_bound = q1 - 1.5 * iqr + upper_bound = q3 + 1.5 * iqr + + quantity_outliers = ((quantity < lower_bound) | (quantity > upper_bound)).sum() + outliers['quantity'] = { + 'count': int(quantity_outliers), + 'percentage': (quantity_outliers / len(df)) * 100 if len(df) > 0 else 0, + 'lower_bound': float(lower_bound), + 'upper_bound': float(upper_bound) + } + + report['outliers'] = outliers + + # Data types + report['data_types'] = { + 'numeric_columns': list(df.select_dtypes(include=[np.number]).columns), + 'datetime_columns': list(df.select_dtypes(include=['datetime64']).columns), + 'object_columns': list(df.select_dtypes(include=['object']).columns), + 'type_summary': df.dtypes.value_counts().to_dict() + } + + # Date coverage + if DATE_COLUMN in df.columns: + date_coverage = df[DATE_COLUMN].notna().sum() + report['date_coverage'] = { + 'total_rows': len(df), + 'rows_with_dates': int(date_coverage), + 'coverage_percentage': (date_coverage / len(df)) * 100 if len(df) > 0 else 0, + 'min_date': str(df[DATE_COLUMN].min()) if date_coverage > 0 else None, + 'max_date': str(df[DATE_COLUMN].max()) if date_coverage > 0 else None + } + + # Revenue summary + if REVENUE_COLUMN in df.columns: + revenue = pd.to_numeric(df[REVENUE_COLUMN], errors='coerce') + valid_revenue = revenue.dropna() + + if len(valid_revenue) > 0: + report['revenue_summary'] = { + 'total_revenue': float(valid_revenue.sum()), + 
'mean_revenue': float(valid_revenue.mean()), + 'median_revenue': float(valid_revenue.median()), + 'min_revenue': float(valid_revenue.min()), + 'max_revenue': float(valid_revenue.max()), + 'std_revenue': float(valid_revenue.std()), + 'valid_rows': int(len(valid_revenue)), + 'invalid_rows': int(len(df) - len(valid_revenue)) + } + + # Identify issues + issues = [] + + # Critical issues + if report['missing_values']['columns_with_missing'] > 0: + high_missing = {k: v for k, v in report['missing_values']['percentages'].items() if v > 50} + if high_missing: + issues.append({ + 'severity': 'critical', + 'issue': f"Columns with >50% missing values: {list(high_missing.keys())}", + 'impact': 'High' + }) + + if DATE_COLUMN in df.columns: + if report['date_coverage']['coverage_percentage'] < 50: + issues.append({ + 'severity': 'critical', + 'issue': f"Date coverage is only {report['date_coverage']['coverage_percentage']:.1f}%", + 'impact': 'High - analyses may fail' + }) + + if REVENUE_COLUMN in df.columns: + if report['revenue_summary'].get('invalid_rows', 0) > len(df) * 0.1: + issues.append({ + 'severity': 'critical', + 'issue': f"{report['revenue_summary']['invalid_rows']} rows have invalid revenue values", + 'impact': 'High' + }) + + # Warnings + if report['duplicates']['duplicate_percentage'] > 5: + issues.append({ + 'severity': 'warning', + 'issue': f"{report['duplicates']['duplicate_rows']} duplicate rows ({report['duplicates']['duplicate_percentage']:.1f}%)", + 'impact': 'Medium' + }) + + if 'revenue' in outliers: + if outliers['revenue']['percentage'] > 10: + issues.append({ + 'severity': 'warning', + 'issue': f"{outliers['revenue']['count']} revenue outliers ({outliers['revenue']['percentage']:.1f}%)", + 'impact': 'Medium' + }) + + report['issues'] = issues + + return report + +def print_data_quality_report(report): + """ + Print formatted data quality report + + Args: + report: Dictionary from generate_data_quality_report() + """ + print("\n" + "="*70) + print("DATA 
QUALITY REPORT") + print("="*70) + + # Overview + print("\nšŸ“Š OVERVIEW") + print("-" * 70) + print(f"Total Rows: {report['overview']['total_rows']:,}") + print(f"Total Columns: {report['overview']['total_columns']}") + print(f"Memory Usage: {report['overview']['memory_usage_mb']:.2f} MB") + + # Missing values + print("\nšŸ” MISSING VALUES") + print("-" * 70) + if report['missing_values']['columns_with_missing'] > 0: + print(f"Columns with missing values: {report['missing_values']['columns_with_missing']}") + print(f"Total missing values: {report['missing_values']['total_missing']:,}") + print("\nTop columns by missing values:") + missing_sorted = sorted( + report['missing_values']['percentages'].items(), + key=lambda x: x[1], + reverse=True + )[:10] + for col, pct in missing_sorted: + count = report['missing_values']['by_column'][col] + print(f" {col:30s}: {count:8,} ({pct:5.1f}%)") + else: + print("āœ… No missing values found") + + # Duplicates + print("\nšŸ”„ DUPLICATES") + print("-" * 70) + if report['duplicates']['duplicate_rows'] > 0: + print(f"Duplicate Rows: {report['duplicates']['duplicate_rows']:,} ({report['duplicates']['duplicate_percentage']:.2f}%)") + else: + print("āœ… No duplicate rows found") + + # Outliers + print("\nšŸ“ˆ OUTLIERS") + print("-" * 70) + if 'revenue' in report['outliers']: + rev_out = report['outliers']['revenue'] + print(f"Revenue Outliers: {rev_out['count']:,} ({rev_out['percentage']:.2f}%)") + if 'negative_values' in rev_out and rev_out['negative_values'] > 0: + print(f" Negative Revenue Values: {rev_out['negative_values']:,}") + + if 'quantity' in report['outliers']: + qty_out = report['outliers']['quantity'] + print(f"Quantity Outliers: {qty_out['count']:,} ({qty_out['percentage']:.2f}%)") + + if not report['outliers']: + print("āœ… No significant outliers detected") + + # Date coverage + if report['date_coverage']: + print("\nšŸ“… DATE COVERAGE") + print("-" * 70) + dc = report['date_coverage'] + print(f"Rows with Dates: 
{dc['rows_with_dates']:,} / {dc['total_rows']:,} ({dc['coverage_percentage']:.1f}%)") + if dc['min_date']: + print(f"Date Range: {dc['min_date']} to {dc['max_date']}") + + # Revenue summary + if report['revenue_summary']: + print("\nšŸ’° REVENUE SUMMARY") + print("-" * 70) + rs = report['revenue_summary'] + print(f"Total Revenue: ${rs['total_revenue'] / 1e6:.2f}m") + print(f"Valid Rows: {rs['valid_rows']:,} / {rs['valid_rows'] + rs['invalid_rows']:,}") + if rs['invalid_rows'] > 0: + print(f"Invalid Rows: {rs['invalid_rows']:,}") + print(f"Mean: ${rs['mean_revenue']:,.2f}") + print(f"Median: ${rs['median_revenue']:,.2f}") + print(f"Min: ${rs['min_revenue']:,.2f}") + print(f"Max: ${rs['max_revenue']:,.2f}") + + # Issues + if report['issues']: + print("\nāš ļø ISSUES DETECTED") + print("-" * 70) + critical = [i for i in report['issues'] if i['severity'] == 'critical'] + warnings = [i for i in report['issues'] if i['severity'] == 'warning'] + + if critical: + print("āŒ CRITICAL ISSUES:") + for issue in critical: + print(f" • {issue['issue']}") + print(f" Impact: {issue['impact']}") + + if warnings: + print("\nāš ļø WARNINGS:") + for issue in warnings: + print(f" • {issue['issue']}") + print(f" Impact: {issue['impact']}") + else: + print("\nāœ… NO ISSUES DETECTED") + + print("\n" + "="*70) + +def generate_data_quality_report_simple(df): + """ + Generate a simple data quality summary (quick check) + + Args: + df: DataFrame to analyze + + Returns: + str: Simple summary string + """ + summary_parts = [] + + summary_parts.append(f"Rows: {len(df):,}") + summary_parts.append(f"Columns: {len(df.columns)}") + + if REVENUE_COLUMN in df.columns: + revenue = pd.to_numeric(df[REVENUE_COLUMN], errors='coerce') + valid = revenue.notna().sum() + summary_parts.append(f"Valid Revenue: {valid:,} ({valid/len(df)*100:.1f}%)") + + if DATE_COLUMN in df.columns: + date_coverage = df[DATE_COLUMN].notna().sum() + summary_parts.append(f"Date Coverage: {date_coverage:,} 
({date_coverage/len(df)*100:.1f}%)") + + return " | ".join(summary_parts) + +# ============================================================================ +# STANDALONE DATA QUALITY CHECK +# ============================================================================ + +if __name__ == "__main__": + """Run data quality check""" + from data_loader import load_sales_data + from config import get_data_path + + print("Loading data for quality check...") + try: + df = load_sales_data(get_data_path()) + report = generate_data_quality_report(df) + print_data_quality_report(report) + except Exception as e: + print(f"ERROR: {e}") diff --git a/examples/annual_revenue_trend.py b/examples/annual_revenue_trend.py new file mode 100644 index 0000000..2b355b9 --- /dev/null +++ b/examples/annual_revenue_trend.py @@ -0,0 +1,134 @@ +""" +Example: Annual Revenue Trend Analysis +Simple example showing annual revenue with LTM support + +This is a working example that demonstrates: +- Loading data using data_loader +- Calculating annual metrics with LTM +- Creating a revenue trend chart +- Following template best practices +""" +import pandas as pd +import matplotlib.pyplot as plt +from pathlib import Path + +# Import utilities +from data_loader import load_sales_data, validate_data_structure +from validate_revenue import validate_revenue +from analysis_utils import ( + get_ltm_period_config, calculate_annual_metrics, + setup_revenue_chart, save_chart, + format_currency, print_annual_summary, sort_mixed_years, + apply_exclusion_filters +) +from config import ( + OUTPUT_DIR, ANALYSIS_YEARS, MAX_DATE, + CHART_SIZES, ensure_directories, get_data_path, COMPANY_NAME, + REVENUE_COLUMN, MIN_YEAR, DATE_COLUMN +) + +# ============================================================================ +# CONFIGURATION +# ============================================================================ + +ANALYSIS_NAME = "Annual Revenue Trend" +DESCRIPTION = "Simple annual revenue trend analysis with LTM 
def main():
    """Run the annual revenue trend analysis end to end.

    Pipeline: load -> validate -> filter -> annual metrics (with optional
    LTM period) -> chart -> revenue validation.
    """

    banner = '=' * 60
    print(f"\n{banner}")
    print(f"{ANALYSIS_NAME}")
    print(f"{banner}\n")

    # Load the transaction data; abort on any failure.
    print("Loading data...")
    try:
        df = load_sales_data(get_data_path())
        print(f"Loaded {len(df):,} transactions")
    except Exception as e:
        print(f"ERROR loading data: {e}")
        return

    # Structural validation (required columns etc.).
    is_valid, msg = validate_data_structure(df)
    if not is_valid:
        print(f"ERROR: {msg}")
        return
    print("Data validation passed")

    # Configured exclusions, then the analysis date window.
    df = apply_exclusion_filters(df)
    df = df[df['Year'] >= MIN_YEAR]
    if DATE_COLUMN in df.columns:
        df = df[df[DATE_COLUMN] <= MAX_DATE]

    # Optional last-twelve-months window.
    ltm_start, ltm_end = get_ltm_period_config()
    if ltm_start and ltm_end:
        print(f"LTM period: {ltm_start} to {ltm_end}")

    print("\nCalculating annual metrics...")

    def yearly_revenue(year_data):
        # Per-year metric: total revenue only.
        return {'Revenue': year_data[REVENUE_COLUMN].sum()}

    annual_df = calculate_annual_metrics(df, yearly_revenue, ltm_start, ltm_end)

    print_annual_summary(annual_df, 'Revenue', 'Revenue')

    print("Generating chart...")
    ensure_directories()

    fig, ax = plt.subplots(figsize=CHART_SIZES['medium'])

    # Year labels may mix ints and strings (e.g. an "LTM" bucket), so sort
    # with the shared helper and plot against integer positions.
    ordered = sort_mixed_years(annual_df.reset_index(), 'Year')
    year_labels = ordered['Year'].tolist()
    revenue_musd = ordered['Revenue'].values / 1e6  # Convert to millions
    positions = range(len(year_labels))

    ax.plot(positions, revenue_musd, marker='o', linewidth=2, markersize=8, color='#2E86AB')
    ax.set_xticks(positions)
    ax.set_xticklabels(year_labels, rotation=45, ha='right')
    setup_revenue_chart(ax)

    # Note the LTM window in the title when one is configured.
    title = f'Annual Revenue Trend - {COMPANY_NAME}'
    if ltm_start and ltm_end:
        from config import get_ltm_label
        ltm_label = get_ltm_label()
        if ltm_label:
            title += f'\n({ltm_label})'
    ax.set_title(title, fontsize=14, fontweight='bold')

    plt.tight_layout()
    save_chart(fig, 'annual_revenue_trend.png')
    plt.close()

    # Cross-check the plotted totals against the raw data.
    print("\nValidating revenue...")
    validate_revenue(df, ANALYSIS_NAME)

    print(f"\n{ANALYSIS_NAME} complete!")
    print(f"Chart saved to: {OUTPUT_DIR}")

# ============================================================================
# RUN ANALYSIS
# ============================================================================

if __name__ == "__main__":
    main()
def calculate_cohort_metrics(df_with_cohort, customer_col=None, revenue_col=None):
    """
    Calculate cohort retention metrics

    Args:
        df_with_cohort: DataFrame with Cohort and CohortPeriod columns
            (as produced by create_cohorts)
        customer_col: Customer identifier column; defaults to
            config.CUSTOMER_COLUMN (backward compatible)
        revenue_col: Revenue column; defaults to config.REVENUE_COLUMN

    Returns:
        DataFrame: Cohort metrics by period, one row per (Cohort, Period)
        with Customers, Revenue, Retention_Rate and Revenue_Retention
        (both in percent, relative to Period 0)
    """
    # Resolve column names from config only when not supplied, so the
    # function can also be called (and tested) without the config module.
    if customer_col is None or revenue_col is None:
        from config import REVENUE_COLUMN, CUSTOMER_COLUMN
        customer_col = customer_col or CUSTOMER_COLUMN
        revenue_col = revenue_col or REVENUE_COLUMN

    # Unique customers and total revenue per cohort/period.
    cohort_revenue = df_with_cohort.groupby(['Cohort', 'CohortPeriod']).agg({
        customer_col: 'nunique',
        revenue_col: 'sum'
    }).reset_index()
    cohort_revenue.columns = ['Cohort', 'Period', 'Customers', 'Revenue']

    # Calculate retention rates relative to each cohort's Period-0 baseline.
    cohort_frames = []
    for _, cohort_data in cohort_revenue.groupby('Cohort', sort=False):
        cohort_data = cohort_data.copy()
        baseline = cohort_data[cohort_data['Period'] == 0]
        if baseline.empty:
            # Upstream filtering can drop a cohort's first month; without a
            # Period-0 baseline retention is undefined, so skip the cohort
            # instead of crashing on values[0] (old behavior: IndexError).
            continue
        initial_customers = baseline['Customers'].iloc[0]
        initial_revenue = baseline['Revenue'].iloc[0]
        cohort_data['Retention_Rate'] = (cohort_data['Customers'] / initial_customers) * 100
        cohort_data['Revenue_Retention'] = cohort_data['Revenue'] / initial_revenue * 100
        cohort_frames.append(cohort_data)

    # pd.concat on an empty list raises; return an empty, well-typed frame.
    if not cohort_frames:
        return pd.DataFrame(columns=['Cohort', 'Period', 'Customers', 'Revenue',
                                     'Retention_Rate', 'Revenue_Retention'])

    return pd.concat(cohort_frames, ignore_index=True)
Print summary + print("\nCohort Summary:") + print("-" * 60) + for cohort in sorted(cohort_metrics['Cohort'].unique())[:5]: # Show top 5 cohorts + cohort_data = cohort_metrics[cohort_metrics['Cohort'] == cohort] + period_0 = cohort_data[cohort_data['Period'] == 0] + if len(period_0) > 0: + initial_customers = period_0['Customers'].values[0] + initial_revenue = period_0['Revenue'].values[0] + print(f"\n{cohort}:") + print(f" Initial: {initial_customers:,} customers, {format_currency(initial_revenue)}") + + # Show retention at period 12 + period_12 = cohort_data[cohort_data['Period'] == 12] + if len(period_12) > 0: + retention = period_12['Retention_Rate'].values[0] + revenue_ret = period_12['Revenue_Retention'].values[0] + print(f" Period 12: {retention:.1f}% customer retention, {revenue_ret:.1f}% revenue retention") + + # 7. Create visualizations + print("\nGenerating charts...") + ensure_directories() + + # Heatmap: Customer retention + pivot_retention = cohort_metrics.pivot_table( + index='Cohort', + columns='Period', + values='Retention_Rate', + aggfunc='mean' + ) + + fig, (ax1, ax2) = plt.subplots(1, 2, figsize=CHART_SIZES['wide']) + + # Retention heatmap + sns.heatmap(pivot_retention, annot=True, fmt='.0f', cmap='YlOrRd', ax=ax1, cbar_kws={'label': 'Retention %'}) + ax1.set_title('Customer Retention by Cohort\n(Period 0 = 100%)', fontsize=12, fontweight='bold') + ax1.set_xlabel('Months Since First Purchase') + ax1.set_ylabel('Cohort') + + # Revenue retention heatmap + pivot_revenue = cohort_metrics.pivot_table( + index='Cohort', + columns='Period', + values='Revenue_Retention', + aggfunc='mean' + ) + + sns.heatmap(pivot_revenue, annot=True, fmt='.0f', cmap='YlGnBu', ax=ax2, cbar_kws={'label': 'Revenue Retention %'}) + ax2.set_title('Revenue Retention by Cohort\n(Period 0 = 100%)', fontsize=12, fontweight='bold') + ax2.set_xlabel('Months Since First Purchase') + ax2.set_ylabel('Cohort') + + plt.suptitle(f'Cohort Analysis - {COMPANY_NAME}', fontsize=14, 
def _quantile_score(series, labels):
    """Score a metric into quintiles on a 1-5 scale.

    rank(method='first') makes every value unique, so pd.qcut always finds
    five distinct bin edges and the explicit labels remain valid (qcut
    raises if labels are supplied while duplicate edges get dropped).
    """
    return pd.qcut(
        series.rank(method='first'), q=5, labels=labels, duplicates='drop'
    ).astype(int)


def calculate_rfm_scores(df, analysis_date=None, date_col=None,
                         customer_col=None, revenue_col=None):
    """
    Calculate RFM scores for each customer

    Args:
        df: DataFrame with customer, date, and revenue columns
        analysis_date: Reference date for recency calculation (defaults to max date)
        date_col / customer_col / revenue_col: Column names; default to the
            module-level DATE_COLUMN / CUSTOMER_COLUMN / REVENUE_COLUMN
            (backward compatible with the original signature)

    Returns:
        DataFrame with RFM scores and segment assignment
    """
    # Fall back to the configured column names only when not supplied.
    date_col = date_col or DATE_COLUMN
    customer_col = customer_col or CUSTOMER_COLUMN
    revenue_col = revenue_col or REVENUE_COLUMN

    if analysis_date is None:
        analysis_date = df[date_col].max()

    # Customer-level metrics: last purchase, purchase count, total revenue.
    customer_metrics = df.groupby(customer_col).agg({
        date_col: ['max', 'count'],
        revenue_col: 'sum'
    }).reset_index()
    customer_metrics.columns = [customer_col, 'LastPurchaseDate', 'Frequency', 'Monetary']

    # Recency = days since last purchase relative to the analysis date.
    customer_metrics['Recency'] = (analysis_date - customer_metrics['LastPurchaseDate']).dt.days

    # Score each dimension (1-5, 5 = best). Recency is inverted: a LOW
    # recency (recent purchase) earns the HIGH score.
    customer_metrics['R_Score'] = _quantile_score(customer_metrics['Recency'], [5, 4, 3, 2, 1])
    customer_metrics['F_Score'] = _quantile_score(customer_metrics['Frequency'], [1, 2, 3, 4, 5])
    customer_metrics['M_Score'] = _quantile_score(customer_metrics['Monetary'], [1, 2, 3, 4, 5])

    # Combined RFM score (3..15).
    customer_metrics['RFM_Score'] = (
        customer_metrics['R_Score'] +
        customer_metrics['F_Score'] +
        customer_metrics['M_Score']
    )

    def assign_segment(row):
        # Segment rules evaluated in priority order; first match wins.
        r, f, m = row['R_Score'], row['F_Score'], row['M_Score']
        if r >= 4 and f >= 4 and m >= 4:
            return 'Champions'
        elif r >= 3 and f >= 3 and m >= 4:
            return 'Loyal Customers'
        elif r >= 4 and f <= 2:
            return 'At Risk'
        elif r <= 2:
            return 'Hibernating'
        elif r >= 3 and f >= 3 and m <= 2:
            return 'Potential Loyalists'
        else:
            return 'Need Attention'

    customer_metrics['Segment'] = customer_metrics.apply(assign_segment, axis=1)

    return customer_metrics
Create visualizations + print("\nGenerating charts...") + ensure_directories() + + # Chart 1: Revenue by Segment + fig, (ax1, ax2) = plt.subplots(1, 2, figsize=CHART_SIZES['wide']) + + segment_summary_sorted = segment_summary.sort_values('Total Revenue', ascending=True) + revenue_millions = segment_summary_sorted['Total Revenue'].values / 1e6 + + ax1.barh(range(len(segment_summary_sorted)), revenue_millions, color='#2E86AB') + ax1.set_yticks(range(len(segment_summary_sorted))) + ax1.set_yticklabels(segment_summary_sorted['Segment'].values) + ax1.set_xlabel('Revenue (Millions USD)') + ax1.set_title('Revenue by Customer Segment', fontsize=12, fontweight='bold') + setup_revenue_chart(ax1) + ax1.set_ylabel('') + + # Chart 2: Customer Count by Segment + customer_counts = segment_summary_sorted['Customer Count'].values + ax2.barh(range(len(segment_summary_sorted)), customer_counts, color='#A23B72') + ax2.set_yticks(range(len(segment_summary_sorted))) + ax2.set_yticklabels(segment_summary_sorted['Segment'].values) + ax2.set_xlabel('Number of Customers') + ax2.set_title('Customer Count by Segment', fontsize=12, fontweight='bold') + ax2.set_ylabel('') + ax2.grid(True, alpha=0.3) + + plt.suptitle(f'Customer Segmentation Analysis - {COMPANY_NAME}', + fontsize=14, fontweight='bold', y=1.02) + plt.tight_layout() + save_chart(fig, 'customer_segmentation.png') + plt.close() + + # 8. 
def main():
    """Main analysis function

    Loads sales data, computes product-level performance for the most
    recent period (LTM if configured, otherwise the latest year), prints a
    top-10 summary, and saves a product performance chart.
    """

    print(f"\n{'='*60}")
    print(f"{ANALYSIS_NAME}")
    print(f"{'='*60}\n")

    # 1. Load data
    print("Loading data...")
    try:
        df = load_sales_data(get_data_path())
        print(f"Loaded {len(df):,} transactions")
    except Exception as e:
        print(f"ERROR loading data: {e}")
        return

    # 2. Validate data structure
    is_valid, msg = validate_data_structure(df)
    if not is_valid:
        print(f"ERROR: {msg}")
        return

    if ITEM_COLUMN not in df.columns:
        print(f"WARNING: Item column '{ITEM_COLUMN}' not found. Using transaction-level analysis.")
        # Create a dummy item column for demonstration
        df[ITEM_COLUMN] = 'All Products'

    print("Data validation passed")

    # 3. Apply exclusion filters
    df = apply_exclusion_filters(df)

    # 4. Filter by date range
    df = df[df['Year'] >= MIN_YEAR]
    if DATE_COLUMN in df.columns:
        df = df[df[DATE_COLUMN] <= MAX_DATE]

    # 5. Setup LTM period (if enabled)
    ltm_start, ltm_end = get_ltm_period_config()

    # 6. Product performance summary
    print("\nCalculating product performance...")

    # Use the LTM window when configured, otherwise the latest full year.
    if ltm_start and ltm_end and 'YearMonth' in df.columns:
        recent_data = df[(df['YearMonth'] >= ltm_start) & (df['YearMonth'] <= ltm_end)]
        period_label = f"LTM {ltm_end}"
    else:
        recent_year = df['Year'].max()
        recent_data = df[df['Year'] == recent_year]
        period_label = str(recent_year)

    # Product-level metrics.
    # BUGFIX: the previous code always put QUANTITY_COLUMN in the agg dict
    # (only the aggregation *function* was conditional), so a missing
    # quantity column raised KeyError. Build the agg spec conditionally and
    # name the result columns to match.
    has_quantity = QUANTITY_COLUMN in recent_data.columns
    agg_spec = {REVENUE_COLUMN: ['sum', 'count']}
    if has_quantity:
        agg_spec[QUANTITY_COLUMN] = 'sum'

    product_metrics = recent_data.groupby(ITEM_COLUMN).agg(agg_spec).reset_index()

    if has_quantity:
        product_metrics.columns = [ITEM_COLUMN, 'Revenue', 'Transaction_Count', 'Quantity']
    else:
        product_metrics.columns = [ITEM_COLUMN, 'Revenue', 'Transaction_Count']
        # Without a quantity column, treat each transaction as one unit.
        product_metrics['Quantity'] = product_metrics['Transaction_Count']

    # Calculate average price per unit if quantity available
    if has_quantity:
        # Guard against divide-by-zero quantities.
        product_metrics['Avg_Price'] = product_metrics['Revenue'] / product_metrics['Quantity'].replace(0, np.nan)
    else:
        product_metrics['Avg_Price'] = product_metrics['Revenue'] / product_metrics['Transaction_Count']

    # Sort by revenue
    product_metrics = product_metrics.sort_values('Revenue', ascending=False)

    # Top products summary
    print(f"\nTop 10 Products by Revenue ({period_label}):")
    print("-" * 80)
    top_10 = product_metrics.head(10)
    total_revenue = product_metrics['Revenue'].sum()

    for _, row in top_10.iterrows():
        pct = (row['Revenue'] / total_revenue) * 100
        print(f"{row[ITEM_COLUMN]:30s}: {format_currency(row['Revenue']):>12s} ({pct:5.1f}%)")

    # 7. Annual product trends (if multiple years available)
    if len(df['Year'].unique()) > 1:
        print("\nCalculating annual product trends...")

        def calculate_product_metrics(year_data):
            """Calculate product metrics for a year (top-5 product revenue)."""
            product_revenue = year_data.groupby(ITEM_COLUMN)[REVENUE_COLUMN].sum()
            top_5 = product_revenue.nlargest(5)
            return dict(top_5)

        # NOTE(review): this result is computed but not charted below —
        # kept for parity with the original behavior; consider plotting it.
        annual_product_df = calculate_annual_metrics(df, calculate_product_metrics, ltm_start, ltm_end)

        # 8. Create visualizations
        print("\nGenerating charts...")
        ensure_directories()

        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=CHART_SIZES['wide'])

        top_10_revenue = top_10['Revenue'].values / 1e6
        top_10_names = top_10[ITEM_COLUMN].values

        # Chart 1: Top Products Revenue (horizontal bars, long names truncated)
        ax1.barh(range(len(top_10)), top_10_revenue, color='#2E86AB')
        ax1.set_yticks(range(len(top_10)))
        ax1.set_yticklabels([name[:30] + '...' if len(name) > 30 else name for name in top_10_names])
        ax1.set_xlabel('Revenue (Millions USD)')
        ax1.set_title(f'Top 10 Products by Revenue\n({period_label})', fontsize=12, fontweight='bold')
        setup_revenue_chart(ax1)
        ax1.set_ylabel('')

        # Chart 2: Revenue Distribution (pie of top 10 + "Other" bucket)
        if len(product_metrics) > 10:
            other_revenue = product_metrics.iloc[10:]['Revenue'].sum()
            pie_data = list(top_10['Revenue'].values) + [other_revenue]
            pie_labels = list(top_10[ITEM_COLUMN].values) + ['Other']
        else:
            pie_data = product_metrics['Revenue'].values
            pie_labels = product_metrics[ITEM_COLUMN].values

        pie_data_millions = [x / 1e6 for x in pie_data]
        ax2.pie(pie_data_millions, labels=pie_labels, autopct='%1.1f%%', startangle=90)
        ax2.set_title('Revenue Distribution\n(Top Products)', fontsize=12, fontweight='bold')

        plt.suptitle(f'Product Performance Analysis - {COMPANY_NAME}',
                     fontsize=14, fontweight='bold', y=1.02)
        plt.tight_layout()
        save_chart(fig, 'product_performance.png')
        plt.close()
    else:
        # Single chart if only one year
        print("\nGenerating chart...")
        ensure_directories()

        fig, ax = plt.subplots(figsize=CHART_SIZES['medium'])

        top_10_revenue = top_10['Revenue'].values / 1e6
        top_10_names = top_10[ITEM_COLUMN].values

        ax.barh(range(len(top_10)), top_10_revenue, color='#2E86AB')
        ax.set_yticks(range(len(top_10)))
        ax.set_yticklabels([name[:40] + '...' if len(name) > 40 else name for name in top_10_names])
        ax.set_xlabel('Revenue (Millions USD)')
        ax.set_title(f'Top 10 Products by Revenue - {COMPANY_NAME}\n({period_label})',
                     fontsize=14, fontweight='bold')
        setup_revenue_chart(ax)
        ax.set_ylabel('')

        plt.tight_layout()
        save_chart(fig, 'product_performance.png')
        plt.close()

    # 9. Validate revenue
    print("\nValidating revenue...")
    validate_revenue(df, ANALYSIS_NAME)

    print(f"\n{ANALYSIS_NAME} complete!")
    print(f"Charts saved to: {OUTPUT_DIR}")

# ============================================================================
# RUN ANALYSIS
# ============================================================================

if __name__ == "__main__":
    main()
def export_summary_table(data_dict, filename, output_dir=None, title=None):
    """
    Export summary statistics to formatted table (Excel)

    Args:
        data_dict: Dictionary of {metric_name: value} pairs
        filename: Output filename (e.g., 'summary.xlsx')
        output_dir: Output directory (defaults to config.REPORTS_DIR)
        title: Optional title for the summary table

    Returns:
        Path to exported file

    Raises:
        ImportError: If openpyxl is not installed

    Example:
        export_summary_table({
            'Total Revenue': 1000000,
            'Customer Count': 500,
            'Average Order Value': 2000
        }, 'summary.xlsx')
    """
    # Excel support is optional; fail with an actionable message.
    try:
        import openpyxl
    except ImportError:
        raise ImportError(
            "openpyxl is required for Excel export. Install with: pip install openpyxl"
        )

    target_dir = Path(output_dir) if output_dir is not None else REPORTS_DIR
    ensure_directories()
    target_dir.mkdir(exist_ok=True)
    filepath = target_dir / filename

    def _fmt(val):
        # Pretty-print numeric values as $x.xx / $x.xxk / $x.xxm.
        if not isinstance(val, (int, float)):
            return str(val)
        magnitude = abs(val)
        if magnitude >= 1e6:
            return f"${val / 1e6:.2f}m"
        if magnitude >= 1e3:
            return f"${val / 1e3:.2f}k"
        return f"${val:.2f}"

    summary_df = pd.DataFrame({
        'Metric': list(data_dict.keys()),
        'Value': list(data_dict.values())
    })
    summary_df['Formatted_Value'] = summary_df['Value'].apply(_fmt)

    with pd.ExcelWriter(filepath, engine='openpyxl') as writer:
        summary_df.to_excel(writer, sheet_name='Summary', index=False)

        worksheet = writer.sheets['Summary']

        # Fixed, readable column widths.
        for col_letter, width in (('A', 30), ('B', 20), ('C', 20)):
            worksheet.column_dimensions[col_letter].width = width

        # Optional bold, centered title row above the table.
        if title:
            worksheet.insert_rows(1)
            worksheet.merge_cells('A1:C1')
            worksheet['A1'] = title
            worksheet['A1'].font = openpyxl.styles.Font(bold=True, size=14)
            worksheet['A1'].alignment = openpyxl.styles.Alignment(horizontal='center')

    print(f"Exported summary table to Excel: {filepath}")

    return filepath
def generate_sample_sales_data(
    num_customers=100,
    num_products=50,
    years=None,
    transactions_per_month=500,
    output_file='sample_sales_data.csv'
):
    """
    Generate realistic sample sales data

    Args:
        num_customers: Number of unique customers
        num_products: Number of unique products
        years: List of years to generate data for (defaults to 2021-2025)
        transactions_per_month: Average transactions per month
        output_file: Output CSV filename

    Returns:
        DataFrame: Generated sales data
    """
    # BUGFIX: the previous signature used a mutable list ([2021, ...]) as
    # the default for `years`, which is shared across calls; use a None
    # sentinel and materialize the default per call instead.
    if years is None:
        years = [2021, 2022, 2023, 2024, 2025]

    print("Generating sample sales data...")
    print(f" Customers: {num_customers}")
    print(f" Products: {num_products}")
    print(f" Years: {years}")

    # Synthetic customer and product identifiers.
    customer_names = [f"Customer_{i:04d}" for i in range(1, num_customers + 1)]
    product_names = [f"Product_{i:04d}" for i in range(1, num_products + 1)]

    # Capture "now" once so the future-month cutoff is consistent for the
    # whole run (previously re-evaluated every month inside the loop).
    current_date = datetime.now()

    transactions = []

    for year in years:
        for month in range(1, 13):
            # Skip future months
            if year > current_date.year or (year == current_date.year and month > current_date.month):
                continue

            # Transactions this month: normally distributed around the
            # mean, floored at 10.
            num_transactions = int(np.random.normal(transactions_per_month, transactions_per_month * 0.2))
            num_transactions = max(10, num_transactions)  # At least 10 transactions

            for _ in range(num_transactions):
                # Random day within the month (February treated as 28 days).
                if month == 2:
                    max_day = 28
                elif month in [4, 6, 9, 11]:
                    max_day = 30
                else:
                    max_day = 31

                day = random.randint(1, max_day)
                invoice_date = datetime(year, month, day)

                # Random customer and product
                customer = random.choice(customer_names)
                product = random.choice(product_names)

                # Quantity: log-normal so most orders are small; capped at 100.
                quantity = int(np.random.lognormal(mean=1.5, sigma=1.0))
                quantity = max(1, min(quantity, 100))

                # Revenue correlates with quantity, with +/-20% noise.
                base_price = np.random.lognormal(mean=5, sigma=1.5)
                revenue = base_price * quantity
                revenue *= np.random.uniform(0.8, 1.2)
                revenue = round(revenue, 2)

                transactions.append({
                    'InvoiceDate': invoice_date,
                    'Customer': customer,
                    'Item': product,
                    'Quantity': quantity,
                    'USD': revenue,
                    'Year': year,
                    'Month': month
                })

    # Create DataFrame
    df = pd.DataFrame(transactions)

    # Sort by date
    df = df.sort_values('InvoiceDate').reset_index(drop=True)

    # Blank out ~5% of dates to mimic a realistic data-quality issue.
    missing_date_pct = 0.05  # 5% missing dates
    num_missing = int(len(df) * missing_date_pct)
    missing_indices = np.random.choice(df.index, size=num_missing, replace=False)
    df.loc[missing_indices, 'InvoiceDate'] = pd.NaT

    # Save to CSV
    output_path = Path(output_file)
    df.to_csv(output_path, index=False)
    print(f"\nāœ… Sample data generated: {output_path}")
    print(f" Rows: {len(df):,}")
    print(f" Date range: {df['InvoiceDate'].min()} to {df['InvoiceDate'].max()}")
    print(f" Total revenue: ${df['USD'].sum() / 1e6:.2f}m")

    return df
"""
Logging configuration for analysis scripts
Provides structured logging with file and console output

Usage:
    from logger_config import get_logger

    logger = get_logger('my_analysis')
    logger.info("Analysis started")
    logger.warning("Low data quality detected")
    logger.error("Failed to load data")
"""
import logging
import sys
from pathlib import Path
from datetime import datetime
from config import COMPANY_NAME, OUTPUT_DIR

# Lazily-created module-level singleton returned by get_logger().
_logger = None


def setup_logging(log_level=logging.INFO, log_file=None, analysis_name=None):
    """
    Set up logging with a detailed file handler and a compact console handler.

    Args:
        log_level: Logging level (DEBUG, INFO, WARNING, ERROR)
        log_file: Path to log file (defaults to logs/analysis_YYYYMMDD_HHMMSS.log)
        analysis_name: Name of analysis for logger and log-file naming

    Returns:
        logging.Logger: Configured logger instance
    """
    global _logger

    # Logs always live under ./logs relative to the working directory.
    logs_dir = Path('logs')
    logs_dir.mkdir(exist_ok=True)

    # Derive a timestamped default log file name when none is supplied.
    if log_file is None:
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        if analysis_name:
            safe_name = analysis_name.lower().replace(' ', '_').replace('/', '_')
            log_file = logs_dir / f"{safe_name}_{timestamp}.log"
        else:
            log_file = logs_dir / f"analysis_{timestamp}.log"
    else:
        log_file = Path(log_file)
        log_file.parent.mkdir(parents=True, exist_ok=True)

    logger = logging.getLogger(analysis_name or 'analysis')
    logger.setLevel(log_level)

    # Clear existing handlers so repeated setup calls don't duplicate output.
    logger.handlers = []

    detailed_formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S'
    )
    console_formatter = logging.Formatter('%(levelname)s - %(message)s')

    # File handler: full detail.
    file_handler = logging.FileHandler(log_file, encoding='utf-8')
    file_handler.setLevel(log_level)
    file_handler.setFormatter(detailed_formatter)
    logger.addHandler(file_handler)

    # Console handler: compact.
    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setLevel(log_level)
    console_handler.setFormatter(console_formatter)
    logger.addHandler(console_handler)

    # Startup banner.
    logger.info("=" * 60)
    logger.info(f"Analysis: {analysis_name or 'Unknown'}")
    logger.info(f"Company: {COMPANY_NAME}")
    logger.info(f"Log File: {log_file}")
    logger.info("=" * 60)

    _logger = logger
    return logger


def get_logger(analysis_name=None, log_level=logging.INFO):
    """
    Get the shared logger, creating it on first use.

    Args:
        analysis_name: Name of analysis (only honored on first creation)
        log_level: Logging level (default: INFO)

    Returns:
        logging.Logger: Logger instance
    """
    global _logger
    if _logger is None:
        _logger = setup_logging(log_level=log_level, analysis_name=analysis_name)
    return _logger


def log_analysis_start(analysis_name, logger=None):
    """
    Log analysis start.

    Args:
        analysis_name: Name of analysis
        logger: Logger instance (creates one if None)
    """
    if logger is None:
        logger = get_logger(analysis_name)
    logger.info(f"Starting analysis: {analysis_name}")
    logger.info(f"Timestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")


def log_analysis_end(analysis_name, success=True, logger=None):
    """
    Log analysis completion.

    Args:
        analysis_name: Name of analysis
        success: Whether analysis completed successfully
        logger: Logger instance (creates one if None)
    """
    if logger is None:
        logger = get_logger(analysis_name)
    if success:
        logger.info(f"Analysis completed successfully: {analysis_name}")
    else:
        logger.error(f"Analysis failed: {analysis_name}")
    logger.info(f"Timestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    logger.info("=" * 60)


def log_data_loading(df, logger=None):
    """
    Log a short data-loading summary (row/column counts, revenue, date coverage).

    Args:
        df: Loaded DataFrame
        logger: Logger instance (creates one if None)
    """
    if logger is None:
        logger = get_logger()

    logger.info(f"Data loaded: {len(df):,} rows, {len(df.columns)} columns")

    # Local import keeps config a soft dependency for these summaries.
    from config import REVENUE_COLUMN, DATE_COLUMN
    if REVENUE_COLUMN in df.columns:
        revenue = df[REVENUE_COLUMN].sum()
        logger.info(f"Total revenue: ${revenue / 1e6:.2f}m")
    if DATE_COLUMN in df.columns:
        date_coverage = df[DATE_COLUMN].notna().sum() / len(df) * 100
        logger.info(f"Date coverage: {date_coverage:.1f}%")


def log_error(error, logger=None, context=None):
    """
    Log an error with optional context and full traceback.

    Args:
        error: Exception or error message
        logger: Logger instance (creates one if None)
        context: Additional context string prefixed to the message
    """
    if logger is None:
        logger = get_logger()
    error_msg = str(error)
    if context:
        error_msg = f"{context}: {error_msg}"
    logger.error(error_msg, exc_info=True)


# ============================================================================
# EXAMPLE USAGE
# ============================================================================

if __name__ == "__main__":
    """Example usage"""
    logger = setup_logging(log_level=logging.DEBUG, analysis_name="Example Analysis")

    logger.debug("This is a debug message")
    logger.info("This is an info message")
    logger.warning("This is a warning message")
    logger.error("This is an error message")

    log_analysis_start("Example Analysis", logger)
    # BUG FIX: the original called
    #   log_analysis_end("Example Analysis", success=True, logger)
    # -- a positional argument after a keyword argument, which is a
    # SyntaxError and prevented this module from importing at all.
    log_analysis_end("Example Analysis", success=True, logger=logger)
"""
Report generation utility
Combines multiple charts and data into a PDF report

Usage:
    from report_generator import generate_pdf_report

    # Generate PDF report
    generate_pdf_report(
        charts=['chart1.png', 'chart2.png'],
        title='Sales Analysis Report',
        summary_data={'Total Revenue': 1000000}
    )
"""
from pathlib import Path
from datetime import datetime
from config import COMPANY_NAME, OUTPUT_DIR, REPORTS_DIR, ensure_directories


def _format_summary_value(value):
    """Format a summary metric: $x.xxm / $x.xxk / $x.xx for numbers, str otherwise."""
    if isinstance(value, (int, float)):
        if abs(value) >= 1e6:
            return f"${value / 1e6:.2f}m"
        if abs(value) >= 1e3:
            return f"${value / 1e3:.2f}k"
        return f"${value:.2f}"
    return str(value)


def generate_pdf_report(
    charts,
    title=None,
    summary_data=None,
    output_filename=None,
    output_dir=None
):
    """
    Generate a PDF report from chart images and summary metrics.

    Args:
        charts: List of chart file paths (PNG files)
        title: Report title (defaults to "<company> Sales Analysis Report")
        summary_data: Dictionary of summary metrics (name -> value)
        output_filename: Output PDF filename (defaults to report_YYYYMMDD_HHMMSS.pdf)
        output_dir: Output directory (defaults to config.REPORTS_DIR)

    Returns:
        Path: Path to generated PDF file

    Raises:
        ImportError: If reportlab is not installed
    """
    # reportlab is an optional dependency -- import lazily so the rest of
    # the template works without it. (The original also imported A4 and
    # TA_LEFT, neither of which was used.)
    try:
        from reportlab.lib.pagesizes import letter
        from reportlab.lib.units import inch
        from reportlab.lib import colors
        from reportlab.platypus import (
            SimpleDocTemplate, Paragraph, Spacer, Image, Table, TableStyle,
            PageBreak,
        )
        from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
        from reportlab.lib.enums import TA_CENTER
    except ImportError:
        raise ImportError(
            "reportlab is required for PDF generation. Install with: pip install reportlab"
        )

    if output_dir is None:
        output_dir = REPORTS_DIR
    else:
        output_dir = Path(output_dir)

    ensure_directories()
    # parents=True so a nested reports directory is created as needed
    # (the original's bare mkdir failed if the parent was missing).
    output_dir.mkdir(parents=True, exist_ok=True)

    if output_filename is None:
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        output_filename = f"report_{timestamp}.pdf"

    output_path = output_dir / output_filename

    doc = SimpleDocTemplate(
        str(output_path),
        pagesize=letter,
        rightMargin=0.75*inch,
        leftMargin=0.75*inch,
        topMargin=0.75*inch,
        bottomMargin=0.75*inch
    )

    # `story` collects flowables in render order.
    story = []

    styles = getSampleStyleSheet()
    title_style = ParagraphStyle(
        'CustomTitle',
        parent=styles['Heading1'],
        fontSize=20,
        textColor=colors.HexColor('#2E86AB'),
        spaceAfter=30,
        alignment=TA_CENTER
    )
    heading_style = ParagraphStyle(
        'CustomHeading',
        parent=styles['Heading2'],
        fontSize=14,
        textColor=colors.HexColor('#2E86AB'),
        spaceAfter=12
    )

    if title is None:
        title = f"{COMPANY_NAME} Sales Analysis Report"

    story.append(Paragraph(title, title_style))
    story.append(Spacer(1, 0.2*inch))

    metadata_text = f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
    story.append(Paragraph(metadata_text, styles['Normal']))
    story.append(Spacer(1, 0.3*inch))

    # Summary metrics rendered as a two-column table.
    if summary_data:
        story.append(Paragraph("Summary", heading_style))

        table_data = [['Metric', 'Value']]
        for key, value in summary_data.items():
            table_data.append([key, _format_summary_value(value)])

        table = Table(table_data, colWidths=[3*inch, 2*inch])
        table.setStyle(TableStyle([
            ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#2E86AB')),
            ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
            ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
            ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
            ('FONTSIZE', (0, 0), (-1, 0), 12),
            ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
            ('BACKGROUND', (0, 1), (-1, -1), colors.beige),
            ('GRID', (0, 0), (-1, -1), 1, colors.black),
            ('ROWBACKGROUNDS', (0, 1), (-1, -1), [colors.white, colors.lightgrey])
        ]))

        story.append(table)
        story.append(Spacer(1, 0.3*inch))

    # One chart per page (page break between charts, none after the last).
    if charts:
        story.append(Paragraph("Charts", heading_style))

        for i, chart_path in enumerate(charts, 1):
            chart_path = Path(chart_path)

            if not chart_path.exists():
                print(f"Warning: Chart not found: {chart_path}")
                continue

            chart_title = f"Chart {i}: {chart_path.stem.replace('_', ' ').title()}"
            story.append(Paragraph(chart_title, styles['Heading3']))
            story.append(Spacer(1, 0.1*inch))

            try:
                img = Image(str(chart_path), width=6*inch, height=4*inch)
                story.append(img)
            except Exception as e:
                # A corrupt/unreadable image should not abort the report.
                error_msg = f"Error loading chart: {e}"
                story.append(Paragraph(error_msg, styles['Normal']))

            if i < len(charts):
                story.append(PageBreak())

    doc.build(story)

    print(f"PDF report generated: {output_path}")

    return output_path


def generate_simple_report(charts, title=None, output_filename=None):
    """
    Generate a simple PDF report (wrapper with defaults).

    Args:
        charts: List of chart file paths
        title: Report title
        output_filename: Output filename

    Returns:
        Path: Path to generated PDF
    """
    return generate_pdf_report(
        charts=charts,
        title=title,
        output_filename=output_filename
    )


# ============================================================================
# EXAMPLE USAGE
# ============================================================================

if __name__ == "__main__":
    """Example usage"""
    from config import OUTPUT_DIR

    # Find charts in output directory.
    chart_files = list(OUTPUT_DIR.glob('*.png'))

    if chart_files:
        print(f"Found {len(chart_files)} charts")

        report_path = generate_pdf_report(
            charts=[str(f) for f in chart_files[:5]],  # Limit to 5 charts
            title="Sales Analysis Report",
            summary_data={
                'Total Charts': len(chart_files),
                'Report Date': datetime.now().strftime('%Y-%m-%d')
            }
        )

        print(f"Report saved to: {report_path}")
    else:
        print("No charts found in output directory")
"""
Batch runner for all analysis scripts
Runs all analyses in sequence and generates a summary report

To use:
1. Add your analysis scripts to the ANALYSIS_SCRIPTS list below
2. Run: python run_all_analyses.py
"""
import subprocess
import sys
from pathlib import Path
from datetime import datetime
import time

# ============================================================================
# CONFIGURATION
# ============================================================================

# List of analysis scripts to run
# TODO: Add your analysis scripts here
ANALYSIS_SCRIPTS = [
    # Example structure - customize for your analyses:
    # 'check_annual_revenue.py',
    # 'revenue_analysis.py',
    # 'geographic_analysis.py',
    # 'customer_segmentation.py',
    # 'product_analysis.py',
    # Add your analysis scripts here...
]

# Timeout per script (in seconds)
SCRIPT_TIMEOUT = 600  # 10 minutes

# ============================================================================
# HELPER FUNCTIONS
# ============================================================================

def run_script(script_path):
    """
    Run a single analysis script in a subprocess.

    Args:
        script_path: Path to the Python script to run

    Returns:
        tuple: (success: bool, elapsed_seconds: float, error: str or None)
    """
    script_name = Path(script_path).name
    print(f"\n{'='*60}")
    print(f"Running: {script_name}")
    print(f"{'='*60}")

    start_time = time.time()

    try:
        # BUG FIX: force UTF-8 decoding of child output. With text=True alone
        # the locale encoding is used; the analysis scripts print emoji, so a
        # non-UTF-8 locale (e.g. Windows cp1252) would raise
        # UnicodeDecodeError here. errors='replace' keeps the runner alive
        # even for malformed output.
        result = subprocess.run(
            [sys.executable, script_path],
            capture_output=True,
            text=True,
            encoding='utf-8',
            errors='replace',
            timeout=SCRIPT_TIMEOUT
        )

        elapsed = time.time() - start_time

        if result.returncode == 0:
            print(f"✅ {script_name} completed successfully ({elapsed:.1f}s)")
            if result.stdout:
                # Print at most the last 10 lines of output.
                lines = result.stdout.strip().split('\n')
                if len(lines) > 10:
                    print("   ... (output truncated)")
                    for line in lines[-10:]:
                        print(f"   {line}")
                else:
                    for line in lines:
                        print(f"   {line}")
            return True, elapsed, None
        else:
            print(f"āŒ {script_name} failed ({elapsed:.1f}s)")
            if result.stderr:
                print(f"   Error: {result.stderr[:500]}")
            return False, elapsed, result.stderr

    except subprocess.TimeoutExpired:
        elapsed = time.time() - start_time
        print(f"ā±ļø  {script_name} timed out after {elapsed:.1f}s")
        return False, elapsed, "Timeout"
    except Exception as e:
        elapsed = time.time() - start_time
        print(f"āŒ {script_name} error: {str(e)}")
        return False, elapsed, str(e)

# ============================================================================
# MAIN FUNCTION
# ============================================================================

def main():
    """Run all configured analysis scripts and write a summary file."""
    from config import COMPANY_NAME

    print(f"\n{'='*60}")
    print(f"{COMPANY_NAME} Sales Analysis - Batch Runner")
    print(f"Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"{'='*60}\n")

    # Partition the configured scripts into present vs missing on disk.
    existing_scripts = []
    missing_scripts = []
    for script in ANALYSIS_SCRIPTS:
        if Path(script).exists():
            existing_scripts.append(script)
        else:
            missing_scripts.append(script)

    if missing_scripts:
        print(f"āš ļø  Warning: {len(missing_scripts)} scripts not found:")
        for script in missing_scripts:
            print(f"   - {script}")
        print()

    if not existing_scripts:
        print("āŒ No analysis scripts found!")
        print("   Please add analysis scripts to ANALYSIS_SCRIPTS list in run_all_analyses.py")
        return

    print(f"Found {len(existing_scripts)} analysis scripts to run\n")

    # Run each script, collecting per-script outcomes.
    results = []
    total_start = time.time()

    for script in existing_scripts:
        success, elapsed, error = run_script(script)
        results.append({
            'script': script,
            'success': success,
            'elapsed': elapsed,
            'error': error
        })

    total_elapsed = time.time() - total_start

    # Print summary.
    print(f"\n{'='*60}")
    print("Batch Run Summary")
    print(f"{'='*60}\n")

    successful = [r for r in results if r['success']]
    failed = [r for r in results if not r['success']]

    print(f"Total scripts: {len(results)}")
    print(f"✅ Successful: {len(successful)}")
    print(f"āŒ Failed: {len(failed)}")
    print(f"ā±ļø  Total time: {total_elapsed/60:.1f} minutes\n")

    if failed:
        print("Failed scripts:")
        for r in failed:
            print(f"  āŒ {r['script']} ({r['elapsed']:.1f}s)")
            if r['error']:
                print(f"     Error: {r['error'][:100]}")
        print()

    # Save summary to file. BUG FIX: explicit UTF-8 encoding -- the summary
    # contains emoji, and the platform default encoding may not handle them.
    summary_file = Path('analysis_run_summary.txt')
    with open(summary_file, 'w', encoding='utf-8') as f:
        f.write(f"{COMPANY_NAME} Sales Analysis - Batch Run Summary\n")
        f.write(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
        f.write(f"{'='*60}\n\n")
        f.write(f"Total scripts: {len(results)}\n")
        f.write(f"Successful: {len(successful)}\n")
        f.write(f"Failed: {len(failed)}\n")
        f.write(f"Total time: {total_elapsed/60:.1f} minutes\n\n")

        if successful:
            f.write("Successful scripts:\n")
            for r in successful:
                f.write(f"  ✅ {r['script']} ({r['elapsed']:.1f}s)\n")
            f.write("\n")

        if failed:
            f.write("Failed scripts:\n")
            for r in failed:
                f.write(f"  āŒ {r['script']} ({r['elapsed']:.1f}s)\n")
                if r['error']:
                    f.write(f"     Error: {r['error']}\n")

    print(f"Summary saved to: {summary_file}")

if __name__ == "__main__":
    main()
"""
Interactive setup wizard for configuring the sales analysis template
Asks clarifying questions to configure config.py for your specific company and data
"""
import os
import re
import sys
from pathlib import Path

def print_header(text):
    """Print a formatted header"""
    print("\n" + "="*70)
    print(f"  {text}")
    print("="*70 + "\n")

def ask_question(prompt, default=None, validator=None):
    """
    Ask a question on stdin and return the (validated) answer.

    Args:
        prompt: Question to ask
        default: Default value if user just presses Enter
        validator: Optional function to validate/convert input

    Returns:
        User's answer, passed through `validator` when one is given.
    """
    if default:
        full_prompt = f"{prompt} [{default}]: "
    else:
        full_prompt = f"{prompt}: "

    while True:
        answer = input(full_prompt).strip()

        if not answer:
            if not default:
                print("  Please provide an answer.")
                continue
            # BUG FIX: the original returned the default *without* running the
            # validator, so e.g. accepting the default "yes" for a yes/no
            # question stored the string "yes" instead of True, which later
            # produced invalid Python (LTM_ENABLED = yes) in config.py.
            answer = default

        if validator:
            try:
                return validator(answer)
            except Exception as e:
                print(f"  Invalid input: {e}")
                continue

        return answer

def validate_yes_no(answer):
    """Validate yes/no answer; returns a bool."""
    answer_lower = answer.lower()
    if answer_lower in ['y', 'yes', 'true', '1']:
        return True
    elif answer_lower in ['n', 'no', 'false', '0']:
        return False
    else:
        raise ValueError("Please answer 'yes' or 'no'")

def validate_int(answer):
    """Validate integer answer; returns an int."""
    return int(answer)

def validate_file_exists(answer):
    """Validate that a file exists; returns the path string."""
    if not Path(answer).exists():
        raise ValueError(f"File not found: {answer}")
    return answer

def _gather_responses():
    """Interactively collect all configuration answers; returns a dict."""
    responses = {}

    # Company Information
    print_header("Company Information")
    responses['company_name'] = ask_question("Company Name", default="Your Company Name")
    responses['analysis_date'] = ask_question("Analysis Date (YYYY-MM-DD)", default="2026-01-12")

    # Data File
    print_header("Data File Configuration")
    print("Where is your sales data CSV file located?")
    data_file = ask_question("Data file name (e.g., sales_data.csv)", default="sales_data.csv")

    if Path(data_file).exists():
        print(f"  ✓ Found: {data_file}")
    else:
        print(f"  ⚠ Warning: {data_file} not found. Make sure to place it in the template directory.")

    responses['data_file'] = data_file

    # Column Mapping
    print_header("Column Mapping")
    print("What are the column names in your CSV file?")
    print("(Press Enter to accept defaults if your columns match common names)\n")

    responses['revenue_column'] = ask_question("Revenue/Amount column name", default="USD")
    responses['date_column'] = ask_question("Primary date column name", default="InvoiceDate")

    has_fallback = ask_question("Do you have fallback date columns (Month, Year)?", default="yes", validator=validate_yes_no)
    if has_fallback:
        fallback_str = ask_question("Fallback date columns (comma-separated)", default="Month, Year")
        responses['date_fallback'] = [col.strip() for col in fallback_str.split(',')]
    else:
        responses['date_fallback'] = []

    responses['customer_column'] = ask_question("Customer/Account column name", default="Customer")
    responses['item_column'] = ask_question("Item/Product column name", default="Item")

    has_quantity = ask_question("Do you have a Quantity column?", default="yes", validator=validate_yes_no)
    if has_quantity:
        responses['quantity_column'] = ask_question("Quantity column name", default="Quantity")
    else:
        responses['quantity_column'] = None

    # Date Range
    print_header("Date Range Configuration")
    responses['min_year'] = ask_question("Minimum year to include in analysis", default="2021", validator=validate_int)
    responses['max_date'] = ask_question("Maximum date (YYYY-MM-DD)", default="2025-09-30")

    years_str = ask_question("Analysis years (comma-separated, e.g., 2021,2022,2023,2024,2025)", default="2021,2022,2023,2024,2025")
    responses['analysis_years'] = [int(y.strip()) for y in years_str.split(',')]

    # LTM Configuration
    print_header("LTM (Last Twelve Months) Configuration")
    print("LTM is used for the most recent partial year to enable apples-to-apples comparison.")
    print("Example: If your latest data is through September 2025, use Oct 2024 - Sep 2025.\n")

    use_ltm = ask_question("Do you need LTM for the most recent year?", default="yes", validator=validate_yes_no)
    responses['ltm_enabled'] = use_ltm

    if use_ltm:
        responses['ltm_start_month'] = ask_question("LTM start month (1-12)", default="10", validator=validate_int)
        responses['ltm_start_year'] = ask_question("LTM start year", default="2024", validator=validate_int)
        responses['ltm_end_month'] = ask_question("LTM end month (1-12)", default="9", validator=validate_int)
        responses['ltm_end_year'] = ask_question("LTM end year", default="2025", validator=validate_int)
    else:
        # Keep sensible placeholders so config.py stays internally consistent.
        responses['ltm_start_month'] = 10
        responses['ltm_start_year'] = 2024
        responses['ltm_end_month'] = 9
        responses['ltm_end_year'] = 2025

    # Exclusion Filters
    print_header("Exclusion Filters (Optional)")
    use_exclusions = ask_question("Do you need to exclude specific segments (e.g., test accounts, business units)?", default="no", validator=validate_yes_no)
    responses['exclusions_enabled'] = use_exclusions

    if use_exclusions:
        responses['exclude_column'] = ask_question("Column name to filter on", default="Country")
        exclude_values_str = ask_question("Values to exclude (comma-separated)", default="")
        responses['exclude_values'] = [v.strip() for v in exclude_values_str.split(',') if v.strip()]
    else:
        responses['exclude_column'] = None
        responses['exclude_values'] = []

    return responses

def _apply_config(responses):
    """Rewrite config.py in place using the collected responses."""
    print_header("Generating Configuration")
    print("Updating config.py with your settings...")

    config_path = Path('config.py')
    if not config_path.exists():
        print("ERROR: config.py not found!")
        return

    with open(config_path, 'r', encoding='utf-8') as f:
        config_content = f.read()

    # Literal template values -> user-configured values. These must match
    # the shipped config.py byte-for-byte to be replaced.
    replacements = {
        "COMPANY_NAME = \"Your Company Name\"": f"COMPANY_NAME = \"{responses['company_name']}\"",
        "ANALYSIS_DATE = \"2026-01-12\"": f"ANALYSIS_DATE = \"{responses['analysis_date']}\"",
        "DATA_FILE = 'sales_data.csv'": f"DATA_FILE = '{responses['data_file']}'",
        "REVENUE_COLUMN = 'USD'": f"REVENUE_COLUMN = '{responses['revenue_column']}'",
        "DATE_COLUMN = 'InvoiceDate'": f"DATE_COLUMN = '{responses['date_column']}'",
        "DATE_FALLBACK_COLUMNS = ['Month', 'Year']": f"DATE_FALLBACK_COLUMNS = {responses['date_fallback']}",
        "CUSTOMER_COLUMN = 'Customer'": f"CUSTOMER_COLUMN = '{responses['customer_column']}'",
        "ITEM_COLUMN = 'Item'": f"ITEM_COLUMN = '{responses['item_column']}'",
        "QUANTITY_COLUMN = 'Quantity'": f"QUANTITY_COLUMN = '{responses['quantity_column']}'" if responses['quantity_column'] else "QUANTITY_COLUMN = None",
        "MIN_YEAR = 2021": f"MIN_YEAR = {responses['min_year']}",
        "MAX_DATE = pd.Timestamp('2025-09-30')": f"MAX_DATE = pd.Timestamp('{responses['max_date']}')",
        "ANALYSIS_YEARS = [2021, 2022, 2023, 2024, 2025]": f"ANALYSIS_YEARS = {responses['analysis_years']}",
        "LTM_ENABLED = True": f"LTM_ENABLED = {responses['ltm_enabled']}",
        "LTM_START_MONTH = 10": f"LTM_START_MONTH = {responses['ltm_start_month']}",
        "LTM_START_YEAR = 2024": f"LTM_START_YEAR = {responses['ltm_start_year']}",
        "LTM_END_MONTH = 9": f"LTM_END_MONTH = {responses['ltm_end_month']}",
        "LTM_END_YEAR = 2025": f"LTM_END_YEAR = {responses['ltm_end_year']}",
    }

    # Replace the whole EXCLUSION_FILTERS dict (non-greedy, DOTALL so the
    # multi-line literal is matched up to its closing brace).
    if responses['exclusions_enabled']:
        exclusions_config = f"""EXCLUSION_FILTERS = {{
    'enabled': True,
    'exclude_by_column': '{responses['exclude_column']}',
    'exclude_values': {responses['exclude_values']}
}}"""
    else:
        exclusions_config = """EXCLUSION_FILTERS = {
    'enabled': False,
    'exclude_by_column': None,
    'exclude_values': []
}"""
    pattern = r"EXCLUSION_FILTERS = \{.*?\}"
    config_content = re.sub(pattern, exclusions_config, config_content, flags=re.DOTALL)

    for old, new in replacements.items():
        if old in config_content:
            config_content = config_content.replace(old, new)

    with open(config_path, 'w', encoding='utf-8') as f:
        f.write(config_content)

    print("  ✓ Configuration updated successfully!")

def main():
    """Run the setup wizard"""
    print_header("Sales Analysis Template - Setup Wizard")
    print("This wizard will help you configure the template for your company's data.")
    print("You can press Enter to accept defaults (shown in brackets).\n")

    # Collect answers, then write them into config.py.
    responses = _gather_responses()
    _apply_config(responses)

    # Summary
    print_header("Setup Complete")
    print("Your configuration has been saved to config.py")
    print("\nNext steps:")
    print("1. Place your data file in the template directory (if not already there)")
    print("2. Test data loading: python -c \"from data_loader import load_sales_data; from config import get_data_path; df = load_sales_data(get_data_path()); print(f'Loaded {len(df):,} rows')\"")
    print("3. Review config.py and adjust any settings as needed")
    print("4. Start creating your analysis scripts using analysis_template.py")
    print("\nFor help, see README.md")

if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        print("\n\nSetup cancelled by user.")
        sys.exit(0)
Returns -10.0 + """ + if previous == 0: + return np.nan if current == 0 else np.inf + + return ((current - previous) / previous) * 100 + +def calculate_cagr(start_value, end_value, periods): + """ + Calculate Compound Annual Growth Rate (CAGR) + + Args: + start_value: Starting value + end_value: Ending value + periods: Number of periods (years) + + Returns: + float: CAGR as percentage + + Example: + calculate_cagr(100, 150, 3) # Returns ~14.47% + """ + if start_value <= 0 or periods <= 0: + return np.nan + + if end_value <= 0: + return np.nan + + cagr = ((end_value / start_value) ** (1 / periods) - 1) * 100 + return cagr + +def calculate_correlation(df, col1, col2): + """ + Calculate correlation between two columns + + Args: + df: DataFrame + col1: First column name + col2: Second column name + + Returns: + float: Correlation coefficient (-1 to 1) + """ + if col1 not in df.columns or col2 not in df.columns: + return np.nan + + # Convert to numeric + series1 = pd.to_numeric(df[col1], errors='coerce') + series2 = pd.to_numeric(df[col2], errors='coerce') + + # Remove NaN pairs + valid_mask = series1.notna() & series2.notna() + if valid_mask.sum() < 2: + return np.nan + + correlation = series1[valid_mask].corr(series2[valid_mask]) + return correlation + +def calculate_trend_slope(y_values): + """ + Calculate linear trend slope + + Args: + y_values: Array-like of y values + + Returns: + float: Slope of linear trend + """ + if len(y_values) < 2: + return np.nan + + x_values = np.arange(len(y_values)) + + # Remove NaN values + valid_mask = ~np.isnan(y_values) + if valid_mask.sum() < 2: + return np.nan + + x_valid = x_values[valid_mask] + y_valid = y_values[valid_mask] + + slope, intercept, r_value, p_value, std_err = stats.linregress(x_valid, y_valid) + return slope + +def calculate_percent_change(series, periods=1): + """ + Calculate percent change over periods + + Args: + series: Pandas Series + periods: Number of periods to shift (default: 1) + + Returns: + Series: 
Percent change + """ + return series.pct_change(periods=periods) * 100 + +def calculate_moving_average(series, window=3): + """ + Calculate moving average + + Args: + series: Pandas Series + window: Window size for moving average + + Returns: + Series: Moving average + """ + return series.rolling(window=window, center=False).mean() + +def calculate_volatility(series, window=12): + """ + Calculate rolling volatility (standard deviation) + + Args: + series: Pandas Series + window: Window size for rolling calculation + + Returns: + Series: Rolling volatility + """ + return series.rolling(window=window, center=False).std() + +def calculate_z_score(value, mean, std): + """ + Calculate z-score + + Args: + value: Value to score + mean: Mean of distribution + std: Standard deviation of distribution + + Returns: + float: Z-score + """ + if std == 0: + return np.nan + + return (value - mean) / std + +def test_statistical_significance(group1, group2, alpha=0.05): + """ + Test statistical significance between two groups (t-test) + + Args: + group1: First group (array-like) + group2: Second group (array-like) + alpha: Significance level (default: 0.05) + + Returns: + dict: Test results with p-value, significant flag, etc. 
+ """ + group1 = np.array(group1) + group2 = np.array(group2) + + # Remove NaN values + group1 = group1[~np.isnan(group1)] + group2 = group2[~np.isnan(group2)] + + if len(group1) < 2 or len(group2) < 2: + return { + 'p_value': np.nan, + 'significant': False, + 'test_statistic': np.nan, + 'error': 'Insufficient data' + } + + # Perform t-test + t_statistic, p_value = stats.ttest_ind(group1, group2) + + return { + 'p_value': float(p_value), + 'significant': p_value < alpha, + 'test_statistic': float(t_statistic), + 'alpha': alpha, + 'group1_mean': float(np.mean(group1)), + 'group2_mean': float(np.mean(group2)), + 'group1_std': float(np.std(group1)), + 'group2_std': float(np.std(group2)) + } + +def calculate_confidence_interval(series, confidence=0.95): + """ + Calculate confidence interval for a series + + Args: + series: Pandas Series + confidence: Confidence level (default: 0.95 for 95%) + + Returns: + dict: Mean, lower bound, upper bound + """ + series_clean = series.dropna() + + if len(series_clean) == 0: + return { + 'mean': np.nan, + 'lower': np.nan, + 'upper': np.nan, + 'confidence': confidence + } + + mean = series_clean.mean() + std = series_clean.std() + n = len(series_clean) + + # Calculate standard error + se = std / np.sqrt(n) + + # Calculate critical value (z-score for normal distribution) + alpha = 1 - confidence + z_critical = stats.norm.ppf(1 - alpha/2) + + margin = z_critical * se + + return { + 'mean': float(mean), + 'lower': float(mean - margin), + 'upper': float(mean + margin), + 'confidence': confidence, + 'margin': float(margin) + } + +def calculate_annual_growth_rates(values, years): + """ + Calculate year-over-year growth rates for annual data + + Args: + values: Array-like of annual values + years: Array-like of corresponding years + + Returns: + DataFrame: Years, values, and growth rates + """ + df = pd.DataFrame({ + 'Year': years, + 'Value': values + }) + + df['YoY_Growth'] = calculate_percent_change(df['Value']) + df['YoY_Change'] = 
df['Value'].diff() + + return df + +def calculate_seasonality_index(monthly_series): + """ + Calculate seasonality index for monthly data + + Args: + monthly_series: Series with datetime index (monthly frequency) + + Returns: + Series: Seasonality index (1.0 = average, >1.0 = above average, <1.0 = below average) + """ + if not isinstance(monthly_series.index, pd.DatetimeIndex): + raise ValueError("Series must have DatetimeIndex") + + # Extract month + monthly_series = monthly_series.copy() + monthly_series['Month'] = monthly_series.index.month + + # Calculate average by month + monthly_avg = monthly_series.groupby('Month').mean() + overall_avg = monthly_series.mean() + + # Calculate seasonality index + seasonality = monthly_avg / overall_avg + + return seasonality + +# ============================================================================ +# EXAMPLE USAGE +# ============================================================================ + +if __name__ == "__main__": + """Example usage""" + # YoY Growth + growth = calculate_yoy_growth(110, 100) + print(f"Year-over-year growth: {growth:.2f}%") + + # CAGR + cagr = calculate_cagr(100, 150, 3) + print(f"CAGR: {cagr:.2f}%") + + # Sample data for correlation + df = pd.DataFrame({ + 'Revenue': [100, 110, 120, 130, 140], + 'Quantity': [10, 11, 12, 13, 14] + }) + corr = calculate_correlation(df, 'Revenue', 'Quantity') + print(f"Correlation: {corr:.2f}") diff --git a/tests/test_analysis_utils.py b/tests/test_analysis_utils.py new file mode 100644 index 0000000..808276f --- /dev/null +++ b/tests/test_analysis_utils.py @@ -0,0 +1,85 @@ +""" +Unit tests for analysis_utils.py +""" +import pytest +import pandas as pd +import numpy as np +from pathlib import Path +import sys + +# Add parent directory to path +sys.path.insert(0, str(Path(__file__).parent.parent)) + +from analysis_utils import ( + millions_formatter, thousands_formatter, + get_millions_formatter, get_thousands_formatter, + format_currency, 
# --- tests/test_analysis_utils.py ---
# NOTE: these classes exercise helpers imported at module level from
# analysis_utils (millions_formatter, thousands_formatter, format_currency,
# calculate_price_per_unit, sort_mixed_years, safe_year_labels).

class TestFormatters:
    """Tests for the currency axis/label formatting helpers."""

    def test_millions_formatter(self):
        """Values are rendered as '$X.Xm' with one decimal place."""
        assert millions_formatter(10.5, None) == '$10.5m'
        assert millions_formatter(0, None) == '$0.0m'
        assert millions_formatter(100.0, None) == '$100.0m'

    def test_thousands_formatter(self):
        """Values are rendered as '$X.Xk' with one decimal place."""
        assert thousands_formatter(10.5, None) == '$10.5k'
        assert thousands_formatter(0, None) == '$0.0k'

    def test_format_currency(self):
        """format_currency scales to millions/thousands and maps NaN to 'N/A'."""
        assert format_currency(1000000) == '$1.00m'
        assert format_currency(1000, millions=False) == '$1.00k'
        assert format_currency(np.nan) == 'N/A'

class TestPriceCalculation:
    """Tests for revenue-weighted price-per-unit calculation."""

    def test_calculate_price_per_unit(self):
        """Price is total revenue divided by total quantity."""
        frame = pd.DataFrame({
            'Quantity': [10, 20, 30],
            'Revenue': [100, 200, 300]
        })

        # (100 + 200 + 300) / (10 + 20 + 30)
        assert calculate_price_per_unit(frame, 'Quantity', 'Revenue') == 10.0

    def test_calculate_price_per_unit_with_outliers(self):
        """Rows above the default quantity outlier cutoff (1000) are excluded."""
        frame = pd.DataFrame({
            'Quantity': [10, 20, 30, 2000],  # 2000 is an outlier
            'Revenue': [100, 200, 300, 10000]
        })

        # Only the first three rows should contribute.
        assert calculate_price_per_unit(frame, 'Quantity', 'Revenue') == 10.0

class TestYearHandling:
    """Tests for mixed int/str year sorting and labelling."""

    def test_sort_mixed_years(self):
        """Numeric years sort ascending; string LTM labels sort last."""
        frame = pd.DataFrame({
            'Year': [2023, '2025 (LTM)', 2024, 2022],
            'Value': [100, 150, 120, 90]
        })

        ordered = sort_mixed_years(frame, 'Year')
        assert ordered['Year'].iloc[0] == 2022
        assert ordered['Year'].iloc[-1] == '2025 (LTM)'

    def test_safe_year_labels(self):
        """Every year value is converted to a display string."""
        labels = safe_year_labels([2021, 2022, '2025 (LTM)'])
        assert labels == ['2021', '2022', '2025 (LTM)']

if __name__ == "__main__":
    pytest.main([__file__, '-v'])

# --- tests/test_config_validator.py ---

class TestConfigValidator:
    """Tests for configuration validation against a DataFrame."""

    def test_validate_config_missing_column(self):
        """Validation reports errors when required columns are absent."""
        frame = pd.DataFrame({
            'SomeColumn': [1, 2, 3]
        })

        errors, warnings = validate_config(frame)

        # Missing required columns must surface as errors.
        assert len(errors) > 0
        assert any('not found' in error.lower() for error in errors)

    def test_validate_config_valid_data(self):
        """Well-formed data yields no critical 'column not found' errors."""
        frame = pd.DataFrame({
            'InvoiceDate': pd.to_datetime(['2023-01-01', '2023-02-01']),
            'USD': [100.0, 200.0],
            'Year': [2023, 2023]
        })

        errors, warnings = validate_config(frame)

        # BUG FIX: the original condition was
        #   'not found' in e.lower() and 'USD' in e or 'InvoiceDate' in e
        # and because `and` binds tighter than `or`, every error mentioning
        # 'InvoiceDate' was flagged critical regardless of the 'not found'
        # check. Parenthesize the intended grouping.
        critical_errors = [
            e for e in errors
            if 'not found' in e.lower() and ('USD' in e or 'InvoiceDate' in e)
        ]
        assert len(critical_errors) == 0

if __name__ == "__main__":
    pytest.main([__file__, '-v'])
# --- tests/test_data_loader.py ---

class TestDataLoader:
    """Integration tests for data loading and structural validation."""

    def test_load_sales_data_basic(self):
        """Loading a minimal CSV adds the derived Year/YearMonth columns."""
        # Create a temporary CSV fixture.
        with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False) as f:
            f.write('InvoiceDate,USD,Customer\n')
            f.write('2023-01-01,100.0,Customer1\n')
            f.write('2023-02-01,200.0,Customer2\n')
            temp_path = f.name

        try:
            # Temporarily point config at the fixture.
            import config
            original_data_file = config.DATA_FILE
            config.DATA_FILE = Path(temp_path).name
            try:
                df = load_sales_data(Path(temp_path))

                assert len(df) == 2
                assert 'Year' in df.columns
                assert 'YearMonth' in df.columns
            finally:
                # BUG FIX: restore even when loading or an assertion fails;
                # the original restored only on the success path, leaking the
                # patched DATA_FILE into subsequent tests.
                config.DATA_FILE = original_data_file
        finally:
            os.unlink(temp_path)

    def test_validate_data_structure(self):
        """Structure check passes with required columns, fails without them."""
        # Valid DataFrame.
        df_valid = pd.DataFrame({
            'InvoiceDate': pd.to_datetime(['2023-01-01', '2023-02-01']),
            'USD': [100.0, 200.0]
        })

        is_valid, msg = validate_data_structure(df_valid)
        assert is_valid
        assert msg == "OK"

        # Invalid DataFrame (missing required column).
        df_invalid = pd.DataFrame({
            'InvoiceDate': pd.to_datetime(['2023-01-01'])
        })

        is_valid, msg = validate_data_structure(df_invalid)
        assert not is_valid
        assert 'Missing required column' in msg

if __name__ == "__main__":
    pytest.main([__file__, '-v'])

# --- validate_revenue.py ---

def validate_revenue(dataframe: pd.DataFrame, analysis_name: str = "Analysis") -> None:
    """
    Print an annual revenue summary for validation.

    This function helps ensure that:
    1. Data loading is working correctly
    2. Revenue calculations are consistent
    3. Filters are not accidentally excluding too much data

    Args:
        dataframe: DataFrame with revenue and date columns (must have
            REVENUE_COLUMN and a 'Year' column)
        analysis_name: Name of the analysis (for logging/display)

    Example:
        >>> validate_revenue(df, "Revenue Analysis")
        >>> # Prints annual revenue summary by year
    """
    import re  # hoisted: the original re-imported inside the per-year loop

    df = dataframe.copy()

    # Ensure the date column is datetime (mixed-format tolerant).
    from config import DATE_COLUMN
    if DATE_COLUMN in df.columns:
        df[DATE_COLUMN] = pd.to_datetime(df[DATE_COLUMN], errors='coerce', format='mixed')

    # Restrict to the configured analysis window.
    df = df[df['Year'].isin(ANALYSIS_YEARS)]

    # Calculate annual revenue per year label (LTM-aware when enabled).
    annual_revenue = {}
    ltm_start, ltm_end = get_ltm_period() if LTM_ENABLED else (None, None)

    # Hoisted: the original recomputed df['Year'].unique() on every iteration.
    years_present = set(df['Year'].unique())
    for year in sorted(ANALYSIS_YEARS):
        if year in years_present:
            year_data, year_label = get_annual_data(df, year, ltm_start, ltm_end)
            if len(year_data) > 0:
                annual_revenue[year_label] = year_data[REVENUE_COLUMN].sum()

    # Print summary.
    print(f"\n{'='*60}")
    print(f"Annual Revenue Validation - {analysis_name}")
    print(f"{'='*60}")

    if annual_revenue:
        for year_label, revenue in annual_revenue.items():
            formatted = f"${revenue / 1e6:.2f}m"
            print(f"  {year_label}: {formatted}")

        # Validation against expected values from config.
        if VALIDATION_ENABLED and EXPECTED_REVENUE:
            print(f"\nValidation Check:")
            all_valid = True
            for year_label, actual_revenue in annual_revenue.items():
                # Match the year label back to a plain year key, e.g.
                # "2025 (LTM 9/2025)" -> 2025.
                if isinstance(year_label, str):
                    year_match = re.search(r'(\d{4})', str(year_label))
                    year_key = int(year_match.group(1)) if year_match else None
                else:
                    year_key = year_label

                if year_key in EXPECTED_REVENUE:
                    expected = EXPECTED_REVENUE[year_key]
                    tolerance = expected * REVENUE_TOLERANCE_PCT
                    diff = abs(actual_revenue - expected)

                    if diff <= tolerance:
                        print(f"  āœ“ {year_label}: Within tolerance ({diff/1e6:.2f}m difference)")
                    else:
                        print(f"  āœ— {year_label}: Outside tolerance (expected ${expected/1e6:.2f}m, got ${actual_revenue/1e6:.2f}m, diff: ${diff/1e6:.2f}m)")
                        all_valid = False

            if all_valid:
                print("  All validations passed!")
            else:
                print("  WARNING: Some validations failed. Check data loading and filters.")
    else:
        print("  No revenue data found for analysis years")

    print(f"{'='*60}\n")