Initial commit: sales analysis template

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
Jonathan Pressnell
2026-02-06 09:16:34 -05:00
commit cf0b596449
38 changed files with 8001 additions and 0 deletions

View File

@@ -0,0 +1,307 @@
# Advanced Analysis Patterns
This document provides patterns for sophisticated, production-grade analyses that leverage the full capabilities of the template framework.
## ⭐ Using Cursor AI Effectively
When working in Cursor, you can ask the AI to:
- "Create a cohort analysis script using the template patterns"
- "Add statistical significance testing to this analysis"
- "Generate a multi-dimensional analysis with product, customer, and geography"
- "Create a forecasting analysis with confidence intervals"
The AI will automatically use these patterns and utilities.
## Advanced Analysis Types
### 1. Multi-Dimensional Analysis
**Pattern:** Analyze across multiple dimensions simultaneously (e.g., Product × Customer × Geography)
```python
from data_loader import load_sales_data
from analysis_utils import calculate_annual_metrics, get_ltm_period_config
from config import REVENUE_COLUMN, ITEM_COLUMN, CUSTOMER_COLUMN, REGION_COLUMN
df = load_sales_data(get_data_path())
# Multi-dimensional pivot
pivot = df.pivot_table(
index=[ITEM_COLUMN, CUSTOMER_COLUMN],
columns=REGION_COLUMN,
values=REVENUE_COLUMN,
aggfunc='sum',
fill_value=0
)
# Or use data_processing helper
from data_processing import create_pivot_table
pivot = create_pivot_table(
df,
index=[ITEM_COLUMN, CUSTOMER_COLUMN],
columns=REGION_COLUMN,
values=REVENUE_COLUMN
)
```
### 2. Cohort Analysis with Retention Metrics
**Pattern:** Track customer cohorts over time with retention and revenue metrics
```python
from examples.cohort_analysis import create_cohorts, calculate_cohort_metrics
df_cohort = create_cohorts(df)
cohort_metrics = calculate_cohort_metrics(df_cohort)
# Calculate Net Revenue Retention (NRR)
nrr = cohort_metrics.groupby('Cohort').agg({
'Revenue_Retention': lambda x: x.iloc[-1] if len(x) > 0 else 0
})
```
### 3. Statistical Significance Testing
**Pattern:** Compare segments with statistical tests
```python
from statistical_utils import test_statistical_significance
# Compare two groups
group1 = df[df['Segment'] == 'A'][REVENUE_COLUMN]
group2 = df[df['Segment'] == 'B'][REVENUE_COLUMN]
result = test_statistical_significance(group1, group2)
if result['significant']:
print(f"Significant difference (p={result['p_value']:.4f})")
```
### 4. Price-Volume-Mix (PVM) Decomposition
**Pattern:** Decompose revenue changes into price, volume, and mix effects
```python
from config import QUANTITY_COLUMN, REVENUE_COLUMN
def pvm_decomposition(df_base, df_current):
"""Decompose revenue change into price, volume, mix effects"""
base_price = df_base[REVENUE_COLUMN].sum() / df_base[QUANTITY_COLUMN].sum()
current_price = df_current[REVENUE_COLUMN].sum() / df_current[QUANTITY_COLUMN].sum()
base_volume = df_base[QUANTITY_COLUMN].sum()
current_volume = df_current[QUANTITY_COLUMN].sum()
# Price effect
price_effect = (current_price - base_price) * base_volume
# Volume effect
volume_effect = (current_volume - base_volume) * base_price
# Mix effect (residual)
total_change = df_current[REVENUE_COLUMN].sum() - df_base[REVENUE_COLUMN].sum()
mix_effect = total_change - price_effect - volume_effect
return {
'price_effect': price_effect,
'volume_effect': volume_effect,
'mix_effect': mix_effect,
'total_change': total_change
}
```
### 5. Time Series Forecasting
**Pattern:** Forecast future revenue with confidence intervals
```python
from data_processing import prepare_time_series
from statistical_utils import calculate_confidence_interval
# Prepare time series
ts = prepare_time_series(df, freq='M')
# Simple forecast (extend trend)
from scipy import stats
x = np.arange(len(ts))
slope, intercept, r_value, p_value, std_err = stats.linregress(x, ts.values)
# Forecast next 12 months
future_x = np.arange(len(ts), len(ts) + 12)
forecast = slope * future_x + intercept
# Calculate confidence intervals
ci = calculate_confidence_interval(ts, confidence=0.95)
```
### 6. Customer Lifetime Value (CLV) Analysis
**Pattern:** Calculate CLV using historical data
```python
from config import CUSTOMER_COLUMN, REVENUE_COLUMN, DATE_COLUMN
def calculate_clv(df, years=3):
"""Calculate customer lifetime value"""
customer_metrics = df.groupby(CUSTOMER_COLUMN).agg({
REVENUE_COLUMN: 'sum',
DATE_COLUMN: ['min', 'max', 'count']
}).reset_index()
customer_metrics.columns = [CUSTOMER_COLUMN, 'Total_Revenue', 'First_Purchase', 'Last_Purchase', 'Order_Count']
# Calculate customer age (years)
customer_metrics['Customer_Age_Years'] = (
(customer_metrics['Last_Purchase'] - customer_metrics['First_Purchase']).dt.days / 365.25
)
# Annual revenue
customer_metrics['Annual_Revenue'] = customer_metrics['Total_Revenue'] / customer_metrics['Customer_Age_Years'].replace(0, 1)
# Projected CLV
customer_metrics['CLV'] = customer_metrics['Annual_Revenue'] * years
return customer_metrics
```
### 7. Market Basket Analysis
**Pattern:** Find product associations and cross-sell opportunities
```python
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder
# Prepare transaction data
transactions = df.groupby(INVOICE_NUMBER_COLUMN)[ITEM_COLUMN].apply(list).tolist()
# Encode transactions
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
df_encoded = pd.DataFrame(te_ary, columns=te.columns_)
# Find frequent itemsets
frequent_itemsets = apriori(df_encoded, min_support=0.01, use_colnames=True)
# Generate association rules
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5)
```
### 8. Segmentation with Machine Learning
**Pattern:** Advanced customer segmentation using clustering
```python
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
# Prepare features
features = df.groupby(CUSTOMER_COLUMN).agg({
REVENUE_COLUMN: ['sum', 'mean', 'count'],
DATE_COLUMN: lambda x: (x.max() - x.min()).days
}).reset_index()
features.columns = [CUSTOMER_COLUMN, 'Total_Revenue', 'Avg_Order', 'Order_Count', 'Customer_Tenure']
# Scale features
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features[['Total_Revenue', 'Avg_Order', 'Order_Count', 'Customer_Tenure']])
# Cluster
kmeans = KMeans(n_clusters=5, random_state=42)
features['Segment'] = kmeans.fit_predict(features_scaled)
```
### 9. Anomaly Detection
**Pattern:** Identify unusual patterns in data
```python
from statistical_utils import calculate_z_score
# Calculate z-scores for revenue
mean_revenue = df[REVENUE_COLUMN].mean()
std_revenue = df[REVENUE_COLUMN].std()
df['Revenue_Z_Score'] = df[REVENUE_COLUMN].apply(
lambda x: calculate_z_score(x, mean_revenue, std_revenue)
)
# Flag anomalies (|z| > 3)
df['Is_Anomaly'] = df['Revenue_Z_Score'].abs() > 3
```
### 10. Competitive Analysis Framework
**Pattern:** Compare performance across dimensions
```python
from statistical_utils import calculate_yoy_growth, calculate_cagr
def competitive_analysis(df, dimension_col):
"""Compare performance across dimension (e.g., products, regions)"""
analysis = df.groupby(dimension_col).agg({
REVENUE_COLUMN: ['sum', 'mean', 'count']
}).reset_index()
analysis.columns = [dimension_col, 'Total_Revenue', 'Avg_Order', 'Order_Count']
# Calculate growth rates
for year in sorted(df['Year'].unique())[1:]:
prev_year = year - 1
current = df[df['Year'] == year].groupby(dimension_col)[REVENUE_COLUMN].sum()
previous = df[df['Year'] == prev_year].groupby(dimension_col)[REVENUE_COLUMN].sum()
growth = calculate_yoy_growth(current, previous)
analysis[f'Growth_{year}'] = growth
return analysis
```
## Best Practices for Advanced Analyses
1. **Always validate data quality first:**
```python
from data_quality import generate_data_quality_report
report = generate_data_quality_report(df)
```
2. **Use logging for complex analyses:**
```python
from logger_config import get_logger
logger = get_logger('advanced_analysis')
logger.info("Starting complex analysis...")
```
3. **Export intermediate results:**
```python
from export_utils import export_to_excel
export_to_excel(intermediate_df, 'intermediate_results.xlsx')
```
4. **Generate comprehensive reports:**
```python
from report_generator import generate_pdf_report
generate_pdf_report(charts=['chart1.png', 'chart2.png'], summary_data=summary)
```
5. **Test statistical significance:**
```python
from statistical_utils import test_statistical_significance
# Always test before making conclusions
```
## Cursor AI Prompts for Advanced Analyses
When using Cursor, try these prompts:
- **"Create a cohort retention analysis with heatmaps"**
- **"Build a price-volume-mix decomposition analysis"**
- **"Generate a customer lifetime value analysis with segmentation"**
- **"Create a forecasting model with confidence intervals"**
- **"Build a multi-dimensional analysis across product, customer, and geography"**
- **"Create an anomaly detection analysis for unusual transactions"**
The AI will automatically use these patterns and the template utilities.
---
**Last Updated:** January 2026
**For:** Advanced users and AI-assisted development

View File

@@ -0,0 +1,316 @@
# AI Assistant Guide for Sales Analysis Template
This guide helps you effectively use Cursor's AI assistant to create sophisticated sales analyses.
## 🎯 Quick Start with AI
### Basic Prompt Structure
When asking the AI to create an analysis, use this structure:
```
Create a [ANALYSIS_TYPE] analysis that:
1. [Specific requirement 1]
2. [Specific requirement 2]
3. Uses the sales_analysis_template patterns
4. Includes [specific visualizations/metrics]
```
### Example Prompts
**Simple Analysis:**
```
Create an annual revenue trend analysis using the template patterns,
with LTM support and proper chart formatting.
```
**Advanced Analysis:**
```
Create a customer cohort retention analysis that:
1. Groups customers by first purchase month
2. Calculates retention rates for 12 periods
3. Shows revenue retention metrics
4. Creates heatmap visualizations
5. Uses the template's cohort analysis patterns
```
**Multi-Dimensional Analysis:**
```
Create a product performance analysis across regions that:
1. Analyzes top products by revenue
2. Shows regional distribution
3. Calculates growth rates by region
4. Creates multi-panel visualizations
5. Exports results to Excel
```
## 📋 Template-Aware Prompts
The AI automatically knows about:
- `data_loader.py` - Always use this for loading data
- `analysis_utils.py` - Use utilities for formatting, LTM, etc.
- `config.py` - Use config values, never hardcode
- Template patterns - Follows best practices automatically
### What the AI Knows
When you mention the template, the AI will:
- ✅ Use `load_sales_data()` instead of `pd.read_csv()`
- ✅ Use `setup_revenue_chart()` for charts
- ✅ Divide revenue by 1e6 before plotting
- ✅ Use config values from `config.py`
- ✅ Apply exclusion filters if configured
- ✅ Validate data after loading
- ✅ Use LTM patterns correctly
## 🔧 Common AI Tasks
### 1. Create New Analysis Script
**Prompt:**
```
Create a new analysis script called [name].py that:
- Follows the template structure
- Analyzes [specific metric/dimension]
- Creates [type of visualization]
- Uses template utilities
```
**AI will:**
- Copy structure from `analysis_template.py`
- Use proper imports
- Follow template patterns
- Include validation
### 2. Add Advanced Features
**Prompt:**
```
Add statistical significance testing to [analysis].py:
- Compare [group1] vs [group2]
- Show p-values and confidence intervals
- Use statistical_utils functions
```
### 3. Fix Common Issues
**Prompt:**
```
Fix the chart formatting in [analysis].py - it's showing scientific notation.
```
**AI will:**
- Add `data / 1e6` conversion
- Use `setup_revenue_chart()`
- Fix formatting issues
### 4. Enhance Existing Analysis
**Prompt:**
```
Enhance [analysis].py to:
- Add export to Excel functionality
- Include data quality checks
- Add logging
- Generate PDF report
```
## 🚀 Advanced AI Prompts
### Multi-Step Analysis
```
Create a comprehensive customer analysis that:
1. Segments customers using RFM
2. Calculates CLV for each segment
3. Identifies at-risk customers
4. Creates cohort retention analysis
5. Generates PDF report with all charts
```
### Data Quality First
```
Before running the analysis, check data quality:
1. Run data quality report
2. Fix any critical issues
3. Validate configuration
4. Then proceed with analysis
```
### Statistical Analysis
```
Add statistical analysis to [analysis].py:
- Calculate year-over-year growth with significance testing
- Show confidence intervals for forecasts
- Test differences between segments
- Use statistical_utils functions
```
## 💡 Pro Tips
### 1. Reference Existing Examples
```
Create an analysis similar to examples/customer_segmentation.py
but for product segmentation instead.
```
### 2. Use Template Utilities
```
Use the template's export_utils to save results to Excel,
and report_generator to create a PDF report.
```
### 3. Leverage Cursor Rules
The AI automatically reads `.cursor/rules/` files, so you can say:
```
Follow the advanced_analysis_patterns.md guide to create
a price-volume-mix decomposition analysis.
```
### 4. Iterative Development
```
Start with a basic version, then enhance it:
1. First version: Simple revenue trend
2. Add: Statistical significance
3. Add: Export functionality
4. Add: PDF report generation
```
## 🎨 Visualization Prompts
### Create Specific Chart Types
```
Create a heatmap showing [metric] across [dimension1] and [dimension2],
using seaborn and following template chart formatting.
```
```
Create an interactive Plotly chart for [analysis],
saving it as HTML using the template's interactive chart functions.
```
### Multi-Panel Visualizations
```
Create a 2x2 subplot showing:
- Top left: Revenue trend
- Top right: Customer count trend
- Bottom left: Average order value
- Bottom right: Growth rates
All using template chart formatting.
```
## 📊 Data Analysis Prompts
### Cohort Analysis
```
Create a cohort analysis that:
1. Groups customers by first purchase month
2. Tracks retention for 12 periods
3. Calculates revenue retention
4. Creates retention heatmap
5. Uses examples/cohort_analysis.py as reference
```
### Forecasting
```
Create a revenue forecasting analysis:
1. Prepare time series data
2. Fit trend model
3. Forecast next 12 months
4. Show confidence intervals
5. Use statistical_utils for calculations
```
### Segmentation
```
Create an advanced customer segmentation:
1. Calculate RFM scores
2. Apply clustering algorithm
3. Analyze segment characteristics
4. Create segment visualizations
5. Export segment data to Excel
```
## 🔍 Debugging with AI
### Fix Errors
```
I'm getting [error message] in [file].py.
Fix it using template best practices.
```
### Optimize Performance
```
Optimize [analysis].py for large datasets:
- Use efficient pandas operations
- Add progress indicators
- Consider data sampling if needed
```
### Improve Code Quality
```
Refactor [analysis].py to:
- Use more template utilities
- Follow template patterns better
- Add proper error handling
- Include logging
```
## 📝 Documentation Prompts
### Add Documentation
```
Add comprehensive docstrings to [analysis].py following
the template's documentation style.
```
### Create README
```
Create a README for [analysis].py explaining:
- What it does
- How to run it
- What outputs it generates
- Dependencies required
```
## 🎯 Best Practices for AI Interaction
1. **Be Specific:** Mention template files and utilities by name
2. **Reference Examples:** Point to existing examples when relevant
3. **Iterate:** Start simple, then add complexity
4. **Use Template Terms:** Mention "LTM", "config values", "template patterns"
5. **Ask for Validation:** Request data quality checks and validation
## Example Full Workflow
```
1. "Check my configuration using config_validator.py"
2. "Run data quality report on my data"
3. "Create a revenue trend analysis using template patterns"
4. "Add statistical significance testing to the analysis"
5. "Export results to Excel and generate PDF report"
6. "Create a cohort analysis similar to the example"
```
The AI will guide you through each step using template best practices.
---
**Last Updated:** January 2026
**For:** Cursor AI users working with sales_analysis_template

View File

@@ -0,0 +1,161 @@
# Common Analysis Patterns
## ⭐ RECOMMENDED: Use Utilities
**Always prefer `analysis_utils.py` and `config.py` over manual implementations:**
- Consistent formatting
- Fewer errors
- Easier maintenance
- Standardized output
## Standard Script Structure (Using Utilities)
**RECOMMENDED:** Use `analysis_utils.py` and `config.py` for consistency:
```python
# 1. IMPORTS
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from data_loader import load_sales_data, validate_data_structure
from validate_revenue import validate_revenue
from analysis_utils import (
get_ltm_period_config, get_annual_data, calculate_annual_metrics,
get_millions_formatter, setup_revenue_chart, save_chart,
format_currency, print_annual_summary, sort_mixed_years,
apply_exclusion_filters
)
from config import (
DATA_FILE, OUTPUT_DIR, CHART_SIZES, ensure_directories,
get_data_path, REVENUE_COLUMN, COMPANY_NAME
)
# 2. LOAD DATA (ALWAYS use data_loader)
df = load_sales_data(get_data_path())
# 3. VALIDATE DATA STRUCTURE
is_valid, msg = validate_data_structure(df)
if not is_valid:
print(f"ERROR: {msg}")
return
# 4. APPLY EXCLUSION FILTERS (if configured)
df = apply_exclusion_filters(df)
# 5. SETUP LTM (if doing annual comparisons and LTM is enabled)
ltm_start, ltm_end = get_ltm_period_config()
# 6. DATA PREPARATION
# Convert columns, filter data, create derived columns
# 7. ANALYSIS LOGIC
# Use calculate_annual_metrics() for annual aggregations
# 8. VISUALIZATIONS
# Use setup_revenue_chart() and save_chart() from analysis_utils
# 9. VALIDATION
validate_revenue(df, "Analysis Name")
```
## Annual Aggregation Pattern
**RECOMMENDED:** Use `calculate_annual_metrics()` from `analysis_utils.py`:
```python
from analysis_utils import calculate_annual_metrics, get_ltm_period_config
from config import REVENUE_COLUMN
ltm_start, ltm_end = get_ltm_period_config()
def calculate_metrics(year_data):
"""Calculate metrics for a single year"""
return {
'Revenue': year_data[REVENUE_COLUMN].sum(),
# ... other metrics
}
annual_df = calculate_annual_metrics(df, calculate_metrics, ltm_start, ltm_end)
```
## Chart Formatting Pattern
**ALWAYS use this pattern for revenue charts:**
```python
from analysis_utils import setup_revenue_chart, save_chart
from config import CHART_SIZES
fig, ax = plt.subplots(figsize=CHART_SIZES['medium'])
# Divide data by 1e6 BEFORE plotting
ax.plot(data / 1e6, ...)
# OR
ax.bar(x, values / 1e6, ...)
# Apply formatter automatically
setup_revenue_chart(ax)
# Save chart
save_chart(fig, 'chart_name.png')
plt.close()
```
## Mixed Type Handling
When dealing with year columns that may contain mixed int/str types (e.g., "2025 (LTM 9/2025)"):
```python
from analysis_utils import sort_mixed_years
# Sort DataFrame by year
df_sorted = sort_mixed_years(df, year_col='Year')
# For chart labels
years = df_sorted['Year'].tolist()
x_pos = range(len(years))
ax.set_xticks(x_pos)
ax.set_xticklabels(years, rotation=45, ha='right')
```
## Price Calculation Pattern
```python
from analysis_utils import calculate_price_per_unit
from config import QUANTITY_COLUMN, REVENUE_COLUMN
# Calculate average price per unit (excludes outliers automatically)
price_per_unit = calculate_price_per_unit(df, QUANTITY_COLUMN, REVENUE_COLUMN)
```
## Exclusion Filters Pattern
If you need to exclude specific segments (e.g., test accounts, business units):
```python
from analysis_utils import apply_exclusion_filters
# Configure in config.py:
# EXCLUSION_FILTERS = {
# 'enabled': True,
# 'exclude_by_column': 'Country',
# 'exclude_values': ['KVT', 'Test']
# }
df = apply_exclusion_filters(df)
```
## Using Configuration Values
**ALWAYS use config values instead of hardcoding:**
```python
from config import (
REVENUE_COLUMN, # Use this instead of 'USD' or 'Amount'
CUSTOMER_COLUMN, # Use this instead of 'Customer'
DATE_COLUMN, # Use this instead of 'InvoiceDate'
COMPANY_NAME, # Use this for titles
ANALYSIS_YEARS, # Use this for year filtering
CHART_SIZES, # Use this for figure sizes
)
```

View File

@@ -0,0 +1,111 @@
# Chart Formatting Rules
## ⭐ RECOMMENDED: Use analysis_utils.py
**Prefer utility functions:**
```python
from analysis_utils import setup_revenue_chart, save_chart, get_millions_formatter
from config import CHART_SIZES, OUTPUT_DIR
fig, ax = plt.subplots(figsize=CHART_SIZES['medium'])
ax.plot(data / 1e6, ...)
setup_revenue_chart(ax) # Applies formatter automatically
save_chart(fig, 'chart.png') # Saves to charts/ directory
```
## Revenue Charts: Millions Formatter
**ALWAYS use this pattern for revenue charts:**
```python
from analysis_utils import setup_revenue_chart
# Divide data by 1e6 BEFORE plotting
ax.plot(data / 1e6, ...)
# OR
ax.bar(x, values / 1e6, ...)
# Apply formatter automatically
setup_revenue_chart(ax)
```
**Manual approach (if not using utilities):**
```python
from matplotlib.ticker import FuncFormatter
def millions_formatter(x, pos):
return f'${x:.1f}m'
ax.plot(data / 1e6, ...)
ax.yaxis.set_major_formatter(FuncFormatter(millions_formatter))
ax.set_ylabel('Revenue (Millions USD)')
```
## Thousands Formatter (for smaller values)
```python
from analysis_utils import get_thousands_formatter
ax.xaxis.set_major_formatter(get_thousands_formatter())
ax.barh(x, values / 1e3, ...)
ax.set_xlabel('Value (Thousands USD)')
```
## Chart Labeling with LTM
**If LTM is enabled, ALWAYS include LTM notation:**
```python
from config import get_ltm_label, COMPANY_NAME
title = f'Annual Revenue Trend - {COMPANY_NAME}'
ltm_label = get_ltm_label()
if ltm_label:
title += f'\n({ltm_label})'
ax.set_title(title)
```
## Chart Sizes
**Use predefined sizes from config:**
```python
from config import CHART_SIZES
fig, ax = plt.subplots(figsize=CHART_SIZES['medium']) # (10, 6)
# Options: 'small' (6, 4), 'medium' (10, 6), 'large' (12, 8), 'wide' (14, 6)
```
## Common Mistakes
**WRONG:**
```python
ax.plot(revenue, ...) # Shows scientific notation (1e8)
```
**CORRECT:**
```python
ax.plot(revenue / 1e6, ...) # Divide first
setup_revenue_chart(ax) # Then format
```
## Saving Charts
**ALWAYS use save_chart() utility:**
```python
from analysis_utils import save_chart
save_chart(fig, 'chart_name.png') # Saves to charts/ with proper settings
plt.close() # Don't forget to close!
```
## Chart Styling
**Configure style in config.py:**
```python
# In config.py:
CHART_STYLE = 'seaborn-v0_8' # Options: 'default', 'ggplot', 'seaborn-v0_8'
# In your script:
import matplotlib.pyplot as plt
plt.style.use(CHART_STYLE) # Apply before creating figures
```

View File

@@ -0,0 +1,389 @@
# Code Quality & Best Practices
**Comprehensive guide for writing Cursor-optimized code in the sales analysis template.**
This document combines code quality standards and Cursor best practices to ensure AI assistants can effectively understand, modify, and extend the codebase.
## Type Hints
### When to Use Type Hints
Use type hints for:
- Function parameters
- Return values
- Class attributes
- Complex data structures
### Example Pattern
```python
from typing import Dict, List, Optional, Tuple
import pandas as pd
def calculate_annual_metrics(
df: pd.DataFrame,
metrics_func: callable,
ltm_start: Optional[pd.Period] = None,
ltm_end: Optional[pd.Period] = None
) -> pd.DataFrame:
"""
Calculate annual metrics for all years
Args:
df: DataFrame with 'Year' and 'YearMonth' columns
metrics_func: Function that takes a DataFrame and returns a dict of metrics
ltm_start: LTM start period (defaults to config if None)
ltm_end: LTM end period (defaults to config if None)
Returns:
DataFrame with 'Year' index and metric columns
"""
# Implementation
```
## Docstrings
### Docstring Format
All functions should use Google-style docstrings:
```python
def function_name(param1: type, param2: type) -> return_type:
"""
Brief description of what the function does.
More detailed explanation if needed. Can span multiple lines.
Explain any complex logic or important considerations.
Args:
param1: Description of param1
param2: Description of param2
Returns:
Description of return value
Raises:
ValueError: When and why this exception is raised
Example:
>>> result = function_name(value1, value2)
>>> print(result)
expected_output
"""
```
### Required Elements
- Brief one-line summary
- Detailed description (if needed)
- Args section (all parameters)
- Returns section (return value)
- Raises section (if exceptions raised)
- Example section (for complex functions)
## Variable Naming
### Conventions
- **Descriptive names:** `customer_revenue` not `cr`
- **Consistent prefixes:** `df_` for DataFrames, `annual_` for annual metrics
- **Clear abbreviations:** `ltm` for Last Twelve Months (well-known)
- **Avoid single letters:** Except for loop variables (`i`, `j`, `k`)
### Good Examples
```python
# Good
customer_revenue_by_year = df.groupby(['Customer', 'Year'])[REVENUE_COLUMN].sum()
annual_metrics_df = calculate_annual_metrics(df, metrics_func)
ltm_start_period, ltm_end_period = get_ltm_period_config()
# Bad
cr = df.groupby(['C', 'Y'])['R'].sum()
am = calc(df, mf)
s, e = get_ltm()
```
## Error Messages
### Structure
Error messages should be:
1. **Specific:** What exactly went wrong
2. **Actionable:** How to fix it
3. **Contextual:** Where it occurred
4. **Helpful:** Reference to documentation
### Good Error Messages
```python
# Good
raise ValueError(
f"Required column '{REVENUE_COLUMN}' not found in data.\n"
f"Available columns: {list(df.columns)}\n"
f"Please update config.py REVENUE_COLUMN to match your data.\n"
f"See .cursor/rules/data_loading.md for more help."
)
# Bad
raise ValueError("Column not found")
```
## Code Comments
### When to Comment
- Complex logic that isn't immediately obvious
- Business rules or domain-specific knowledge
- Workarounds or non-obvious solutions
- Performance considerations
- TODO items with context
### Comment Style
```python
# Good: Explains WHY, not WHAT
# Use LTM for most recent year to enable apples-to-apples comparison
# with full calendar years (avoids partial year bias)
if year == LTM_END_YEAR and LTM_ENABLED:
year_data = get_ltm_data(df, ltm_start, ltm_end)
# Bad: States the obvious
# Check if year equals LTM_END_YEAR
if year == LTM_END_YEAR:
```
## Function Design
### Single Responsibility
Each function should do one thing well:
```python
# Good: Single responsibility
def calculate_revenue(df: pd.DataFrame) -> float:
"""Calculate total revenue from DataFrame"""
return df[REVENUE_COLUMN].sum()
def calculate_customer_count(df: pd.DataFrame) -> int:
"""Calculate unique customer count"""
return df[CUSTOMER_COLUMN].nunique()
# Bad: Multiple responsibilities
def calculate_metrics(df):
"""Calculate revenue and customer count"""
revenue = df[REVENUE_COLUMN].sum()
customers = df[CUSTOMER_COLUMN].nunique()
return revenue, customers
```
### Function Length
- Keep functions under 50 lines when possible
- Break complex functions into smaller helper functions
- Use descriptive function names that explain purpose
## Import Organization
### Standard Order
1. Standard library imports
2. Third-party imports (pandas, numpy, matplotlib)
3. Local/template imports (data_loader, analysis_utils, config)
### Example
```python
# Standard library
from pathlib import Path
from typing import Dict, Optional
from datetime import datetime
# Third-party
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Template imports
from data_loader import load_sales_data, validate_data_structure
from analysis_utils import calculate_annual_metrics, setup_revenue_chart
from config import REVENUE_COLUMN, CHART_SIZES, COMPANY_NAME
```
## Constants and Configuration
### Use Config Values
```python
# Good: From config
from config import REVENUE_COLUMN, DATE_COLUMN
revenue = df[REVENUE_COLUMN].sum()
# Bad: Hardcoded
revenue = df['USD'].sum()
```
### Magic Numbers
Avoid magic numbers - use named constants or config:
```python
# Good: Named constant
MILLIONS_DIVISOR = 1e6
revenue_millions = revenue / MILLIONS_DIVISOR
# Or from config
CHART_DPI = 300 # In config.py
# Bad: Magic number
revenue_millions = revenue / 1000000
```
## Testing Considerations
### Testable Code
Write code that's easy to test:
- Pure functions when possible (no side effects)
- Dependency injection for external dependencies
- Clear inputs and outputs
### Example
```python
# Good: Testable
def calculate_metrics(year_data: pd.DataFrame, revenue_col: str) -> Dict:
"""Calculate metrics - easy to test with sample data"""
return {
'Revenue': year_data[revenue_col].sum(),
'Count': len(year_data)
}
# Harder to test: Depends on global config
def calculate_metrics(year_data):
"""Uses global REVENUE_COLUMN - harder to test"""
return {'Revenue': year_data[REVENUE_COLUMN].sum()}
```
## AI-Friendly Patterns
### Clear Intent
Code should clearly express intent:
```python
# Good: Intent is clear
customers_with_revenue = df[df[REVENUE_COLUMN] > 0][CUSTOMER_COLUMN].unique()
# Less clear: Requires understanding of pandas
customers_with_revenue = df.loc[df[REVENUE_COLUMN] > 0, CUSTOMER_COLUMN].unique()
```
### Explicit Over Implicit
```python
# Good: Explicit
if LTM_ENABLED and ltm_start is not None and ltm_end is not None:
use_ltm = True
else:
use_ltm = False
# Less clear: Implicit truthiness
use_ltm = LTM_ENABLED and ltm_start and ltm_end
```
## Documentation for AI
### Help AI Understand Context
Add comments that help AI understand business context:
```python
# LTM (Last Twelve Months) is used for the most recent partial year
# to enable fair comparison with full calendar years.
# Example: If latest data is through Sep 2025, use Oct 2024 - Sep 2025
if year == LTM_END_YEAR and LTM_ENABLED:
# Use 12-month rolling period instead of partial calendar year
year_data = get_ltm_data(df, ltm_start, ltm_end)
```
## Cursor-Specific Optimizations
### AI-Friendly Code Structure
Code should be structured so Cursor AI can:
1. **Understand intent** - Clear function names and comments
2. **Generate code** - Follow established patterns
3. **Fix errors** - Actionable error messages
4. **Extend functionality** - Modular, reusable functions
### Example: AI-Generated Code Pattern
When AI generates code, it should automatically:
```python
# AI recognizes this pattern and replicates it
def main():
# 1. Load data (AI knows to use data_loader)
df = load_sales_data(get_data_path())
# 2. Validate (AI knows to check structure)
is_valid, msg = validate_data_structure(df)
if not is_valid:
print(f"ERROR: {msg}")
return
# 3. Apply filters (AI knows exclusion filters)
df = apply_exclusion_filters(df)
# 4. Analysis logic (AI follows template patterns)
# ...
# 5. Create charts (AI knows formatting rules)
# ...
# 6. Validate revenue (AI knows to validate)
validate_revenue(df, ANALYSIS_NAME)
```
### Help AI Generate Better Code
Add context comments that help AI:
```python
# LTM (Last Twelve Months) is used for the most recent partial year
# to enable fair comparison with full calendar years.
# Example: If latest data is through Sep 2025, use Oct 2024 - Sep 2025
# This avoids partial-year bias in year-over-year comparisons.
if year == LTM_END_YEAR and LTM_ENABLED:
# Use 12-month rolling period instead of partial calendar year
year_data = get_ltm_data(df, ltm_start, ltm_end)
year_label = get_ltm_label() # Returns "2025 (LTM 9/2025)"
```
## Summary Checklist
For Cursor-optimized code:
- ✅ Comprehensive docstrings with examples
- ✅ Type hints on functions
- ✅ Descriptive variable names
- ✅ Clear comments for business logic
- ✅ Structured error messages
- ✅ Consistent code patterns
- ✅ Use config values (never hardcode)
- ✅ Follow template utilities
- ✅ Include validation steps
- ✅ Reference documentation
## Summary
Follow these standards to ensure:
1. AI can understand code structure
2. AI can modify code safely
3. AI can generate new code following patterns
4. Code is maintainable and readable
5. Errors are clear and actionable
6. Cursor AI can assist effectively
---
**Last Updated:** January 2026
**For:** Cursor AI optimization and human developers

View File

@@ -0,0 +1,109 @@
# Common Errors and Troubleshooting
**Quick reference for fixing common issues. For error handling patterns when writing code, see `error_handling.md`.**
## Data Loading Errors
### Error: "Data file not found"
**Cause:** DATA_FILE path in config.py is incorrect
**Fix:**
1. Check that your CSV file exists
2. Update `DATA_FILE` in config.py with correct filename
3. If file is in a subdirectory, set `DATA_DIR` in config.py
### Error: "Required column 'USD' not found"
**Cause:** Column name in data doesn't match config
**Fix:**
1. Check your CSV column names
2. Update `REVENUE_COLUMN` in config.py to match your data
3. Update other column mappings (DATE_COLUMN, CUSTOMER_COLUMN, etc.)
### Error: "All InvoiceDate values are NaN"
**Cause:** Date column parsing failed
**Fix:**
1. Check date format in your CSV
2. Add fallback date columns to `DATE_FALLBACK_COLUMNS` in config.py
3. Ensure at least one date column exists (Month, Year, etc.)
## Analysis Errors
### Error: "DataFrame is empty" after filtering
**Cause:** Date range or year filters too restrictive
**Fix:**
1. Check `MIN_YEAR` and `MAX_DATE` in config.py
2. Check `ANALYSIS_YEARS` includes years in your data
3. Verify date parsing worked (check data_loader output)
### Error: Charts show scientific notation (1e8)
**Cause:** Forgot to divide by 1e6 before plotting
**Fix:**
```python
# WRONG:
ax.plot(revenue, ...)
# CORRECT:
ax.plot(revenue / 1e6, ...)
setup_revenue_chart(ax)
```
### Error: "Year column has mixed types"
**Cause:** LTM year is string "2025 (LTM 9/2025)" while others are int
**Fix:**
```python
from analysis_utils import sort_mixed_years
df_sorted = sort_mixed_years(df, year_col='Year')
```
## Configuration Errors
### Error: LTM not working correctly
**Cause:** LTM configuration incorrect
**Fix:**
1. Check `LTM_ENABLED = True` in config.py
2. Verify `LTM_START_MONTH`, `LTM_START_YEAR`, `LTM_END_MONTH`, `LTM_END_YEAR`
3. Ensure dates are within your data range
### Error: Exclusion filters not working
**Cause:** Filter configuration incorrect
**Fix:**
1. Check `EXCLUSION_FILTERS['enabled'] = True`
2. Verify `exclude_by_column` matches a column in your data
3. Check `exclude_values` list is correct
## Import Errors
### Error: "No module named 'config'"
**Cause:** Running script from wrong directory
**Fix:**
1. Run scripts from template root directory
2. Or add template directory to Python path
### Error: "No module named 'data_loader'"
**Cause:** Missing import or wrong directory
**Fix:**
1. Ensure all template files are in the same directory
2. Check import statements match file names
## Best Practices to Avoid Errors
1. **Always use utilities:** Use `analysis_utils.py` functions instead of manual code
2. **Validate data:** Run `validate_data_structure()` after loading
3. **Check config:** Verify all column names match your data (use `config_validator.py`)
4. **Test incrementally:** Test data loading before running full analysis
5. **Read error messages:** They usually tell you exactly what's wrong
6. **Use Cursor AI:** Ask AI to fix errors - it knows template patterns
## Using Cursor AI to Fix Errors
When you encounter an error, ask Cursor AI:
```
"Fix this error: [paste error message]"
```
The AI will:
- ✅ Understand the error context
- ✅ Reference template patterns
- ✅ Suggest specific fixes
- ✅ Use template utilities correctly
**See also:** `.cursor/rules/error_handling.md` for how to write error messages that help AI fix issues.

View File

@@ -0,0 +1,69 @@
# Data Loading Rules
## CRITICAL: Always Use data_loader.py
**NEVER load data directly with `pd.read_csv()`. Always use:**
```python
from data_loader import load_sales_data
from config import get_data_path
df = load_sales_data(get_data_path())
```
## Why This Matters
The `data_loader.py` implements intelligent fallback logic to ensure 100% date coverage:
1. **Primary:** Parse primary date column (from config.DATE_COLUMN)
2. **Fallback 1:** Use fallback date columns if primary is missing (from config.DATE_FALLBACK_COLUMNS)
3. **Fallback 2:** Use Year column if both missing
4. **Result:** Maximum date coverage possible
## What data_loader.py Provides
- **Date Column:** Properly parsed datetime with fallback logic
- **Year:** Extracted year (100% coverage via fallback)
- **YearMonth:** Period format for monthly aggregations
- **Revenue Column:** Converted to numeric (from config.REVENUE_COLUMN)
## Column Configuration
Before using, configure column names in `config.py`:
- `REVENUE_COLUMN`: Your revenue/amount column name
- `DATE_COLUMN`: Primary date column name
- `DATE_FALLBACK_COLUMNS`: List of fallback date columns
- `CUSTOMER_COLUMN`: Customer/account column name
- Other columns as needed
## Common Mistakes
**WRONG:**
```python
df = pd.read_csv('sales_data.csv')
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
df = df.dropna(subset=['Date']) # May drop significant data!
```
**CORRECT:**
```python
from data_loader import load_sales_data
from config import get_data_path
df = load_sales_data(get_data_path()) # Uses fallback logic
```
## Data File Location
The data file path is configured in `config.py`:
- `DATA_FILE`: Filename (e.g., 'sales_data.csv')
- `DATA_DIR`: Optional subdirectory (defaults to current directory)
- Use `get_data_path()` to get the full path
## Validation
After loading, validate data structure:
```python
from data_loader import validate_data_structure
is_valid, msg = validate_data_structure(df)
if not is_valid:
print(f"ERROR: {msg}")
```

View File

@@ -0,0 +1,276 @@
# Error Handling Best Practices
This guide defines how to handle errors in a way that's helpful for both users and AI assistants.
## Error Message Structure
### Required Elements
Every error message should include:
1. **What went wrong** - Specific error description
2. **Where it occurred** - File/function context
3. **Why it happened** - Root cause explanation
4. **How to fix** - Actionable steps
5. **Reference** - Link to relevant documentation
### Template
```python
raise ErrorType(
f"[What] - [Specific description]\n"
f"\n"
f"Context: [Where/When this occurred]\n"
f"Reason: [Why this happened]\n"
f"\n"
f"Solution:\n"
f"1. [Step 1]\n"
f"2. [Step 2]\n"
f"\n"
f"For more help, see: [Documentation reference]"
)
```
## Common Error Patterns
### Data Loading Errors
```python
# Good: Comprehensive error message
if REVENUE_COLUMN not in df.columns:
available_cols = list(df.columns)[:10] # Show first 10
raise ValueError(
f"Required column '{REVENUE_COLUMN}' not found in data.\n"
f"\n"
f"Context: Loading data from {filepath}\n"
f"Available columns: {available_cols}\n"
f"\n"
f"Solution:\n"
f"1. Check your CSV file column names\n"
f"2. Update REVENUE_COLUMN in config.py to match your data\n"
f"3. Run: python config_validator.py to validate configuration\n"
f"\n"
f"For more help, see: .cursor/rules/data_loading.md"
)
# Bad: Vague error
if REVENUE_COLUMN not in df.columns:
raise ValueError("Column not found")
```
### Configuration Errors
```python
# Good: Actionable error
if LTM_ENABLED and (LTM_START is None or LTM_END is None):
raise ValueError(
f"LTM configuration error: LTM_ENABLED is True but LTM period is not set.\n"
f"\n"
f"Context: Configuration in config.py\n"
f"Current values: LTM_ENABLED={LTM_ENABLED}, LTM_START={LTM_START}, LTM_END={LTM_END}\n"
f"\n"
f"Solution:\n"
f"1. Set LTM_START_MONTH, LTM_START_YEAR, LTM_END_MONTH, LTM_END_YEAR in config.py\n"
f"2. Or set LTM_ENABLED = False if you don't need LTM\n"
f"3. Run: python config_validator.py to check configuration\n"
f"\n"
f"For more help, see: .cursor/rules/ltm_methodology.md"
)
```
### Data Quality Errors
```python
# Good: Helpful data quality error
if date_coverage < 0.5: # Less than 50% coverage
raise ValueError(
f"Data quality issue: Only {date_coverage:.1%} of rows have valid dates.\n"
f"\n"
f"Context: Date parsing in data_loader.py\n"
f"Rows with dates: {date_count:,} / {total_rows:,}\n"
f"\n"
f"Solution:\n"
f"1. Check date format in your CSV file\n"
f"2. Add fallback date columns to DATE_FALLBACK_COLUMNS in config.py\n"
f"3. Ensure at least one date column (Month, Year) exists\n"
f"4. Run: python data_quality.py to analyze data quality\n"
f"\n"
f"For more help, see: .cursor/rules/data_loading.md"
)
```
## Error Handling Patterns
### Try-Except with Context
```python
# Good: Provides context and recovery options
try:
df = load_sales_data(get_data_path())
except FileNotFoundError as e:
error_msg = (
f"Data file not found: {e}\n"
f"\n"
f"Context: Attempting to load data for analysis\n"
f"Expected file: {get_data_path()}\n"
f"\n"
f"Solution:\n"
f"1. Check that your CSV file exists at the expected location\n"
f"2. Update DATA_FILE in config.py with correct filename\n"
f"3. Or update DATA_DIR if file is in a subdirectory\n"
f"4. Run: python setup_wizard.py to reconfigure\n"
f"\n"
f"For more help, see: .cursor/rules/common_errors.md"
)
raise FileNotFoundError(error_msg) from e
```
### Validation with Helpful Messages
```python
# Good: Validates and provides specific guidance
def validate_data_structure(df: pd.DataFrame) -> Tuple[bool, str]:
"""
Validate DataFrame has required structure
Returns:
Tuple[bool, str]: (is_valid, error_message)
If is_valid is False, error_message contains actionable guidance
"""
errors = []
if REVENUE_COLUMN not in df.columns:
errors.append(
f"Missing required column '{REVENUE_COLUMN}'. "
f"Update REVENUE_COLUMN in config.py to match your data."
)
if DATE_COLUMN not in df.columns:
errors.append(
f"Missing required column '{DATE_COLUMN}'. "
f"Update DATE_COLUMN in config.py or add fallback columns."
)
if len(df) == 0:
errors.append(
f"DataFrame is empty. Check date filters (MIN_YEAR, MAX_DATE) in config.py."
)
if errors:
error_msg = "Data validation failed:\n" + "\n".join(f" - {e}" for e in errors)
error_msg += "\n\nRun: python config_validator.py for detailed validation"
return False, error_msg
return True, "OK"
```
## Warning Messages
### When to Use Warnings
Use warnings (not errors) for:
- Non-critical data quality issues
- Optional features that aren't configured
- Deprecated functionality
- Performance considerations
### Warning Format
```python
import warnings
# Good: Informative warning
if date_coverage < 0.9: # Less than 90% but not critical
warnings.warn(
f"Date coverage is {date_coverage:.1%} ({missing_count:,} rows missing dates).\n"
f"Consider adding fallback date columns to improve coverage.\n"
f"See .cursor/rules/data_loading.md for details.",
UserWarning
)
```
## Logging Errors
### Use Structured Logging
```python
from logger_config import get_logger
logger = get_logger('analysis_name')
try:
df = load_sales_data(get_data_path())
except Exception as e:
logger.error(
f"Failed to load data: {e}",
exc_info=True, # Include stack trace
extra={
'file_path': str(get_data_path()),
'config_file': 'config.py',
'suggestion': 'Run config_validator.py to check configuration'
}
)
raise
```
## AI-Friendly Error Messages
### Help AI Understand and Fix
Error messages should help AI assistants:
1. Understand what went wrong
2. Know where to look for fixes
3. Suggest specific solutions
4. Reference relevant documentation
```python
# Good: AI can parse and act on this
if column not in df.columns:
raise ValueError(
f"Column '{column}' not found.\n"
f"Available: {list(df.columns)}\n"
f"Fix: Update {column}_COLUMN in config.py\n"
f"See: .cursor/rules/data_loading.md"
)
# Bad: AI has no context
if column not in df.columns:
raise ValueError("Not found")
```
## Error Recovery
### Provide Recovery Options
```python
# Good: Offers recovery path
def load_sales_data(filepath=None):
try:
df = pd.read_csv(filepath)
except FileNotFoundError:
# Suggest alternatives
suggestions = [
f"1. Check file path: {filepath}",
f"2. Update DATA_FILE in config.py",
f"3. Run: python setup_wizard.py",
f"4. Generate sample data: python generate_sample_data.py"
]
raise FileNotFoundError(
f"Data file not found: {filepath}\n"
f"\n"
f"Options:\n" + "\n".join(suggestions)
)
```
## Summary
Good error handling:
- ✅ Specific and actionable
- ✅ Provides context
- ✅ Suggests solutions
- ✅ References documentation
- ✅ Helps both users and AI assistants
---
**Last Updated:** January 2026
**For:** Error handling in sales_analysis_template

View File

@@ -0,0 +1,89 @@
# LTM (Last Twelve Months) Methodology Rules
## ⭐ RECOMMENDED: Use analysis_utils.py
**Prefer utility functions:**
```python
from analysis_utils import get_ltm_period_config, get_annual_data, calculate_annual_metrics
from config import get_ltm_period, get_ltm_label
ltm_start, ltm_end = get_ltm_period_config()
year_data, year_label = get_annual_data(df, 2025, ltm_start, ltm_end)
```
## What is LTM?
**LTM (Last Twelve Months)** = Rolling 12-month period for the most recent partial year
- **Purpose:** Apples-to-apples comparison with full calendar years
- **Example:** If latest data is through September 2025, use Oct 2024 - Sep 2025 (12 months)
## When to Use LTM
- **Full calendar years (2021-2024):** Use complete year data
- **Most recent partial year (2025):** Use LTM if you only have partial year data
- **Complete years only:** Disable LTM in config if all years are complete
## Configuration
**Configure in config.py:**
```python
LTM_ENABLED = True # Set to False if all years are complete
LTM_START_MONTH = 10 # Month number (1-12)
LTM_START_YEAR = 2024
LTM_END_MONTH = 9
LTM_END_YEAR = 2025
```
## Implementation Pattern
```python
from analysis_utils import get_ltm_period_config, get_annual_data
ltm_start, ltm_end = get_ltm_period_config()
for year in sorted(df['Year'].unique()):
year_data, year_label = get_annual_data(df, year, ltm_start, ltm_end)
# year_label will be "2025 (LTM 9/2025)" for LTM year, or "2025" for regular year
```
## Labeling Requirements
**ALWAYS label LTM year with notation in:**
- Chart titles
- Chart x-axis labels
- Table headers
- Print statements
- Report text
**Example:**
```python
from config import get_ltm_label
ltm_label = get_ltm_label() # Returns "2025 (LTM 9/2025)" or None
if ltm_label:
title = f'Annual Revenue Trend\n({ltm_label})'
```
## Common Mistakes
**WRONG:**
```python
year_2025_data = df[df['Year'] == 2025] # Uses partial year (not comparable)
```
**CORRECT:**
```python
from analysis_utils import get_annual_data
ltm_start, ltm_end = get_ltm_period_config()
year_2025_data, year_label = get_annual_data(df, 2025, ltm_start, ltm_end)
```
## Disabling LTM
If all years in your analysis are complete calendar years:
```python
# In config.py:
LTM_ENABLED = False
```
Then all years will be treated as full calendar years.

203
EXAMPLES.md Normal file
View File

@@ -0,0 +1,203 @@
# Example Analysis Scripts
This directory contains working example analysis scripts that demonstrate how to use the sales analysis template framework.
## Available Examples
### 1. Annual Revenue Trend (`examples/annual_revenue_trend.py`)
**Purpose:** Simple annual revenue analysis with LTM support
**What it demonstrates:**
- Loading data using `data_loader`
- Calculating annual metrics with LTM
- Creating a revenue trend chart
- Following template best practices
**Usage:**
```bash
python examples/annual_revenue_trend.py
```
**Output:**
- Chart: `charts/annual_revenue_trend.png`
- Console output with annual revenue summary
---
### 2. Customer Segmentation (`examples/customer_segmentation.py`)
**Purpose:** Customer segmentation using RFM (Recency, Frequency, Monetary) methodology
**What it demonstrates:**
- Customer-level aggregation
- RFM scoring and segmentation
- Segment analysis and visualization
- Multiple chart generation
**Usage:**
```bash
python examples/customer_segmentation.py
```
**Output:**
- Chart: `charts/customer_segmentation.png`
- Console output with segment summary
**Segments:**
- **Champions:** High recency, frequency, and monetary value
- **Loyal Customers:** Regular customers with good value
- **At Risk:** Recent but declining frequency
- **Hibernating:** Low recency, may need reactivation
- **Potential Loyalists:** Good recency and frequency, lower value
- **Need Attention:** Mixed signals, need engagement
---
### 3. Product Performance (`examples/product_performance.py`)
**Purpose:** Product mix and performance analysis
**What it demonstrates:**
- Product-level aggregation
- Product performance metrics
- Top products identification
- Product mix visualization
**Usage:**
```bash
python examples/product_performance.py
```
**Output:**
- Chart: `charts/product_performance.png`
- Console output with top products summary
---
## How to Use Examples
### Step 1: Configure Template
Before running examples, ensure your template is configured:
```bash
python setup_wizard.py
```
Or manually update `config.py` with your data file and column mappings.
### Step 2: Prepare Data
Place your sales data CSV file in the template directory, or update `DATA_DIR` in `config.py`.
Alternatively, generate sample data for testing:
```bash
python generate_sample_data.py
```
### Step 3: Run Example
```bash
python examples/annual_revenue_trend.py
```
### Step 4: Customize
Copy an example script and modify it for your needs:
```bash
cp examples/annual_revenue_trend.py my_analysis.py
# Edit my_analysis.py
python my_analysis.py
```
---
## Example Patterns
### Pattern 1: Simple Annual Analysis
```python
from data_loader import load_sales_data
from analysis_utils import calculate_annual_metrics, get_ltm_period_config
from config import REVENUE_COLUMN
df = load_sales_data(get_data_path())
ltm_start, ltm_end = get_ltm_period_config()
def calculate_metrics(year_data):
return {'Revenue': year_data[REVENUE_COLUMN].sum()}
annual_df = calculate_annual_metrics(df, calculate_metrics, ltm_start, ltm_end)
```
### Pattern 2: Customer-Level Analysis
```python
from config import CUSTOMER_COLUMN, REVENUE_COLUMN
customer_metrics = df.groupby(CUSTOMER_COLUMN).agg({
REVENUE_COLUMN: 'sum',
DATE_COLUMN: 'count'
}).reset_index()
```
### Pattern 3: Product-Level Analysis
```python
from config import ITEM_COLUMN, REVENUE_COLUMN
product_metrics = df.groupby(ITEM_COLUMN)[REVENUE_COLUMN].sum().sort_values(ascending=False)
top_10 = product_metrics.head(10)
```
---
## Learning Path
1. **Start with:** `annual_revenue_trend.py` - Simplest example
2. **Then try:** `product_performance.py` - More complex aggregation
3. **Advanced:** `customer_segmentation.py` - Multi-step analysis with custom logic
---
## Troubleshooting
**"Module not found" errors:**
- Ensure you're running from the template root directory
- Check that all template files are present
**"Data file not found" errors:**
- Run `setup_wizard.py` to configure data file path
- Or update `DATA_FILE` in `config.py`
**"Column not found" errors:**
- Update column mappings in `config.py`
- Run `python config_validator.py` to check configuration
---
## Advanced Examples
For more sophisticated analyses, see:
- `.cursor/rules/advanced_analysis_patterns.md` - Advanced analysis patterns
- `.cursor/rules/ai_assistant_guide.md` - How to use Cursor AI effectively
## Next Steps
After running examples:
1. Review the generated charts
2. Examine the code to understand patterns
3. Copy an example and customize for your analysis
4. Check `.cursor/rules/analysis_patterns.md` for more patterns
5. Read `.cursor/rules/advanced_analysis_patterns.md` for advanced techniques
6. Use Cursor AI with prompts from `ai_assistant_guide.md`
7. Read `README.md` for comprehensive documentation
---
**Last Updated:** January 2026
**Template Version:** 1.0

175
QUICK_START.md Normal file
View File

@@ -0,0 +1,175 @@
# Quick Start Guide
**For Cursor Users:** This template is optimized for Cursor AI. Just ask: *"Create a revenue analysis using the template"* and the AI will handle everything.
## 🚀 Get Started in 5 Minutes
### Step 1: Install Dependencies
```bash
pip install -r requirements.txt
```
### Step 2: Run Setup Wizard
```bash
python setup_wizard.py
```
The wizard will ask you:
- Company name
- Data file location
- Column names in your CSV
- Date range
- LTM configuration (if needed)
### Step 3: Test Data Loading
```bash
python -c "from data_loader import load_sales_data; from config import get_data_path; df = load_sales_data(get_data_path()); print(f'✓ Loaded {len(df):,} rows')"
```
### Step 4: Run Example Analysis (Recommended)
```bash
# Try an example first to see how it works
python examples/annual_revenue_trend.py
```
### Step 5: Create Your First Analysis
```bash
cp analysis_template.py my_analysis.py
# Or copy an example
cp examples/annual_revenue_trend.py my_analysis.py
# Edit my_analysis.py
python my_analysis.py
```
---
## 📋 Essential Configuration Checklist
Before running analyses, verify in `config.py`:
- [ ] `COMPANY_NAME` - Your company name
- [ ] `DATA_FILE` - Your CSV filename
- [ ] `REVENUE_COLUMN` - Your revenue column name
- [ ] `DATE_COLUMN` - Your date column name
- [ ] `CUSTOMER_COLUMN` - Your customer column name
- [ ] `ANALYSIS_YEARS` - Years to include
- [ ] `MIN_YEAR` and `MAX_DATE` - Date range
- [ ] `LTM_ENABLED` - Set to False if all years complete
---
## 💡 Common Patterns
### Load Data
```python
from data_loader import load_sales_data
from config import get_data_path
df = load_sales_data(get_data_path())
```
### Calculate Annual Metrics
```python
from analysis_utils import calculate_annual_metrics, get_ltm_period_config
from config import REVENUE_COLUMN
ltm_start, ltm_end = get_ltm_period_config()
def calculate_metrics(year_data):
return {'Revenue': year_data[REVENUE_COLUMN].sum()}
annual_df = calculate_annual_metrics(df, calculate_metrics, ltm_start, ltm_end)
```
### Create Chart
```python
from analysis_utils import setup_revenue_chart, save_chart
from config import CHART_SIZES
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=CHART_SIZES['medium'])
ax.plot(data / 1e6, ...) # Divide by 1e6!
setup_revenue_chart(ax)
save_chart(fig, 'chart.png')
plt.close()
```
---
## ⚠️ Critical Rules
1. **ALWAYS use `data_loader.py`** - Never `pd.read_csv()` directly
2. **ALWAYS divide by 1e6** before plotting revenue
3. **ALWAYS use `setup_revenue_chart()`** for revenue charts
4. **ALWAYS use config values** - Never hardcode column names
5. **ALWAYS validate data** after loading
## 💡 New Utilities
### Data Quality Check
```bash
python -c "from data_quality import generate_data_quality_report, print_data_quality_report; from data_loader import load_sales_data; from config import get_data_path; df = load_sales_data(get_data_path()); report = generate_data_quality_report(df); print_data_quality_report(report)"
```
### Configuration Validation
```bash
python config_validator.py
```
### Export Results
```python
from export_utils import export_to_excel
export_to_excel(df, 'results.xlsx')
```
### Generate Sample Data
```bash
python generate_sample_data.py
```
---
## 🐛 Quick Troubleshooting
**"Data file not found"**
→ Check `DATA_FILE` in config.py
**"Column not found"**
→ Update column mappings in config.py
**Charts show 1e8 (scientific notation)**
→ Divide by 1e6 before plotting: `ax.plot(data / 1e6, ...)`
**"DataFrame is empty"**
→ Check `MIN_YEAR`, `MAX_DATE`, and `ANALYSIS_YEARS` in config.py
---
## 🎯 Using Cursor AI (Recommended)
This template is optimized for Cursor. Instead of manual setup, just ask:
```
"Create a revenue trend analysis using template patterns"
```
The AI will:
- ✅ Use all template utilities automatically
- ✅ Follow best practices
- ✅ Include proper validation
- ✅ Generate production-ready code
**See:** `.cursor/rules/ai_assistant_guide.md` for complete prompt library
## 📚 Next Steps
- **Run examples:** Try `examples/annual_revenue_trend.py` to see it in action
- **Check data quality:** Run `python data_quality.py` to analyze your data
- **Validate config:** Run `python config_validator.py` to check configuration
- **Read documentation:** See `README.md` for comprehensive guide
- **Review patterns:** Check `.cursor/rules/` for detailed patterns
- **See examples:** Check `EXAMPLES.md` for example script guide
---
**Need help?** Check `.cursor/rules/common_errors.md` for detailed troubleshooting.

589
README.md Normal file
View File

@@ -0,0 +1,589 @@
# Sales Analysis Template
**A best-in-class, reusable template for sales invoice detail analysis**
**Optimized for Cursor AI** - Just ask the AI to create analyses and it handles everything automatically.
This template provides a complete framework for analyzing sales data from any company. It's designed to be:
- **Flexible:** Works with different column names, date formats, and data structures
- **Automated:** Interactive setup wizard configures everything for your company
- **AI-Optimized:** Fully optimized for Cursor - AI knows all patterns and generates code automatically
- **Production-Ready:** Includes error handling, validation, and best practices
---
## 🚀 Quick Start
### 1. Setup (Automated)
Run the interactive setup wizard:
```bash
python setup_wizard.py
```
The wizard will ask you about:
- Company name and analysis date
- Data file location
- Column names in your CSV
- Date range and LTM configuration
- Exclusion filters (if needed)
### 2. Manual Setup (Alternative)
If you prefer to configure manually:
1. **Update `config.py`** with your company-specific settings:
- `COMPANY_NAME`: Your company name
- `DATA_FILE`: Your CSV filename
- `REVENUE_COLUMN`: Your revenue/amount column name
- `DATE_COLUMN`: Your primary date column
- Column mappings for Customer, Item, etc.
- Date range and LTM settings
2. **Place your data file** in the template directory (or update `DATA_DIR` in config.py)
### 3. Test Data Loading
Verify your configuration works:
```bash
python -c "from data_loader import load_sales_data; from config import get_data_path; df = load_sales_data(get_data_path()); print(f'Loaded {len(df):,} rows')"
```
### 4. Create Your First Analysis
Copy the template and customize:
```bash
cp analysis_template.py my_first_analysis.py
# Edit my_first_analysis.py with your analysis logic
python my_first_analysis.py
```
---
## 📁 Project Structure
```
sales_analysis_template/
├── README.md # This file
├── QUICK_START.md # Quick start guide
├── TEMPLATE_OVERVIEW.md # High-level overview
├── TEMPLATE_SUMMARY.md # Comprehensive template summary
├── EXAMPLES.md # Example scripts guide
├── SETUP_CHECKLIST.md # Setup verification checklist
├── requirements.txt # Python dependencies
├── setup_wizard.py # Interactive setup wizard
├── config.py # ⭐ Configuration (customize for your company)
├── config_validator.py # Configuration validation utility
├── data_loader.py # ⭐ Data loading with fallback logic
├── data_quality.py # Data quality reporting
├── data_processing.py # Data transformation utilities
├── analysis_utils.py # ⭐ Common utilities (formatters, LTM, helpers)
├── statistical_utils.py # Statistical analysis utilities
├── validate_revenue.py # Revenue validation utility
├── export_utils.py # Export to CSV/Excel
├── report_generator.py # PDF report generation
├── logger_config.py # Logging configuration
├── analysis_template.py # Template for creating new analyses
├── run_all_analyses.py # Batch runner for all scripts
├── generate_sample_data.py # Generate sample data for testing
├── examples/ # Example analysis scripts
│ ├── annual_revenue_trend.py # Simple annual revenue analysis
│ ├── customer_segmentation.py # RFM customer segmentation
│ ├── cohort_analysis.py # Customer cohort analysis
│ └── product_performance.py # Product performance analysis
├── tests/ # Unit tests
│ ├── test_data_loader.py # Data loader tests
│ ├── test_analysis_utils.py # Analysis utils tests
│ └── test_config_validator.py # Config validator tests
└── .cursor/
└── rules/ # Cursor IDE rules (auto-loaded)
├── ai_assistant_guide.md # Complete AI assistant guide
├── advanced_analysis_patterns.md # Advanced techniques
├── analysis_patterns.md # Common analysis patterns
├── chart_formatting.md # Chart formatting rules
├── code_quality.md # Code quality standards
├── common_errors.md # Error troubleshooting
├── data_loading.md # Data loading patterns
├── error_handling.md # Error handling patterns
└── ltm_methodology.md # LTM methodology
```
---
## 🔧 Configuration Guide
### Required Configuration
**In `config.py`, you MUST configure:**
1. **Company Information:**
```python
COMPANY_NAME = "Your Company Name"
```
2. **Data File:**
```python
DATA_FILE = 'your_sales_data.csv'
```
3. **Column Mappings:**
```python
REVENUE_COLUMN = 'USD' # Your revenue column name
DATE_COLUMN = 'InvoiceDate' # Your date column name
CUSTOMER_COLUMN = 'Customer' # Your customer column name
```
4. **Date Range:**
```python
MIN_YEAR = 2021
MAX_DATE = pd.Timestamp('2025-09-30')
ANALYSIS_YEARS = [2021, 2022, 2023, 2024, 2025]
```
### Optional Configuration
**LTM (Last Twelve Months):**
```python
LTM_ENABLED = True # Set to False if all years are complete
LTM_START_MONTH = 10
LTM_START_YEAR = 2024
LTM_END_MONTH = 9
LTM_END_YEAR = 2025
```
**Exclusion Filters:**
```python
EXCLUSION_FILTERS = {
'enabled': True,
'exclude_by_column': 'Country',
'exclude_values': ['Test', 'KVT']
}
```
**See `config.py` for all available options and detailed comments.**
---
## 📊 Data Requirements
### Required Columns
Your CSV file must have:
- **Revenue column:** A numeric column with sales amounts (configured as `REVENUE_COLUMN`)
- **Date column:** At least one date column (configured as `DATE_COLUMN`)
### Recommended Columns
For full analysis capabilities, include:
- **Customer/Account:** For customer segmentation and analysis
- **Item/Product:** For product analysis
- **Quantity:** For price calculations
- **Geographic:** Region, Country for geographic analysis
- **Segments:** Technology, EndMarket, ProductGroup for segmentation
### Date Column Fallback
The data loader supports fallback logic:
1. **Primary:** Uses `DATE_COLUMN` (e.g., InvoiceDate)
2. **Fallback 1:** Uses columns in `DATE_FALLBACK_COLUMNS` (e.g., Month, Year)
3. **Fallback 2:** Constructs from Year column if available
This ensures maximum date coverage even if some rows have missing dates.
---
## 💻 Creating Analysis Scripts
### Using the Template
1. **Copy the template:**
```bash
cp analysis_template.py my_analysis.py
```
2. **Update configuration:**
```python
ANALYSIS_NAME = "My Analysis"
DESCRIPTION = "Description of what this analysis does"
```
3. **Implement your logic:**
- Use `calculate_annual_metrics()` for annual aggregations
- Use `setup_revenue_chart()` and `save_chart()` for visualizations
- Follow patterns from `.cursor/rules/analysis_patterns.md`
4. **Run your analysis:**
```bash
python my_analysis.py
```
### Standard Pattern
```python
from data_loader import load_sales_data, validate_data_structure
from analysis_utils import (
get_ltm_period_config, calculate_annual_metrics,
setup_revenue_chart, save_chart, apply_exclusion_filters
)
from config import get_data_path, REVENUE_COLUMN, CHART_SIZES
# Load and validate
df = load_sales_data(get_data_path())
is_valid, msg = validate_data_structure(df)
if not is_valid:
print(f"ERROR: {msg}")
return
# Apply filters
df = apply_exclusion_filters(df)
# Calculate metrics
ltm_start, ltm_end = get_ltm_period_config()
annual_df = calculate_annual_metrics(df, calculate_metrics, ltm_start, ltm_end)
# Create charts
fig, ax = plt.subplots(figsize=CHART_SIZES['medium'])
ax.plot(data / 1e6, ...)
setup_revenue_chart(ax)
save_chart(fig, 'chart.png')
```
---
## 🎯 Key Features
### 1. Flexible Data Loading
- Handles different column names via configuration
- Fallback logic for date parsing (100% coverage)
- Automatic validation and error reporting
### 2. LTM (Last Twelve Months) Support
- Automatic LTM calculation for partial years
- Apples-to-apples comparison with full calendar years
- Configurable LTM periods
### 3. Standardized Chart Formatting
- Automatic millions formatter for revenue charts
- Consistent styling and sizing
- Professional output ready for reports
- Optional interactive charts with Plotly
### 4. Exclusion Filters
- Easy configuration for excluding segments
- Useful for excluding test accounts, business units, etc.
### 5. Revenue Validation
- Automatic validation after each analysis
- Ensures data loading is working correctly
- Optional validation against expected values
### 6. Example Scripts
- Working examples for common analyses
- Demonstrates best practices
- Easy to customize and extend
### 7. Data Export
- Export results to CSV and Excel
- Formatted summary tables
- Multiple sheet support
### 8. Data Quality Reporting
- Comprehensive data quality checks
- Missing value analysis
- Outlier detection
- Data profiling
### 9. Configuration Validation
- Early error detection
- Validates column mappings
- Checks date ranges and LTM configuration
### 10. Statistical Utilities
- Year-over-year growth calculations
- CAGR (Compound Annual Growth Rate)
- Correlation analysis
- Statistical significance testing
### 11. Report Generation
- Combine multiple charts into PDF reports
- Professional formatting
- Summary tables and metadata
### 12. Logging Infrastructure
- Structured logging with file and console output
- Analysis execution tracking
- Configurable log levels
---
## 📚 Documentation
### For AI Agents (Cursor IDE)
The `.cursor/rules/` directory contains comprehensive rules that are automatically loaded by Cursor:
- **`ai_assistant_guide.md`:** Complete guide with ready-to-use prompts
- **`advanced_analysis_patterns.md`:** Advanced techniques (cohort, PVM, forecasting, etc.)
- **`analysis_patterns.md`:** Standard patterns for creating analyses
- **`data_loading.md`:** Always use `data_loader.py`, never `pd.read_csv()` directly
- **`chart_formatting.md`:** How to format charts correctly
- **`ltm_methodology.md`:** LTM implementation and usage
- **`common_errors.md`:** Troubleshooting guide
- **`code_quality.md`:** Code quality standards and Cursor best practices
- **`error_handling.md`:** How to write AI-friendly error messages
### For Developers
- **`config.py`:** Heavily commented with all configuration options
- **`analysis_template.py`:** Template with examples and comments
- **`analysis_utils.py`:** Well-documented utility functions
---
## 🔍 Common Analysis Types
This template supports all standard sales analyses:
### Revenue Analyses
- Annual revenue trends
- Monthly revenue analysis
- Revenue by segment/product/geography
### Customer Analyses
- Customer segmentation (RFM)
- Customer concentration
- Churn analysis
- Cohort analysis
- Customer lifetime value (CLV)
### Product Analyses
- Product performance
- Product lifecycle
- BCG matrix
- Market basket analysis
### Financial Analyses
- Price elasticity
- Contribution margin
- Price vs volume analysis
### Advanced Analyses
- Seasonality analysis
- Time series forecasting
- Customer churn prediction
**See `examples/` directory for working example scripts, or the original Dukane project for 24+ production analysis scripts.**
---
## 🛠️ Dependencies
Install required packages:
```bash
pip install -r requirements.txt
```
**Core dependencies:**
- `pandas` - Data manipulation
- `numpy` - Numerical operations
- `matplotlib` - Charting
- `seaborn` - Enhanced visualizations
**Optional dependencies** (uncomment in requirements.txt if needed):
- `openpyxl` - Excel export (export_utils.py)
- `plotly` - Interactive charts (analysis_utils.py)
- `reportlab` - PDF reports (report_generator.py)
- `scipy` - Statistical analysis (statistical_utils.py)
- `pytest` - Unit testing
- `pmdarima` - Time series forecasting
- `mlxtend` - Market basket analysis
- `scikit-learn` - Machine learning
---
## ⚠️ Important Notes
### Always Use Utilities
**✅ DO:**
```python
from data_loader import load_sales_data
from analysis_utils import setup_revenue_chart, save_chart
from config import REVENUE_COLUMN, CHART_SIZES
```
**❌ DON'T:**
```python
df = pd.read_csv('data.csv') # Use data_loader instead
ax.plot(revenue, ...) # Divide by 1e6 first, use setup_revenue_chart()
```
### Chart Formatting
**ALWAYS divide revenue by 1e6 before plotting:**
```python
ax.plot(revenue / 1e6, ...) # Convert to millions
setup_revenue_chart(ax) # Apply formatter
```
### LTM Labeling
**ALWAYS label LTM years correctly:**
```python
from config import get_ltm_label
ltm_label = get_ltm_label() # Returns "2025 (LTM 9/2025)" or None
if ltm_label:
title += f'\n({ltm_label})'
```
---
## 🐛 Troubleshooting
### Data Loading Issues
**Problem:** "Data file not found"
- **Solution:** Check `DATA_FILE` path in config.py
- **Solution:** Ensure file is in template directory or update `DATA_DIR`
**Problem:** "Required column 'USD' not found"
- **Solution:** Update `REVENUE_COLUMN` in config.py to match your CSV
- **Solution:** Check all column mappings in config.py
**Problem:** "All dates are NaN"
- **Solution:** Add fallback date columns to `DATE_FALLBACK_COLUMNS`
- **Solution:** Check date format in your CSV
### Analysis Issues
**Problem:** Charts show scientific notation (1e8)
- **Solution:** Divide by 1e6 before plotting: `ax.plot(data / 1e6, ...)`
- **Solution:** Use `setup_revenue_chart(ax)` to apply formatter
**Problem:** "DataFrame is empty" after filtering
- **Solution:** Check `MIN_YEAR` and `MAX_DATE` in config.py
- **Solution:** Verify `ANALYSIS_YEARS` includes years in your data
**See `.cursor/rules/common_errors.md` for more troubleshooting help.**
---
## 📝 Example Workflow
### Complete Analysis Workflow
1. **Setup:**
```bash
python setup_wizard.py
```
2. **Test data loading:**
```bash
python -c "from data_loader import load_sales_data; from config import get_data_path; df = load_sales_data(get_data_path()); print(f'✓ Loaded {len(df):,} rows')"
```
3. **Create analysis:**
```bash
cp analysis_template.py revenue_analysis.py
# Edit revenue_analysis.py
```
4. **Run analysis:**
```bash
python revenue_analysis.py
```
5. **Add to batch runner:**
```python
# In run_all_analyses.py:
ANALYSIS_SCRIPTS = [
'revenue_analysis.py',
# ... other analyses
]
```
6. **Run all analyses:**
```bash
python run_all_analyses.py
```
---
## 🤝 Best Practices
1. **Always validate data** after loading:
```python
is_valid, msg = validate_data_structure(df)
```
2. **Use configuration values** instead of hardcoding:
```python
from config import REVENUE_COLUMN # ✅
revenue = df['USD'].sum() # ❌ Hardcoded
```
3. **Apply exclusion filters** if configured:
```python
df = apply_exclusion_filters(df)
```
4. **Validate revenue** at end of each analysis:
```python
validate_revenue(df, "Analysis Name")
```
5. **Use utility functions** for consistency:
```python
from analysis_utils import calculate_annual_metrics, setup_revenue_chart
```
---
## 📄 License
This template is provided as-is for use in sales analysis projects.
---
## 🙏 Acknowledgments
This template is based on best practices developed during the Dukane Corporation sales analysis project, which included 24+ production-ready analysis scripts and comprehensive documentation.
---
## 📞 Support
For questions or issues:
1. Check `.cursor/rules/` for detailed patterns and troubleshooting
2. Review `config.py` comments for configuration options
3. See example analyses in the original Dukane project
---
**Last Updated:** January 2026
**Template Version:** 1.0
**Status:** Production Ready

118
SETUP_CHECKLIST.md Normal file
View File

@@ -0,0 +1,118 @@
# Setup Checklist
Use this checklist to ensure your template is properly configured before running analyses.
## ✅ Initial Setup
- [ ] **Install dependencies**
```bash
pip install -r requirements.txt
```
- [ ] **Run setup wizard**
```bash
python setup_wizard.py
```
- [ ] **Place data file** in template directory (or update `DATA_DIR` in config.py)
## ✅ Configuration Verification
Open `config.py` and verify:
- [ ] **Company Information**
- [ ] `COMPANY_NAME` is set
- [ ] `ANALYSIS_DATE` is current
- [ ] **Data File**
- [ ] `DATA_FILE` matches your CSV filename
- [ ] File exists in expected location
- [ ] **Column Mappings**
- [ ] `REVENUE_COLUMN` matches your CSV
- [ ] `DATE_COLUMN` matches your CSV
- [ ] `CUSTOMER_COLUMN` matches your CSV (if applicable)
- [ ] `ITEM_COLUMN` matches your CSV (if applicable)
- [ ] `QUANTITY_COLUMN` matches your CSV (if applicable)
- [ ] **Date Configuration**
- [ ] `MIN_YEAR` is correct
- [ ] `MAX_DATE` is correct
- [ ] `ANALYSIS_YEARS` includes all years you want to analyze
- [ ] **LTM Configuration** (if needed)
- [ ] `LTM_ENABLED` is set correctly
- [ ] `LTM_START_MONTH`, `LTM_START_YEAR` are correct
- [ ] `LTM_END_MONTH`, `LTM_END_YEAR` are correct
- [ ] **Exclusion Filters** (if needed)
- [ ] `EXCLUSION_FILTERS['enabled']` is set correctly
- [ ] `exclude_by_column` matches a column in your data
- [ ] `exclude_values` list is correct
## ✅ Data Loading Test
- [ ] **Test data loading**
```bash
python -c "from data_loader import load_sales_data; from config import get_data_path; df = load_sales_data(get_data_path()); print(f'✓ Loaded {len(df):,} rows')"
```
- [ ] **Verify date coverage**
- Check output shows good date coverage (>95% recommended)
- Verify date range matches expectations
- [ ] **Verify revenue column**
- Check that revenue values are numeric
- Verify no unexpected NaN values
## ✅ First Analysis Test
- [ ] **Copy template**
```bash
cp analysis_template.py test_analysis.py
```
- [ ] **Run test analysis**
```bash
python test_analysis.py
```
- [ ] **Verify outputs**
- [ ] Chart generated successfully
- [ ] Chart saved to `charts/` directory
- [ ] Revenue validation passed
- [ ] No errors in console output
## ✅ Common Issues Check
Before running full analyses, verify:
- [ ] **Column names match** - All column mappings in config.py match your CSV
- [ ] **Date format works** - Dates are parsing correctly (check data_loader output)
- [ ] **Date range is correct** - MIN_YEAR and MAX_DATE include your data
- [ ] **LTM is configured** - If using LTM, dates are within your data range
- [ ] **Exclusions work** - If using exclusions, column and values are correct
## ✅ Ready for Production
Once all checks pass:
- [ ] **Create your analyses** using `analysis_template.py`
- [ ] **Add to batch runner** in `run_all_analyses.py`
- [ ] **Run all analyses** to generate complete analysis suite
---
## 🐛 Troubleshooting
If any check fails:
1. **Data loading issues:** See `.cursor/rules/data_loading.md`
2. **Configuration issues:** Review `config.py` comments
3. **Common errors:** See `.cursor/rules/common_errors.md`
4. **Pattern questions:** See `.cursor/rules/analysis_patterns.md`
---
**Checklist Version:** 1.0
**Last Updated:** January 2026

150
TEMPLATE_OVERVIEW.md Normal file
View File

@@ -0,0 +1,150 @@
# Sales Analysis Template - Overview
**Start here for a high-level understanding of the template.**
For detailed setup, see `QUICK_START.md`. For complete documentation, see `README.md`.
## 🎯 Purpose
This template provides a **production-ready, reusable framework** for analyzing sales invoice detail data from any company. It's designed to be:
- **Flexible:** Works with different column names, date formats, and data structures
- **Automated:** Interactive setup wizard configures everything
- **AI-Optimized:** Fully optimized for Cursor AI - just ask and the AI generates complete analyses
- **Best-in-Class:** Based on proven patterns from 24+ production analyses
## 📦 What's Included
### Core Framework
- **`config.py`** - Centralized configuration (customize for your company)
- **`data_loader.py`** - Intelligent data loading with fallback logic
- **`analysis_utils.py`** - Common utilities (formatters, LTM, helpers)
- **`validate_revenue.py`** - Revenue validation utility
### Templates & Tools
- **`analysis_template.py`** - Template for creating new analyses
- **`run_all_analyses.py`** - Batch runner for all scripts
- **`setup_wizard.py`** - Interactive setup wizard
### Documentation
- **`README.md`** - Comprehensive documentation
- **`QUICK_START.md`** - Quick reference guide
- **`.cursor/rules/`** - Cursor IDE rules for automation
### Configuration
- **`requirements.txt`** - Python dependencies
- **`.gitignore`** - Git ignore patterns
## 🚀 Quick Start
1. **Run setup wizard:**
```bash
python setup_wizard.py
```
2. **Test data loading:**
```bash
python -c "from data_loader import load_sales_data; from config import get_data_path; df = load_sales_data(get_data_path()); print(f'✓ Loaded {len(df):,} rows')"
```
3. **Create your first analysis:**
```bash
cp analysis_template.py my_analysis.py
# Edit my_analysis.py
python my_analysis.py
```
## 🎨 Key Features
### 1. Flexible Data Loading
- Handles different column names via configuration
- Fallback logic for date parsing (100% coverage)
- Automatic validation
### 2. LTM Support
- Automatic Last Twelve Months calculation
- Apples-to-apples comparison with full years
- Configurable periods
### 3. Standardized Formatting
- Automatic millions formatter for revenue
- Consistent chart styling
- Professional output
### 4. Exclusion Filters
- Easy configuration for excluding segments
- Useful for test accounts, business units, etc.
### 5. AI Automation
- Comprehensive Cursor rules
- Automated agent assistance
- Best practices enforcement
## 📊 Analysis Types Supported
This template supports all standard sales analyses:
- **Revenue:** Annual trends, monthly analysis, by segment
- **Customer:** Segmentation, concentration, churn, CLV
- **Product:** Performance, lifecycle, BCG matrix
- **Financial:** Price elasticity, margins
- **Advanced:** Seasonality, forecasting, predictions
## 🔧 Customization Points
All customization happens in `config.py`:
1. **Company Info:** Name, analysis date
2. **Data File:** Location, filename
3. **Column Mappings:** Revenue, date, customer, product, etc.
4. **Date Range:** Years, LTM configuration
5. **Filters:** Exclusion rules
6. **Chart Settings:** Sizes, styles, DPI
## 📚 Documentation Structure
- **`README.md`** - Complete guide (start here)
- **`QUICK_START.md`** - Quick start (includes Cursor tips)
- **`EXAMPLES.md`** - Example scripts guide
- **`TEMPLATE_SUMMARY.md`** - Comprehensive template overview
- **`.cursor/rules/`** - Detailed patterns for AI agents (auto-loaded by Cursor)
- **`config.py`** - Heavily commented configuration
## 🎓 Learning Path
1. **Read:** `QUICK_START.md` (5 minutes)
2. **Run:** `setup_wizard.py` (2 minutes)
3. **Test:** Data loading (1 minute)
4. **Create:** First analysis using `analysis_template.py` (15 minutes)
5. **Explore:** `.cursor/rules/` for patterns (as needed)
## 💡 Best Practices
1. **Always use utilities** - Don't reinvent the wheel
2. **Use config values** - Never hardcode column names
3. **Validate data** - After loading and after analysis
4. **Follow patterns** - See `.cursor/rules/analysis_patterns.md`
5. **Test incrementally** - Test data loading before full analysis
## 🔍 What Makes This "Best-in-Class"
1. **Proven Patterns:** Based on 24+ production analyses
2. **Flexibility:** Works with any data structure
3. **Automation:** Setup wizard + AI-friendly rules
4. **Documentation:** Comprehensive guides and examples
5. **Error Handling:** Validation and troubleshooting built-in
6. **Consistency:** Standardized formatting and patterns
## 📈 Next Steps
1. Run `setup_wizard.py` to configure for your company
2. Review `config.py` to understand all options
3. Create your first analysis using `analysis_template.py`
4. Explore `.cursor/rules/` for detailed patterns
5. Build your analysis suite
---
**Template Version:** 1.0
**Last Updated:** January 2026
**Status:** Production Ready

254
TEMPLATE_SUMMARY.md Normal file
View File

@@ -0,0 +1,254 @@
# Sales Analysis Template - Summary
**This document provides a comprehensive overview of the template structure and capabilities.**
For quick start, see `QUICK_START.md`. For detailed documentation, see `README.md`.
## 📋 What This Template Provides
This template was created based on the comprehensive Dukane Corporation sales analysis project, which included 24+ production-ready analysis scripts. All best practices, patterns, and lessons learned have been distilled into this reusable template.
## 📁 Complete File Structure
```
sales_analysis_template/
├── README.md # Comprehensive documentation
├── QUICK_START.md # Quick reference guide
├── TEMPLATE_OVERVIEW.md # Template overview and features
├── TEMPLATE_SUMMARY.md # This file
├── EXAMPLES.md # Example scripts guide
├── SETUP_CHECKLIST.md # Setup verification checklist
├── requirements.txt # Python dependencies
├── .gitignore # Git ignore patterns
├── Core Framework Files:
│ ├── config.py # ⭐ Centralized configuration
│ ├── config_validator.py # Configuration validation utility
│ ├── data_loader.py # ⭐ Intelligent data loading
│ ├── data_quality.py # Data quality reporting
│ ├── data_processing.py # Data transformation utilities
│ ├── analysis_utils.py # ⭐ Common utilities
│ ├── statistical_utils.py # Statistical analysis utilities
│ └── validate_revenue.py # Revenue validation
├── Utility Files:
│ ├── export_utils.py # Export to CSV/Excel
│ ├── report_generator.py # PDF report generation
│ ├── logger_config.py # Logging configuration
│ └── generate_sample_data.py # Generate sample data for testing
├── Templates & Tools:
│ ├── analysis_template.py # Template for new analyses
│ ├── run_all_analyses.py # Batch runner
│ └── setup_wizard.py # Interactive setup wizard
├── examples/ # Example analysis scripts
│ ├── annual_revenue_trend.py # Simple annual revenue analysis
│ ├── customer_segmentation.py # RFM customer segmentation
│ ├── cohort_analysis.py # Customer cohort analysis
│ └── product_performance.py # Product performance analysis
├── tests/ # Unit tests
│ ├── test_data_loader.py # Data loader tests
│ ├── test_analysis_utils.py # Analysis utils tests
│ └── test_config_validator.py # Config validator tests
└── .cursor/
└── rules/ # Cursor IDE rules (auto-loaded)
├── ai_assistant_guide.md # Complete AI assistant guide
├── advanced_analysis_patterns.md # Advanced techniques
├── analysis_patterns.md # Analysis patterns
├── chart_formatting.md # Chart formatting rules
├── code_quality.md # Code quality standards
├── common_errors.md # Error troubleshooting
├── data_loading.md # Data loading patterns
├── error_handling.md # Error handling patterns
└── ltm_methodology.md # LTM methodology
```
## 🎯 Key Features Implemented
### 1. Flexible Configuration System
- **`config.py`**: Centralized configuration with extensive comments
- All column names, date ranges, and settings configurable
- No hardcoded values - everything comes from config
### 2. Intelligent Data Loading
- **`data_loader.py`**: Fallback logic for date parsing
- Handles missing dates gracefully
- 100% date coverage via fallback columns
- Automatic validation and error reporting
### 3. Comprehensive Utilities
- **`analysis_utils.py`**: All common functions in one place
- Chart formatters (millions, thousands)
- LTM calculation helpers
- Mixed type handling for years
- Price calculation utilities
- Exclusion filter helpers
### 4. Interactive Setup
- **`setup_wizard.py`**: Asks clarifying questions
- Automatically configures `config.py`
- Validates inputs
- Provides next steps
### 5. AI-Friendly Rules
- **`.cursor/rules/`**: Comprehensive Cursor IDE rules
- Auto-loaded by Cursor
- Enforces best practices
- Provides patterns and troubleshooting
### 6. Production-Ready Templates
- **`analysis_template.py`**: Complete template with examples
- **`run_all_analyses.py`**: Batch runner with error handling
- Follows all best practices
## 🔑 Design Principles
### Flexibility
- Works with any column names (configured in config.py)
- Handles different date formats
- Supports various data structures
- Optional features (LTM, exclusions) can be disabled
### Automation
- Setup wizard asks all necessary questions
- Cursor rules guide AI agents automatically
- Batch runner handles multiple analyses
- Validation catches errors early
### Best Practices
- Always use utilities (never reinvent the wheel)
- Consistent formatting across all analyses
- Proper error handling and validation
- Comprehensive documentation
### Reusability
- Generic enough for any company
- Specific enough to be immediately useful
- Well-documented for future agents
- Easy to extend with new analyses
## 📊 Analysis Types Supported
The template supports all standard sales analyses:
### Revenue Analyses
- Annual revenue trends
- Monthly revenue analysis
- Revenue by segment/product/geography
### Customer Analyses
- Customer segmentation (RFM)
- Customer concentration
- Churn analysis
- Cohort analysis
- Customer lifetime value (CLV)
### Product Analyses
- Product performance
- Product lifecycle
- BCG matrix
- Market basket analysis
### Financial Analyses
- Price elasticity
- Contribution margin
- Price vs volume analysis
### Advanced Analyses
- Seasonality analysis
- Time series forecasting
- Customer churn prediction
## 🚀 Usage Workflow
1. **Setup** (5 minutes)
- Run `setup_wizard.py`
- Answer questions about your data
- Configuration automatically updated
2. **Test** (2 minutes)
- Test data loading
- Verify configuration works
3. **Create** (15 minutes)
- Copy `analysis_template.py`
- Customize for your analysis
- Run and verify
4. **Scale** (ongoing)
- Create multiple analyses
- Add to batch runner
- Generate complete analysis suite
## 💡 What Makes This "Best-in-Class"
1. **Proven Patterns**: Based on 24+ production analyses
2. **Comprehensive**: Covers all common analysis types
3. **Flexible**: Works with any data structure
4. **Automated**: Setup wizard + AI-friendly rules
5. **Documented**: Extensive documentation at every level
6. **Production-Ready**: Error handling, validation, best practices
## 📚 Documentation Hierarchy
1. **`QUICK_START.md`** - Start here (5-minute overview, includes Cursor tips)
2. **`README.md`** - Complete guide (comprehensive)
3. **`EXAMPLES.md`** - Example scripts guide
4. **`TEMPLATE_OVERVIEW.md`** - High-level overview
5. **`SETUP_CHECKLIST.md`** - Verification checklist
6. **`.cursor/rules/`** - Detailed patterns for AI agents (auto-loaded by Cursor)
7. **`config.py`** - Inline comments for all options
## 🎓 Learning Resources
- **Quick Start**: `QUICK_START.md` - Get running in 5 minutes
- **Full Guide**: `README.md` - Complete documentation
- **Patterns**: `.cursor/rules/analysis_patterns.md` - Code patterns
- **Troubleshooting**: `.cursor/rules/common_errors.md` - Fix issues
- **Examples**: `analysis_template.py` - Working example
## ✅ Quality Assurance
All components include:
- ✅ Error handling
- ✅ Input validation
- ✅ Comprehensive comments
- ✅ Type hints where helpful
- ✅ Documentation strings
- ✅ Best practices enforcement
## 🔄 Future Enhancements
Potential additions (not included in v1.0):
- Example analysis scripts (can be added from Dukane project)
- Unit tests
- CI/CD configuration
- Docker containerization
- Additional visualization libraries
## 📝 Notes for Users
1. **First Time**: Start with `QUICK_START.md` and `setup_wizard.py`
2. **Configuration**: All customization in `config.py`
3. **Creating Analyses**: Use `analysis_template.py` as starting point
4. **AI Assistance**: Cursor rules are auto-loaded, just ask for help
5. **Troubleshooting**: Check `.cursor/rules/common_errors.md` first
## 🎉 Success Criteria
The template is ready when:
- ✅ Setup wizard runs successfully
- ✅ Data loads without errors
- ✅ First analysis generates charts
- ✅ All validations pass
- ✅ Documentation is clear
---
**Template Version:** 1.0
**Created:** January 2026
**Based On:** Dukane Corporation Sales Analysis Project
**Status:** Production Ready ✅

147
analysis_template.py Normal file
View File

@@ -0,0 +1,147 @@
"""
Template for creating new analysis scripts
Copy this file and modify for your specific analysis
Usage:
1. Copy this file: cp analysis_template.py my_new_analysis.py
2. Update the ANALYSIS_NAME and DESCRIPTION
3. Implement your analysis logic in the main() function
4. Update the chart generation section
5. Run: python my_new_analysis.py
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
# Import utilities
from data_loader import load_sales_data, validate_data_structure
from validate_revenue import validate_revenue
from analysis_utils import (
get_ltm_period_config, get_annual_data, calculate_annual_metrics,
get_millions_formatter, setup_revenue_chart, save_chart,
format_currency, print_annual_summary, sort_mixed_years,
apply_exclusion_filters
)
from config import (
DATA_FILE, OUTPUT_DIR, ANALYSIS_YEARS, MAX_DATE,
CHART_SIZES, ensure_directories, get_data_path, COMPANY_NAME
)
# ============================================================================
# CONFIGURATION
# ============================================================================
ANALYSIS_NAME = "Template Analysis"
DESCRIPTION = "Template for new analyses - customize this for your specific analysis"
# ============================================================================
# MAIN ANALYSIS FUNCTION
# ============================================================================
def main():
"""Main analysis function"""
print(f"\n{'='*60}")
print(f"{ANALYSIS_NAME}")
print(f"{'='*60}\n")
# 1. Load data
print("Loading data...")
try:
df = load_sales_data(get_data_path())
print(f"Loaded {len(df):,} transactions")
except Exception as e:
print(f"ERROR loading data: {e}")
return
# 2. Validate data structure
is_valid, msg = validate_data_structure(df)
if not is_valid:
print(f"ERROR: {msg}")
return
print("Data validation passed")
# 3. Apply exclusion filters (if configured)
df = apply_exclusion_filters(df)
# 4. Filter by date range
from config import MIN_YEAR, DATE_COLUMN
df = df[df['Year'] >= MIN_YEAR]
if DATE_COLUMN in df.columns:
df = df[df[DATE_COLUMN] <= MAX_DATE]
# 5. Setup LTM period (if enabled)
ltm_start, ltm_end = get_ltm_period_config()
if ltm_start and ltm_end:
print(f"LTM period: {ltm_start} to {ltm_end}")
# 6. Prepare data
print("\nPreparing data...")
# Add your data preparation logic here
# Example: df['CustomColumn'] = df[REVENUE_COLUMN] * df[QUANTITY_COLUMN]
# 7. Calculate annual metrics
print("\nCalculating annual metrics...")
def calculate_metrics(year_data):
"""Calculate metrics for a single year"""
from config import REVENUE_COLUMN
return {
'Revenue': year_data[REVENUE_COLUMN].sum(),
# Add your custom metrics here
# 'CustomMetric': year_data['CustomColumn'].mean(),
}
annual_df = calculate_annual_metrics(df, calculate_metrics, ltm_start, ltm_end)
# 8. Print summary
print_annual_summary(annual_df, 'Revenue', 'Revenue')
# 9. Create visualizations
print("Generating charts...")
ensure_directories()
# Example chart: Annual revenue trend
fig, ax = plt.subplots(figsize=CHART_SIZES['medium'])
# Prepare data for plotting (handle mixed types)
annual_df_sorted = sort_mixed_years(annual_df.reset_index(), 'Year')
years = annual_df_sorted['Year'].tolist()
revenue = annual_df_sorted['Revenue'].values / 1e6 # Convert to millions
# Create chart
ax.plot(range(len(years)), revenue, marker='o', linewidth=2, markersize=8)
ax.set_xticks(range(len(years)))
ax.set_xticklabels(years, rotation=45, ha='right')
setup_revenue_chart(ax)
# Add LTM notation to title if applicable
title = f'Annual Revenue Trend - {COMPANY_NAME}'
if ltm_start and ltm_end:
from config import get_ltm_label
ltm_label = get_ltm_label()
if ltm_label:
title += f'\n({ltm_label})'
ax.set_title(title)
plt.tight_layout()
save_chart(fig, f'{ANALYSIS_NAME.lower().replace(" ", "_")}_trend.png')
plt.close()
# Add more charts as needed...
# 10. Validate revenue
print("\nValidating revenue...")
validate_revenue(df, ANALYSIS_NAME)
print(f"\n{ANALYSIS_NAME} complete!")
print(f"Charts saved to: {OUTPUT_DIR}")
# ============================================================================
# RUN ANALYSIS
# ============================================================================
if __name__ == "__main__":
main()

510
analysis_utils.py Normal file
View File

@@ -0,0 +1,510 @@
"""
Common utilities for analysis scripts
Provides formatters, LTM setup, and helper functions
This module is designed to work with any sales data structure
by using configuration from config.py
"""
import pandas as pd
import numpy as np
from matplotlib.ticker import FuncFormatter
from pathlib import Path
from config import (
REVENUE_COLUMN, LTM_ENABLED, get_ltm_period, get_ltm_label,
OUTPUT_DIR, CHART_DPI, CHART_BBOX
)
# ============================================================================
# CHART FORMATTERS
# ============================================================================
def millions_formatter(x: float, pos: int) -> str:
"""
Format numbers in millions for chart display (e.g., $99.9m)
This formatter is used with matplotlib FuncFormatter to display
revenue values in millions on chart axes.
Args:
x: Numeric value (already in millions, e.g., 99.9 for $99.9m)
pos: Position parameter (required by FuncFormatter, not used)
Returns:
str: Formatted string like "$99.9m"
Example:
>>> from matplotlib.ticker import FuncFormatter
>>> formatter = FuncFormatter(millions_formatter)
>>> ax.yaxis.set_major_formatter(formatter)
"""
return f'${x:.1f}m'
def thousands_formatter(x: float, pos: int) -> str:
"""
Format numbers in thousands for chart display (e.g., $99.9k)
Args:
x: Numeric value (already in thousands)
pos: Position parameter (required by FuncFormatter, not used)
Returns:
str: Formatted string like "$99.9k"
"""
return f'${x:.1f}k'
def get_millions_formatter() -> FuncFormatter:
"""
Get FuncFormatter for millions
Returns:
FuncFormatter: Configured formatter for millions display
"""
return FuncFormatter(millions_formatter)
def get_thousands_formatter() -> FuncFormatter:
"""
Get FuncFormatter for thousands
Returns:
FuncFormatter: Configured formatter for thousands display
"""
return FuncFormatter(thousands_formatter)
# ============================================================================
# LTM (Last Twelve Months) SETUP
# ============================================================================
def get_ltm_period_config():
"""
Get LTM period boundaries from config
Returns:
tuple: (ltm_start, ltm_end) as pd.Period objects, or (None, None) if disabled
"""
if LTM_ENABLED:
return get_ltm_period()
return None, None
def get_annual_data(df, year, ltm_start=None, ltm_end=None):
"""
Get data for a specific year, using LTM for the most recent partial year
Args:
df: DataFrame with 'Year' and 'YearMonth' columns
year: Year to extract (int)
ltm_start: LTM start period (defaults to config if None)
ltm_end: LTM end period (defaults to config if None)
Returns:
tuple: (year_data DataFrame, year_label string)
"""
from config import LTM_END_YEAR
# Get LTM period from config if not provided
if ltm_start is None or ltm_end is None:
ltm_start, ltm_end = get_ltm_period_config()
# Use LTM for the most recent year if enabled
if LTM_ENABLED and ltm_start and ltm_end and year == LTM_END_YEAR:
if 'YearMonth' in df.columns:
year_data = df[(df['YearMonth'] >= ltm_start) & (df['YearMonth'] <= ltm_end)]
year_label = get_ltm_label() or str(year)
else:
# Fallback if YearMonth not available
year_data = df[df['Year'] == year]
year_label = str(year)
else:
# Use full calendar year
year_data = df[df['Year'] == year]
year_label = str(year)
return year_data, year_label
def calculate_annual_metrics(df, metrics_func, ltm_start=None, ltm_end=None):
"""
Calculate annual metrics for all years, using LTM for most recent year
Args:
df: DataFrame with 'Year' and 'YearMonth' columns
metrics_func: Function that takes a DataFrame and returns a dict of metrics
ltm_start: LTM start period (defaults to config if None)
ltm_end: LTM end period (defaults to config if None)
Returns:
DataFrame with 'Year' index and metric columns
"""
from config import ANALYSIS_YEARS
if ltm_start is None or ltm_end is None:
ltm_start, ltm_end = get_ltm_period_config()
annual_data = []
for year in sorted(ANALYSIS_YEARS):
if year in df['Year'].unique():
year_data, year_label = get_annual_data(df, year, ltm_start, ltm_end)
if len(year_data) > 0:
metrics = metrics_func(year_data)
metrics['Year'] = year_label
annual_data.append(metrics)
if not annual_data:
return pd.DataFrame()
return pd.DataFrame(annual_data).set_index('Year')
# ============================================================================
# MIXED TYPE HANDLING
# ============================================================================
def create_year_sort_column(df, year_col='Year'):
"""
Create a numeric sort column for mixed int/str year columns
Args:
df: DataFrame
year_col: Name of year column
Returns:
Series with numeric sort values
"""
from config import LTM_END_YEAR
def sort_value(x):
if isinstance(x, str) and str(LTM_END_YEAR) in str(x):
return float(LTM_END_YEAR) + 0.5
elif isinstance(x, (int, float)):
return float(x)
else:
return 9999
return df[year_col].apply(sort_value)
def sort_mixed_years(df, year_col='Year'):
"""
Sort DataFrame by year column that may contain mixed int/str types
Args:
df: DataFrame
year_col: Name of year column
Returns:
Sorted DataFrame
"""
df = df.copy()
df['_Year_Sort'] = create_year_sort_column(df, year_col)
df = df.sort_values('_Year_Sort').drop(columns=['_Year_Sort'])
return df
def safe_year_labels(years):
"""
Convert year values to safe string labels for chart axes
Args:
years: Iterable of year values (int or str)
Returns:
List of string labels
"""
return [str(year) for year in years]
# ============================================================================
# CHART HELPERS
# ============================================================================
def setup_revenue_chart(ax, ylabel: str = 'Revenue (Millions USD)') -> None:
"""
Setup a chart axis for revenue display (millions)
CRITICAL: Always use this function for revenue charts. It applies
the millions formatter and standard styling.
IMPORTANT: Data must be divided by 1e6 BEFORE plotting:
ax.plot(revenue / 1e6, ...) # ✅ Correct
ax.plot(revenue, ...) # ❌ Wrong - will show scientific notation
Args:
ax: Matplotlib axis object to configure
ylabel: Y-axis label (default: 'Revenue (Millions USD)')
Returns:
None: Modifies ax in place
Example:
>>> import matplotlib.pyplot as plt
>>> from analysis_utils import setup_revenue_chart
>>> fig, ax = plt.subplots()
>>> ax.plot(revenue_data / 1e6, marker='o') # Divide by 1e6 first!
>>> setup_revenue_chart(ax)
>>> plt.show()
See Also:
- .cursor/rules/chart_formatting.md for detailed patterns
- save_chart() for saving charts
"""
ax.yaxis.set_major_formatter(get_millions_formatter())
ax.set_ylabel(ylabel)
ax.grid(True, alpha=0.3)
def save_chart(fig, filename, output_dir=None):
"""
Save chart to file with organized directory structure
Args:
fig: Matplotlib figure object
filename: Output filename (e.g., 'revenue_trend.png')
output_dir: Output directory (defaults to config.OUTPUT_DIR)
"""
if output_dir is None:
output_dir = OUTPUT_DIR
else:
output_dir = Path(output_dir)
output_dir.mkdir(exist_ok=True)
filepath = output_dir / filename
fig.savefig(filepath, dpi=CHART_DPI, bbox_inches=CHART_BBOX, format='png')
print(f"Chart saved: {filepath}")
# ============================================================================
# DATA VALIDATION
# ============================================================================
def validate_dataframe(df, required_columns=None):
"""
Validate DataFrame has required columns and basic data quality
Args:
df: DataFrame to validate
required_columns: List of required column names (defaults to config)
Returns:
tuple: (is_valid bool, error_message str)
"""
if required_columns is None:
required_columns = [REVENUE_COLUMN, 'Year']
if 'YearMonth' in df.columns:
required_columns.append('YearMonth')
missing_cols = [col for col in required_columns if col not in df.columns]
if missing_cols:
return False, f"Missing required columns: {missing_cols}"
if len(df) == 0:
return False, "DataFrame is empty"
if REVENUE_COLUMN in df.columns:
if df[REVENUE_COLUMN].isna().all():
return False, f"All {REVENUE_COLUMN} values are NaN"
return True, "OK"
# ============================================================================
# PRICE CALCULATION
# ============================================================================
def calculate_price_per_unit(df, quantity_col=None, revenue_col=None):
"""
Calculate average price per unit, excluding invalid quantities
Args:
df: DataFrame with quantity and revenue columns
quantity_col: Name of quantity column (defaults to config)
revenue_col: Name of revenue column (defaults to config)
Returns:
float: Average price per unit
"""
from config import QUANTITY_COLUMN, REVENUE_COLUMN, MIN_QUANTITY, MAX_QUANTITY
if quantity_col is None:
quantity_col = QUANTITY_COLUMN
if revenue_col is None:
revenue_col = REVENUE_COLUMN
# Check if quantity column exists
if quantity_col not in df.columns:
return np.nan
# Filter for valid quantity transactions
df_valid = df[(df[quantity_col] > MIN_QUANTITY) & (df[quantity_col] <= MAX_QUANTITY)].copy()
if len(df_valid) == 0:
return np.nan
total_revenue = df_valid[revenue_col].sum()
total_quantity = df_valid[quantity_col].sum()
if total_quantity == 0:
return np.nan
return total_revenue / total_quantity
# ============================================================================
# OUTPUT FORMATTING
# ============================================================================
def format_currency(value: float, millions: bool = True) -> str:
"""
Format currency value for console output
Args:
value: Numeric value to format
millions: If True, format as millions ($X.Xm), else thousands ($X.Xk)
Returns:
str: Formatted string like "$99.9m" or "$99.9k" or "N/A" if NaN
Example:
>>> format_currency(1000000)
'$1.00m'
>>> format_currency(1000, millions=False)
'$1.00k'
"""
if pd.isna(value):
return "N/A"
if millions:
return f"${value / 1e6:.2f}m"
else:
return f"${value / 1e3:.2f}k"
def print_annual_summary(annual_df, metric_col='Revenue', label='Revenue'):
"""
Print formatted annual summary to console
Args:
annual_df: DataFrame with annual metrics (indexed by Year)
metric_col: Column name to print
label: Label for the metric
"""
print(f"\n{label} by Year:")
print("-" * 40)
for year in annual_df.index:
value = annual_df.loc[year, metric_col]
formatted = format_currency(value)
print(f" {year}: {formatted}")
print()
# ============================================================================
# DATA FILTERING HELPERS
# ============================================================================
def apply_exclusion_filters(df):
"""
Apply exclusion filters from config
Args:
df: DataFrame to filter
Returns:
Filtered DataFrame
"""
from config import EXCLUSION_FILTERS
if not EXCLUSION_FILTERS.get('enabled', False):
return df
exclude_col = EXCLUSION_FILTERS.get('exclude_by_column')
exclude_values = EXCLUSION_FILTERS.get('exclude_values', [])
if exclude_col and exclude_col in df.columns and exclude_values:
original_count = len(df)
df_filtered = df[~df[exclude_col].isin(exclude_values)]
excluded_count = original_count - len(df_filtered)
if excluded_count > 0:
print(f"Excluded {excluded_count:,} rows based on {exclude_col} filter")
return df_filtered
return df
# ============================================================================
# INTERACTIVE VISUALIZATIONS (OPTIONAL - PLOTLY)
# ============================================================================
def create_interactive_chart(data, chart_type='line', title=None, xlabel=None, ylabel=None):
"""
Create interactive chart using Plotly (optional dependency)
Args:
data: DataFrame or dict with chart data
chart_type: Type of chart ('line', 'bar', 'scatter')
title: Chart title
xlabel: X-axis label
ylabel: Y-axis label
Returns:
plotly.graph_objects.Figure: Plotly figure object
Raises:
ImportError: If plotly is not installed
Example:
fig = create_interactive_chart(
{'x': [1, 2, 3], 'y': [10, 20, 30]},
chart_type='line',
title='Revenue Trend'
)
fig.show()
"""
try:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
except ImportError:
raise ImportError(
"plotly is required for interactive charts. Install with: pip install plotly"
)
fig = go.Figure()
if chart_type == 'line':
if isinstance(data, dict) and 'x' in data and 'y' in data:
fig.add_trace(go.Scatter(
x=data['x'],
y=data['y'],
mode='lines+markers',
name='Data'
))
elif chart_type == 'bar':
if isinstance(data, dict) and 'x' in data and 'y' in data:
fig.add_trace(go.Bar(
x=data['x'],
y=data['y'],
name='Data'
))
if title:
fig.update_layout(title=title)
if xlabel:
fig.update_xaxes(title_text=xlabel)
if ylabel:
fig.update_yaxes(title_text=ylabel)
fig.update_layout(
template='plotly_white',
hovermode='x unified'
)
return fig
def save_interactive_chart(fig, filename, output_dir=None):
"""
Save interactive Plotly chart to HTML file
Args:
fig: Plotly figure object
filename: Output filename (e.g., 'chart.html')
output_dir: Output directory (defaults to config.OUTPUT_DIR)
"""
if output_dir is None:
output_dir = OUTPUT_DIR
else:
output_dir = Path(output_dir)
output_dir.mkdir(exist_ok=True)
filepath = output_dir / filename
fig.write_html(str(filepath))
print(f"Interactive chart saved: {filepath}")
return filepath

277
config.py Normal file
View File

@@ -0,0 +1,277 @@
"""
Configuration file for sales analysis scripts
CONFIGURE THIS FILE FOR YOUR COMPANY'S SPECIFIC DATA STRUCTURE
This file should be customized based on:
- Your data file name and location
- Column names in your sales data
- Date range and LTM period
- Company-specific settings
CRITICAL: All column names, file paths, and settings are defined here.
Never hardcode these values in analysis scripts - always import from config.
Usage:
from config import REVENUE_COLUMN, DATE_COLUMN, get_data_path
revenue = df[REVENUE_COLUMN].sum() # ✅ Correct
revenue = df['USD'].sum() # ❌ Wrong - hardcoded
Quick Setup:
1. Run: python setup_wizard.py (interactive configuration)
2. Or manually edit this file following the TODO comments
3. Validate: python config_validator.py
See Also:
- .cursor/rules/analysis_patterns.md - How to use config values
- setup_wizard.py - Interactive configuration tool
- config_validator.py - Configuration validation
"""
from pathlib import Path
from typing import Optional, Tuple
import pandas as pd
# ============================================================================
# COMPANY INFORMATION
# ============================================================================
# TODO: Update these values for your company
COMPANY_NAME = "Your Company Name" # Update this
ANALYSIS_DATE = "2026-01-12" # Update this to current date
# ============================================================================
# DATA FILES
# ============================================================================
# TODO: Update with your actual data file name
DATA_FILE = 'sales_data.csv' # Update this to your CSV file name
OUTPUT_DIR = Path('charts')
REPORTS_DIR = Path('reports')
DATA_DIR = Path('data') # Optional: if data is in a subdirectory
# ============================================================================
# DATA COLUMN MAPPINGS
# ============================================================================
# TODO: Map these to your actual column names
# These are the expected column names - update if your CSV uses different names
# Revenue column (REQUIRED)
REVENUE_COLUMN = 'USD' # Common alternatives: 'Amount', 'Revenue', 'Total', 'Sales'
# Date columns (at least one required)
DATE_COLUMN = 'InvoiceDate' # Primary date column
DATE_FALLBACK_COLUMNS = ['Month', 'Year'] # Fallback columns if primary is missing
# Customer/Account columns
CUSTOMER_COLUMN = 'Customer' # Common alternatives: 'Account', 'CustomerName', 'Client'
# Product/Item columns
ITEM_COLUMN = 'Item' # Common alternatives: 'Product', 'SKU', 'ItemCode'
PRODUCT_GROUP_COLUMN = 'ProductGroup' # Optional: for product categorization
QUANTITY_COLUMN = 'Quantity' # Optional: for price calculations
# Geographic columns (optional)
REGION_COLUMN = 'Region' # Optional: for geographic analysis
COUNTRY_COLUMN = 'Country' # Optional: for country-level analysis
# Segment/Category columns (optional - customize based on your data)
SEGMENT_COLUMNS = {
'Technology': 'Technology', # Optional: technology/product type
'EndMarket': 'EndMarket', # Optional: end market/industry
'ProductGroup': 'ProductGroup', # Optional: product category
}
# Invoice/Transaction columns
INVOICE_NUMBER_COLUMN = 'Invoice #' # Optional: for transaction-level analysis
# ============================================================================
# DATE RANGE CONFIGURATION
# ============================================================================
# TODO: Update these based on your data and analysis needs
# Analysis years (years to include in analysis)
ANALYSIS_YEARS = [2021, 2022, 2023, 2024, 2025] # Update based on your data
# LTM (Last Twelve Months) Configuration
# For the most recent partial year, use LTM for apples-to-apples comparison
# Example: If latest data is through September 2025, use Oct 2024 - Sep 2025
LTM_ENABLED = True # Set to False if you have complete calendar years only
LTM_START_MONTH = 10 # Month number (1-12) for LTM start
LTM_START_YEAR = 2024 # Year for LTM start
LTM_END_MONTH = 9 # Month number (1-12) for LTM end
LTM_END_YEAR = 2025 # Year for LTM end
# Generate LTM period objects
if LTM_ENABLED:
LTM_START = pd.Period(f'{LTM_START_YEAR}-{LTM_START_MONTH:02d}', freq='M')
LTM_END = pd.Period(f'{LTM_END_YEAR}-{LTM_END_MONTH:02d}', freq='M')
LTM_LABEL = f'{LTM_END_YEAR} (LTM {LTM_END_MONTH}/{LTM_END_YEAR})'
else:
LTM_START = None
LTM_END = None
LTM_LABEL = None
# Data date range (filter data to this range)
MIN_YEAR = 2021 # Minimum year to include
MAX_DATE = pd.Timestamp('2025-09-30') # Maximum date to include (update based on your data)
# ============================================================================
# CHART SETTINGS
# ============================================================================
CHART_DPI = 300
CHART_FORMAT = 'png'
CHART_BBOX = 'tight'
CHART_STYLE = 'seaborn-v0_8' # Options: 'default', 'ggplot', 'seaborn-v0_8', etc.
# Chart size presets
CHART_SIZES = {
'small': (6, 4),
'medium': (10, 6),
'large': (12, 8),
'wide': (14, 6)
}
# ============================================================================
# DATA FILTERING
# ============================================================================
# Quantity filtering for price calculations (exclude outliers)
MIN_QUANTITY = 0 # Minimum valid quantity
MAX_QUANTITY = 1000 # Maximum valid quantity (adjust based on your data)
# Revenue filtering (optional - exclude negative values, returns, etc.)
EXCLUDE_NEGATIVE_REVENUE = False # Set to True to exclude negative revenue (returns/credits)
MIN_REVENUE = None # Optional: minimum revenue threshold
# ============================================================================
# EXCLUSION FILTERS (Optional)
# ============================================================================
# Use this section to exclude specific segments, customers, or products
# Example: Exclude a business unit, test accounts, etc.
EXCLUSION_FILTERS = {
'enabled': False, # Set to True to enable exclusions
'exclude_by_column': None, # Column name to filter on (e.g., 'Country', 'Segment')
'exclude_values': [], # List of values to exclude (e.g., ['KVT', 'Test'])
}
# ============================================================================
# VALIDATION THRESHOLDS (Optional)
# ============================================================================
# Expected revenue ranges for validation (update based on your company)
# These are used to validate that data loading is working correctly
VALIDATION_ENABLED = False # Set to True to enable validation
EXPECTED_REVENUE = {} # Example: {2021: 99_880_000, 2024: 89_990_000}
REVENUE_TOLERANCE_PCT = 0.01 # 1% tolerance for validation
# ============================================================================
# HELPER FUNCTIONS
# ============================================================================
def ensure_directories() -> None:
"""
Create output directories if they don't exist
Creates charts/ and reports/ directories for saving analysis outputs.
Called automatically by get_chart_path() and get_report_path().
Returns:
None: Creates directories in place
"""
OUTPUT_DIR.mkdir(exist_ok=True)
REPORTS_DIR.mkdir(exist_ok=True)
if DATA_DIR.exists():
DATA_DIR.mkdir(exist_ok=True)
def get_chart_path(filename: str) -> Path:
"""
Get full path for chart file
Args:
filename: Chart filename (e.g., 'revenue_trend.png')
Returns:
Path: Full path to chart file in OUTPUT_DIR
"""
ensure_directories()
return OUTPUT_DIR / filename
def get_report_path(filename: str) -> Path:
"""
Get full path for report file
Args:
filename: Report filename (e.g., 'analysis_report.pdf')
Returns:
Path: Full path to report file in REPORTS_DIR
"""
ensure_directories()
return REPORTS_DIR / filename
def get_data_path(filename: Optional[str] = None) -> Path:
"""
Get full path for data file
This function handles data file location logic:
- If DATA_DIR exists, looks there first
- Otherwise uses current directory
- Defaults to DATA_FILE from config if filename not provided
Args:
filename: Optional filename override (defaults to config.DATA_FILE)
Returns:
Path: Full path to data file
Example:
>>> from config import get_data_path
>>> data_path = get_data_path()
>>> print(f"Loading from: {data_path}")
"""
if filename is None:
filename = DATA_FILE
if DATA_DIR.exists():
return DATA_DIR / filename
return Path(filename)
def get_ltm_period() -> Tuple[Optional[pd.Period], Optional[pd.Period]]:
"""
Get LTM (Last Twelve Months) period boundaries from config
Returns LTM start and end periods if LTM is enabled and configured,
otherwise returns (None, None).
Returns:
Tuple[Optional[pd.Period], Optional[pd.Period]]:
(ltm_start, ltm_end) or (None, None) if disabled
Example:
>>> ltm_start, ltm_end = get_ltm_period()
>>> if ltm_start and ltm_end:
... print(f"LTM: {ltm_start} to {ltm_end}")
See Also:
- get_ltm_label() - Get formatted LTM label string
- .cursor/rules/ltm_methodology.md - LTM explanation
"""
if LTM_ENABLED and LTM_START and LTM_END:
return LTM_START, LTM_END
return None, None
def get_ltm_label() -> Optional[str]:
"""
Get LTM label string for display
Returns formatted label like "2025 (LTM 9/2025)" if LTM is enabled,
otherwise None. Use this in chart titles and labels.
Returns:
Optional[str]: LTM label string or None if LTM disabled
Example:
>>> from config import get_ltm_label
>>> ltm_label = get_ltm_label()
>>> if ltm_label:
... title = f'Revenue Trend\n({ltm_label})'
See Also:
- get_ltm_period() - Get LTM period objects
- .cursor/rules/ltm_methodology.md - LTM usage guide
"""
return LTM_LABEL if LTM_ENABLED else None

214
config_validator.py Normal file
View File

@@ -0,0 +1,214 @@
"""
Configuration validation utility
Validates configuration settings against data to catch errors early
Usage:
from config_validator import validate_config
# Validate configuration
errors, warnings = validate_config(df)
if errors:
print("Configuration errors found:", errors)
"""
import pandas as pd
from pathlib import Path
from config import (
DATA_FILE, REVENUE_COLUMN, DATE_COLUMN, DATE_FALLBACK_COLUMNS,
CUSTOMER_COLUMN, ITEM_COLUMN, QUANTITY_COLUMN,
MIN_YEAR, MAX_DATE, ANALYSIS_YEARS,
LTM_ENABLED, LTM_START, LTM_END, LTM_START_YEAR, LTM_END_YEAR,
EXCLUSION_FILTERS, get_data_path
)
def validate_config(df=None):
"""
Validate configuration against data
Args:
df: Optional DataFrame to validate against. If None, attempts to load data.
Returns:
tuple: (errors list, warnings list)
Example:
errors, warnings = validate_config(df)
if errors:
for error in errors:
print(f"ERROR: {error}")
if warnings:
for warning in warnings:
print(f"WARNING: {warning}")
"""
errors = []
warnings = []
# Load data if not provided
if df is None:
try:
from data_loader import load_sales_data
data_path = get_data_path()
if not data_path.exists():
errors.append(f"Data file not found: {data_path}")
return errors, warnings
df = load_sales_data(data_path)
except Exception as e:
errors.append(f"Could not load data for validation: {e}")
return errors, warnings
# 1. Validate required columns exist
required_columns = [REVENUE_COLUMN, DATE_COLUMN]
for col in required_columns:
if col not in df.columns:
errors.append(f"Required column '{col}' not found in data. Available columns: {list(df.columns)[:10]}")
# 2. Validate date column has valid dates
if DATE_COLUMN in df.columns:
date_coverage = df[DATE_COLUMN].notna().sum() / len(df) * 100
if date_coverage < 50:
errors.append(f"Date coverage is very low ({date_coverage:.1f}%). Check date column configuration.")
elif date_coverage < 90:
warnings.append(f"Date coverage is {date_coverage:.1f}%. Consider adding fallback date columns.")
# 3. Validate fallback date columns
if DATE_FALLBACK_COLUMNS:
missing_fallbacks = [col for col in DATE_FALLBACK_COLUMNS if col not in df.columns]
if missing_fallbacks:
warnings.append(f"Fallback date columns not found: {missing_fallbacks}")
# 4. Validate revenue column is numeric
if REVENUE_COLUMN in df.columns:
try:
pd.to_numeric(df[REVENUE_COLUMN], errors='coerce')
valid_revenue = df[REVENUE_COLUMN].notna().sum()
if valid_revenue == 0:
errors.append(f"Revenue column '{REVENUE_COLUMN}' has no valid numeric values")
elif valid_revenue < len(df) * 0.9:
warnings.append(f"Revenue column has {len(df) - valid_revenue} invalid values")
except Exception:
errors.append(f"Revenue column '{REVENUE_COLUMN}' cannot be converted to numeric")
# 5. Validate date range
if DATE_COLUMN in df.columns and df[DATE_COLUMN].notna().any():
min_date_in_data = df[DATE_COLUMN].min()
max_date_in_data = df[DATE_COLUMN].max()
if MIN_YEAR and min_date_in_data.year > MIN_YEAR:
warnings.append(f"MIN_YEAR ({MIN_YEAR}) is earlier than earliest data ({min_date_in_data.year})")
if MAX_DATE and max_date_in_data > MAX_DATE:
warnings.append(f"MAX_DATE ({MAX_DATE.date()}) is earlier than latest data ({max_date_in_data.date()})")
# 6. Validate analysis years
if 'Year' in df.columns:
available_years = sorted(df['Year'].unique())
missing_years = [year for year in ANALYSIS_YEARS if year not in available_years]
if missing_years:
warnings.append(f"ANALYSIS_YEARS includes years not in data: {missing_years}")
# 7. Validate LTM configuration
if LTM_ENABLED:
if LTM_START is None or LTM_END is None:
errors.append("LTM_ENABLED is True but LTM_START or LTM_END is None")
else:
if LTM_START > LTM_END:
errors.append(f"LTM_START ({LTM_START}) is after LTM_END ({LTM_END})")
if 'YearMonth' in df.columns:
available_periods = df['YearMonth'].unique()
if LTM_START not in available_periods:
warnings.append(f"LTM_START ({LTM_START}) not found in data")
if LTM_END not in available_periods:
warnings.append(f"LTM_END ({LTM_END}) not found in data")
# 8. Validate exclusion filters
if EXCLUSION_FILTERS.get('enabled', False):
exclude_col = EXCLUSION_FILTERS.get('exclude_by_column')
if exclude_col:
if exclude_col not in df.columns:
errors.append(f"Exclusion filter column '{exclude_col}' not found in data")
else:
exclude_values = EXCLUSION_FILTERS.get('exclude_values', [])
if exclude_values:
available_values = df[exclude_col].unique()
invalid_values = [v for v in exclude_values if v not in available_values]
if invalid_values:
warnings.append(f"Exclusion filter values not found in data: {invalid_values}")
# 9. Validate optional columns (warnings only)
optional_columns = {
'Customer': CUSTOMER_COLUMN,
'Item': ITEM_COLUMN,
'Quantity': QUANTITY_COLUMN
}
for col_type, col_name in optional_columns.items():
if col_name and col_name not in df.columns:
warnings.append(f"Optional {col_type} column '{col_name}' not found. Some analyses may not work.")
# 10. Validate data file exists
data_path = get_data_path()
if not data_path.exists():
errors.append(f"Data file not found: {data_path}")
return errors, warnings
def print_validation_report(errors, warnings):
"""
Print a formatted validation report
Args:
errors: List of error messages
warnings: List of warning messages
"""
print("\n" + "="*60)
print("Configuration Validation Report")
print("="*60)
if errors:
print(f"\n❌ ERRORS ({len(errors)}):")
for i, error in enumerate(errors, 1):
print(f" {i}. {error}")
else:
print("\n✅ No configuration errors found")
if warnings:
print(f"\n⚠️ WARNINGS ({len(warnings)}):")
for i, warning in enumerate(warnings, 1):
print(f" {i}. {warning}")
else:
print("\n✅ No warnings")
print("\n" + "="*60)
if errors:
return False
return True
def validate_and_report(df=None):
"""
Validate configuration and print report
Args:
df: Optional DataFrame to validate against
Returns:
bool: True if no errors, False otherwise
"""
errors, warnings = validate_config(df)
return print_validation_report(errors, warnings)
# ============================================================================
# STANDALONE VALIDATION SCRIPT
# ============================================================================
if __name__ == "__main__":
"""Run configuration validation"""
print("Validating configuration...")
is_valid = validate_and_report()
if is_valid:
print("\n✅ Configuration is valid!")
exit(0)
else:
print("\n❌ Configuration has errors. Please fix them before running analyses.")
exit(1)

224
data_loader.py Normal file
View File

@@ -0,0 +1,224 @@
"""
Generic data loading utility with flexible date handling
Handles various date column formats and fallback logic
This loader is designed to work with different CSV structures by:
1. Trying primary date column first
2. Falling back to alternative date columns if needed
3. Ensuring 100% date coverage
"""
import pandas as pd
import numpy as np
from pathlib import Path
from config import (
REVENUE_COLUMN, DATE_COLUMN, DATE_FALLBACK_COLUMNS,
get_data_path
)
def load_sales_data(filepath=None):
"""
Load sales data with flexible date handling
This function provides intelligent data loading with fallback logic:
1. Loads the CSV file
2. Converts revenue column to numeric
3. Attempts to parse dates using primary date column
4. Falls back to alternative date columns if needed (100% coverage)
5. Creates Year and YearMonth columns for analysis
CRITICAL: Always use this function instead of pd.read_csv() directly.
This ensures proper date parsing with fallback logic.
Args:
filepath: Path to the CSV file (defaults to config.DATA_FILE).
Can be str, Path, or None (uses config.get_data_path())
Returns:
pd.DataFrame: DataFrame with properly parsed dates and revenue.
Includes 'Year' and 'YearMonth' columns.
Raises:
FileNotFoundError: If data file doesn't exist.
Error message includes file path and suggests checking config.py
ValueError: If required columns (REVENUE_COLUMN) are missing.
Error message lists available columns and suggests updating config.py
Example:
>>> from data_loader import load_sales_data
>>> from config import get_data_path
>>> df = load_sales_data(get_data_path())
>>> print(f"Loaded {len(df):,} rows with {df['Year'].notna().sum():,} with dates")
See Also:
- .cursor/rules/data_loading.md for detailed patterns
- config.py for column name configuration
"""
# Get data file path
if filepath is None:
filepath = get_data_path()
else:
filepath = Path(filepath)
# Check if file exists
if not filepath.exists():
raise FileNotFoundError(
f"Data file not found: {filepath}\n"
f"Please update config.py with the correct DATA_FILE path."
)
# Load CSV
print(f"Loading data from: {filepath}")
df = pd.read_csv(filepath, low_memory=False)
print(f"Loaded {len(df):,} rows")
# Validate required columns
if REVENUE_COLUMN not in df.columns:
raise ValueError(
f"Required column '{REVENUE_COLUMN}' not found in data.\n"
f"Available columns: {list(df.columns)}\n"
f"Please update config.py REVENUE_COLUMN to match your data."
)
# Convert revenue column to numeric
df[REVENUE_COLUMN] = pd.to_numeric(df[REVENUE_COLUMN], errors='coerce')
# Count missing revenue values
missing_revenue = df[REVENUE_COLUMN].isna().sum()
if missing_revenue > 0:
print(f"Warning: {missing_revenue:,} rows have missing/invalid revenue values")
# Create working date column
df['WorkingDate'] = pd.NaT
# Try primary date column first
if DATE_COLUMN in df.columns:
print(f"Attempting to parse {DATE_COLUMN}...")
df['Date_Parsed'] = pd.to_datetime(df[DATE_COLUMN], errors='coerce', format='mixed')
parsed_count = df['Date_Parsed'].notna().sum()
df.loc[df['Date_Parsed'].notna(), 'WorkingDate'] = df.loc[df['Date_Parsed'].notna(), 'Date_Parsed']
print(f" Parsed {parsed_count:,} dates from {DATE_COLUMN}")
else:
print(f"Warning: Primary date column '{DATE_COLUMN}' not found")
# Use fallback date columns
if DATE_FALLBACK_COLUMNS:
for fallback_col in DATE_FALLBACK_COLUMNS:
if fallback_col in df.columns:
missing_dates = df['WorkingDate'].isna()
if missing_dates.sum() > 0:
print(f"Using fallback column: {fallback_col}...")
fallback_parsed = pd.to_datetime(
df.loc[missing_dates, fallback_col],
errors='coerce',
format='mixed'
)
newly_parsed = missing_dates & fallback_parsed.notna()
if newly_parsed.sum() > 0:
df.loc[newly_parsed, 'WorkingDate'] = fallback_parsed[newly_parsed]
print(f" Parsed {newly_parsed.sum():,} additional dates from {fallback_col}")
# Final fallback: try to construct from Year column if available
if 'Year' in df.columns and df['WorkingDate'].isna().sum() > 0:
missing_dates = df['WorkingDate'].isna()
year_values = pd.to_numeric(df.loc[missing_dates, 'Year'], errors='coerce')
valid_years = missing_dates & year_values.notna()
if valid_years.sum() > 0:
print(f"Using Year column for remaining {valid_years.sum():,} rows...")
df.loc[valid_years, 'WorkingDate'] = pd.to_datetime(
df.loc[valid_years, 'Year'].astype(int).astype(str) + '-01-01',
errors='coerce'
)
# Set WorkingDate as the primary date column
df[DATE_COLUMN] = df['WorkingDate']
# Clean up temporary columns
df = df.drop(columns=['Date_Parsed', 'WorkingDate'], errors='ignore')
# Extract Year from date column
df['Year'] = df[DATE_COLUMN].dt.year
# Fill missing Year from Year column if it exists and date is missing
if 'Year' in df.columns:
year_orig = pd.to_numeric(df['Year'], errors='coerce')
missing_year = df['Year'].isna()
if missing_year.sum() > 0 and 'Year' in df.columns:
year_fallback = pd.to_numeric(df.loc[missing_year, 'Year'], errors='coerce')
df.loc[missing_year & year_fallback.notna(), 'Year'] = year_fallback[missing_year & year_fallback.notna()]
# Create YearMonth for monthly analysis
if DATE_COLUMN in df.columns:
df['YearMonth'] = df[DATE_COLUMN].dt.to_period('M')
# Report date coverage
total_rows = len(df)
date_coverage = df[DATE_COLUMN].notna().sum()
coverage_pct = (date_coverage / total_rows * 100) if total_rows > 0 else 0
print(f"Date coverage: {date_coverage:,} / {total_rows:,} rows ({coverage_pct:.1f}%)")
if coverage_pct < 100:
print(f"Warning: {total_rows - date_coverage:,} rows have missing dates")
# Report date range
if df[DATE_COLUMN].notna().any():
min_date = df[DATE_COLUMN].min()
max_date = df[DATE_COLUMN].max()
print(f"Date range: {min_date.strftime('%Y-%m-%d')} to {max_date.strftime('%Y-%m-%d')}")
return df
def validate_data_structure(df: pd.DataFrame) -> tuple[bool, str]:
"""
Validate that loaded data has expected structure.
Checks for required columns, data quality, and basic validity.
Returns actionable error messages if validation fails.
Args:
df: DataFrame to validate (should be result of load_sales_data())
Returns:
tuple[bool, str]: (is_valid, error_message)
- is_valid: True if data structure is valid, False otherwise
- error_message: "OK" if valid, otherwise descriptive error message
Example:
>>> df = load_sales_data(get_data_path())
>>> is_valid, msg = validate_data_structure(df)
>>> if not is_valid:
... print(f"ERROR: {msg}")
See Also:
- load_sales_data() - Load data before validating
- config_validator.py - Comprehensive configuration validation
"""
from config import REVENUE_COLUMN, DATE_COLUMN
errors = []
# Check required columns
if REVENUE_COLUMN not in df.columns:
errors.append(f"Missing required column: {REVENUE_COLUMN}")
if DATE_COLUMN not in df.columns:
errors.append(f"Missing required column: {DATE_COLUMN}")
# Check data quality
if len(df) == 0:
errors.append("DataFrame is empty")
if REVENUE_COLUMN in df.columns:
if df[REVENUE_COLUMN].isna().all():
errors.append(f"All {REVENUE_COLUMN} values are NaN")
if df[REVENUE_COLUMN].notna().sum() == 0:
errors.append(f"No valid {REVENUE_COLUMN} values")
if DATE_COLUMN in df.columns:
if df[DATE_COLUMN].isna().all():
errors.append(f"All {DATE_COLUMN} values are NaN")
if errors:
return False, "; ".join(errors)
return True, "OK"

285
data_processing.py Normal file
View File

@@ -0,0 +1,285 @@
"""
Data processing utilities
Common data cleaning and transformation helpers
Usage:
from data_processing import clean_data, create_pivot_table, prepare_time_series
# Clean data
df_clean = clean_data(df)
# Create pivot table
pivot = create_pivot_table(df, index='Year', columns='Product', values='Revenue')
"""
import pandas as pd
import numpy as np
from config import REVENUE_COLUMN, DATE_COLUMN, MIN_QUANTITY, MAX_QUANTITY
def clean_data(df, remove_duplicates=True, handle_missing_dates=True):
"""
Clean data with common operations
Args:
df: DataFrame to clean
remove_duplicates: Whether to remove duplicate rows
handle_missing_dates: Whether to handle missing dates
Returns:
DataFrame: Cleaned DataFrame
"""
df_clean = df.copy()
# Remove duplicates
if remove_duplicates:
initial_count = len(df_clean)
df_clean = df_clean.drop_duplicates()
removed = initial_count - len(df_clean)
if removed > 0:
print(f"Removed {removed:,} duplicate rows")
# Handle missing dates
if handle_missing_dates and DATE_COLUMN in df_clean.columns:
missing_dates = df_clean[DATE_COLUMN].isna().sum()
if missing_dates > 0:
print(f"Warning: {missing_dates:,} rows have missing dates")
# Remove rows with negative revenue (if configured)
if REVENUE_COLUMN in df_clean.columns:
negative_revenue = (df_clean[REVENUE_COLUMN] < 0).sum()
if negative_revenue > 0:
print(f"Found {negative_revenue:,} rows with negative revenue")
# Optionally remove: df_clean = df_clean[df_clean[REVENUE_COLUMN] >= 0]
return df_clean
def create_pivot_table(df, index, columns=None, values=None, aggfunc='sum', fill_value=0):
"""
Create pivot table with common defaults
Args:
df: DataFrame
index: Column(s) to use as index
columns: Column(s) to use as columns
values: Column(s) to aggregate
aggfunc: Aggregation function (default: 'sum')
fill_value: Value to fill missing cells (default: 0)
Returns:
DataFrame: Pivot table
"""
if values is None and REVENUE_COLUMN in df.columns:
values = REVENUE_COLUMN
pivot = pd.pivot_table(
df,
index=index,
columns=columns,
values=values,
aggfunc=aggfunc,
fill_value=fill_value
)
return pivot
def prepare_time_series(df, date_column=None, value_column=None, freq='M'):
"""
Prepare time series data
Args:
df: DataFrame
date_column: Date column name (defaults to config.DATE_COLUMN)
value_column: Value column to aggregate (defaults to config.REVENUE_COLUMN)
freq: Frequency for resampling ('D', 'W', 'M', 'Q', 'Y')
Returns:
Series: Time series data
"""
if date_column is None:
date_column = DATE_COLUMN
if value_column is None:
value_column = REVENUE_COLUMN
if date_column not in df.columns:
raise ValueError(f"Date column '{date_column}' not found")
if value_column not in df.columns:
raise ValueError(f"Value column '{value_column}' not found")
# Ensure date column is datetime
df = df.copy()
df[date_column] = pd.to_datetime(df[date_column], errors='coerce')
# Set date as index
df_indexed = df.set_index(date_column)
# Resample and aggregate
time_series = df_indexed[value_column].resample(freq).sum()
return time_series
def aggregate_by_period(df, period='year', date_column=None, value_column=None):
"""
Aggregate data by time period
Args:
df: DataFrame
period: Period type ('year', 'month', 'quarter')
date_column: Date column name
value_column: Value column to aggregate
Returns:
DataFrame: Aggregated data
"""
if date_column is None:
date_column = DATE_COLUMN
if value_column is None:
value_column = REVENUE_COLUMN
df = df.copy()
df[date_column] = pd.to_datetime(df[date_column], errors='coerce')
# Extract period
if period == 'year':
df['Period'] = df[date_column].dt.year
elif period == 'month':
df['Period'] = df[date_column].dt.to_period('M')
elif period == 'quarter':
df['Period'] = df[date_column].dt.to_period('Q')
else:
raise ValueError(f"Unknown period: {period}")
# Aggregate
aggregated = df.groupby('Period')[value_column].agg(['sum', 'count', 'mean']).reset_index()
aggregated.columns = ['Period', 'Total', 'Count', 'Average']
return aggregated
def filter_outliers(df, column, method='iqr', lower_bound=None, upper_bound=None):
"""
Filter outliers from DataFrame
Args:
df: DataFrame
column: Column name to filter on
method: Method ('iqr' for interquartile range, 'zscore' for z-score)
lower_bound: Manual lower bound
upper_bound: Manual upper bound
Returns:
DataFrame: Filtered DataFrame
"""
df_filtered = df.copy()
if method == 'iqr':
q1 = df[column].quantile(0.25)
q3 = df[column].quantile(0.75)
iqr = q3 - q1
lower = lower_bound if lower_bound is not None else q1 - 1.5 * iqr
upper = upper_bound if upper_bound is not None else q3 + 1.5 * iqr
elif method == 'zscore':
mean = df[column].mean()
std = df[column].std()
lower = lower_bound if lower_bound is not None else mean - 3 * std
upper = upper_bound if upper_bound is not None else mean + 3 * std
else:
raise ValueError(f"Unknown method: {method}")
initial_count = len(df_filtered)
df_filtered = df_filtered[(df_filtered[column] >= lower) & (df_filtered[column] <= upper)]
removed = initial_count - len(df_filtered)
if removed > 0:
print(f"Removed {removed:,} outliers from {column} ({removed/initial_count*100:.1f}%)")
return df_filtered
def normalize_column(df, column, method='min_max'):
"""
Normalize a column
Args:
df: DataFrame
column: Column name to normalize
method: Normalization method ('min_max', 'zscore')
Returns:
Series: Normalized values
"""
if method == 'min_max':
min_val = df[column].min()
max_val = df[column].max()
if max_val - min_val == 0:
return pd.Series([0] * len(df), index=df.index)
return (df[column] - min_val) / (max_val - min_val)
elif method == 'zscore':
mean = df[column].mean()
std = df[column].std()
if std == 0:
return pd.Series([0] * len(df), index=df.index)
return (df[column] - mean) / std
else:
raise ValueError(f"Unknown method: {method}")
def create_derived_columns(df):
"""
Create common derived columns
Args:
df: DataFrame
Returns:
DataFrame: DataFrame with derived columns
"""
df_derived = df.copy()
# Extract year, month, quarter if date column exists
if DATE_COLUMN in df_derived.columns:
df_derived[DATE_COLUMN] = pd.to_datetime(df_derived[DATE_COLUMN], errors='coerce')
if 'Year' not in df_derived.columns:
df_derived['Year'] = df_derived[DATE_COLUMN].dt.year
if 'Month' not in df_derived.columns:
df_derived['Month'] = df_derived[DATE_COLUMN].dt.month
if 'Quarter' not in df_derived.columns:
df_derived['Quarter'] = df_derived[DATE_COLUMN].dt.quarter
if 'YearMonth' not in df_derived.columns:
df_derived['YearMonth'] = df_derived[DATE_COLUMN].dt.to_period('M')
# Calculate price per unit if quantity and revenue exist
from config import QUANTITY_COLUMN
if QUANTITY_COLUMN in df_derived.columns and REVENUE_COLUMN in df_derived.columns:
df_derived['Price_Per_Unit'] = df_derived[REVENUE_COLUMN] / df_derived[QUANTITY_COLUMN].replace(0, np.nan)
return df_derived
# ============================================================================
# EXAMPLE USAGE
# ============================================================================
if __name__ == "__main__":
"""Example usage"""
# Create sample data
df = pd.DataFrame({
'InvoiceDate': pd.date_range('2023-01-01', periods=100, freq='D'),
'USD': np.random.normal(1000, 200, 100),
'Quantity': np.random.randint(1, 100, 100)
})
# Clean data
df_clean = clean_data(df)
print(f"Cleaned data: {len(df_clean)} rows")
# Create pivot table
df_clean['Year'] = df_clean['InvoiceDate'].dt.year
pivot = create_pivot_table(df_clean, index='Year', values='USD')
print("\nPivot table:")
print(pivot)
# Prepare time series
ts = prepare_time_series(df_clean, freq='M')
print(f"\nTime series: {len(ts)} periods")

344
data_quality.py Normal file
View File

@@ -0,0 +1,344 @@
"""
Data quality reporting utility
Generates comprehensive data quality reports
Usage:
from data_quality import generate_data_quality_report, print_data_quality_report
# Generate and print report
report = generate_data_quality_report(df)
print_data_quality_report(report)
"""
import pandas as pd
import numpy as np
from config import (
REVENUE_COLUMN, DATE_COLUMN, CUSTOMER_COLUMN, ITEM_COLUMN,
QUANTITY_COLUMN, MIN_QUANTITY, MAX_QUANTITY
)
def generate_data_quality_report(df):
"""
Generate comprehensive data quality report
Args:
df: DataFrame to analyze
Returns:
dict: Dictionary containing data quality metrics
"""
report = {
'overview': {},
'missing_values': {},
'duplicates': {},
'outliers': {},
'data_types': {},
'date_coverage': {},
'revenue_summary': {},
'issues': []
}
# Overview
report['overview'] = {
'total_rows': len(df),
'total_columns': len(df.columns),
'memory_usage_mb': df.memory_usage(deep=True).sum() / 1024**2
}
# Missing values
missing = df.isnull().sum()
missing_pct = (missing / len(df)) * 100
report['missing_values'] = {
'by_column': missing[missing > 0].to_dict(),
'percentages': missing_pct[missing > 0].to_dict(),
'total_missing': missing.sum(),
'columns_with_missing': len(missing[missing > 0])
}
# Duplicates
duplicate_rows = df.duplicated().sum()
report['duplicates'] = {
'duplicate_rows': int(duplicate_rows),
'duplicate_percentage': (duplicate_rows / len(df)) * 100 if len(df) > 0 else 0
}
# Outliers (revenue and quantity)
outliers = {}
if REVENUE_COLUMN in df.columns:
revenue = pd.to_numeric(df[REVENUE_COLUMN], errors='coerce')
q1 = revenue.quantile(0.25)
q3 = revenue.quantile(0.75)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
revenue_outliers = ((revenue < lower_bound) | (revenue > upper_bound)).sum()
outliers['revenue'] = {
'count': int(revenue_outliers),
'percentage': (revenue_outliers / len(df)) * 100 if len(df) > 0 else 0,
'lower_bound': float(lower_bound),
'upper_bound': float(upper_bound),
'negative_values': int((revenue < 0).sum())
}
if QUANTITY_COLUMN in df.columns:
quantity = pd.to_numeric(df[QUANTITY_COLUMN], errors='coerce')
# Use config thresholds if available
if MIN_QUANTITY is not None and MAX_QUANTITY is not None:
quantity_outliers = ((quantity < MIN_QUANTITY) | (quantity > MAX_QUANTITY)).sum()
outliers['quantity'] = {
'count': int(quantity_outliers),
'percentage': (quantity_outliers / len(df)) * 100 if len(df) > 0 else 0,
'below_min': int((quantity < MIN_QUANTITY).sum()),
'above_max': int((quantity > MAX_QUANTITY).sum())
}
else:
q1 = quantity.quantile(0.25)
q3 = quantity.quantile(0.75)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
quantity_outliers = ((quantity < lower_bound) | (quantity > upper_bound)).sum()
outliers['quantity'] = {
'count': int(quantity_outliers),
'percentage': (quantity_outliers / len(df)) * 100 if len(df) > 0 else 0,
'lower_bound': float(lower_bound),
'upper_bound': float(upper_bound)
}
report['outliers'] = outliers
# Data types
report['data_types'] = {
'numeric_columns': list(df.select_dtypes(include=[np.number]).columns),
'datetime_columns': list(df.select_dtypes(include=['datetime64']).columns),
'object_columns': list(df.select_dtypes(include=['object']).columns),
'type_summary': df.dtypes.value_counts().to_dict()
}
# Date coverage
if DATE_COLUMN in df.columns:
date_coverage = df[DATE_COLUMN].notna().sum()
report['date_coverage'] = {
'total_rows': len(df),
'rows_with_dates': int(date_coverage),
'coverage_percentage': (date_coverage / len(df)) * 100 if len(df) > 0 else 0,
'min_date': str(df[DATE_COLUMN].min()) if date_coverage > 0 else None,
'max_date': str(df[DATE_COLUMN].max()) if date_coverage > 0 else None
}
# Revenue summary
if REVENUE_COLUMN in df.columns:
revenue = pd.to_numeric(df[REVENUE_COLUMN], errors='coerce')
valid_revenue = revenue.dropna()
if len(valid_revenue) > 0:
report['revenue_summary'] = {
'total_revenue': float(valid_revenue.sum()),
'mean_revenue': float(valid_revenue.mean()),
'median_revenue': float(valid_revenue.median()),
'min_revenue': float(valid_revenue.min()),
'max_revenue': float(valid_revenue.max()),
'std_revenue': float(valid_revenue.std()),
'valid_rows': int(len(valid_revenue)),
'invalid_rows': int(len(df) - len(valid_revenue))
}
# Identify issues
issues = []
# Critical issues
if report['missing_values']['columns_with_missing'] > 0:
high_missing = {k: v for k, v in report['missing_values']['percentages'].items() if v > 50}
if high_missing:
issues.append({
'severity': 'critical',
'issue': f"Columns with >50% missing values: {list(high_missing.keys())}",
'impact': 'High'
})
if DATE_COLUMN in df.columns:
if report['date_coverage']['coverage_percentage'] < 50:
issues.append({
'severity': 'critical',
'issue': f"Date coverage is only {report['date_coverage']['coverage_percentage']:.1f}%",
'impact': 'High - analyses may fail'
})
if REVENUE_COLUMN in df.columns:
if report['revenue_summary'].get('invalid_rows', 0) > len(df) * 0.1:
issues.append({
'severity': 'critical',
'issue': f"{report['revenue_summary']['invalid_rows']} rows have invalid revenue values",
'impact': 'High'
})
# Warnings
if report['duplicates']['duplicate_percentage'] > 5:
issues.append({
'severity': 'warning',
'issue': f"{report['duplicates']['duplicate_rows']} duplicate rows ({report['duplicates']['duplicate_percentage']:.1f}%)",
'impact': 'Medium'
})
if 'revenue' in outliers:
if outliers['revenue']['percentage'] > 10:
issues.append({
'severity': 'warning',
'issue': f"{outliers['revenue']['count']} revenue outliers ({outliers['revenue']['percentage']:.1f}%)",
'impact': 'Medium'
})
report['issues'] = issues
return report
def print_data_quality_report(report):
"""
Print formatted data quality report
Args:
report: Dictionary from generate_data_quality_report()
"""
print("\n" + "="*70)
print("DATA QUALITY REPORT")
print("="*70)
# Overview
print("\n📊 OVERVIEW")
print("-" * 70)
print(f"Total Rows: {report['overview']['total_rows']:,}")
print(f"Total Columns: {report['overview']['total_columns']}")
print(f"Memory Usage: {report['overview']['memory_usage_mb']:.2f} MB")
# Missing values
print("\n🔍 MISSING VALUES")
print("-" * 70)
if report['missing_values']['columns_with_missing'] > 0:
print(f"Columns with missing values: {report['missing_values']['columns_with_missing']}")
print(f"Total missing values: {report['missing_values']['total_missing']:,}")
print("\nTop columns by missing values:")
missing_sorted = sorted(
report['missing_values']['percentages'].items(),
key=lambda x: x[1],
reverse=True
)[:10]
for col, pct in missing_sorted:
count = report['missing_values']['by_column'][col]
print(f" {col:30s}: {count:8,} ({pct:5.1f}%)")
else:
print("✅ No missing values found")
# Duplicates
print("\n🔄 DUPLICATES")
print("-" * 70)
if report['duplicates']['duplicate_rows'] > 0:
print(f"Duplicate Rows: {report['duplicates']['duplicate_rows']:,} ({report['duplicates']['duplicate_percentage']:.2f}%)")
else:
print("✅ No duplicate rows found")
# Outliers
print("\n📈 OUTLIERS")
print("-" * 70)
if 'revenue' in report['outliers']:
rev_out = report['outliers']['revenue']
print(f"Revenue Outliers: {rev_out['count']:,} ({rev_out['percentage']:.2f}%)")
if 'negative_values' in rev_out and rev_out['negative_values'] > 0:
print(f" Negative Revenue Values: {rev_out['negative_values']:,}")
if 'quantity' in report['outliers']:
qty_out = report['outliers']['quantity']
print(f"Quantity Outliers: {qty_out['count']:,} ({qty_out['percentage']:.2f}%)")
if not report['outliers']:
print("✅ No significant outliers detected")
# Date coverage
if report['date_coverage']:
print("\n📅 DATE COVERAGE")
print("-" * 70)
dc = report['date_coverage']
print(f"Rows with Dates: {dc['rows_with_dates']:,} / {dc['total_rows']:,} ({dc['coverage_percentage']:.1f}%)")
if dc['min_date']:
print(f"Date Range: {dc['min_date']} to {dc['max_date']}")
# Revenue summary
if report['revenue_summary']:
print("\n💰 REVENUE SUMMARY")
print("-" * 70)
rs = report['revenue_summary']
print(f"Total Revenue: ${rs['total_revenue'] / 1e6:.2f}m")
print(f"Valid Rows: {rs['valid_rows']:,} / {rs['valid_rows'] + rs['invalid_rows']:,}")
if rs['invalid_rows'] > 0:
print(f"Invalid Rows: {rs['invalid_rows']:,}")
print(f"Mean: ${rs['mean_revenue']:,.2f}")
print(f"Median: ${rs['median_revenue']:,.2f}")
print(f"Min: ${rs['min_revenue']:,.2f}")
print(f"Max: ${rs['max_revenue']:,.2f}")
# Issues
if report['issues']:
print("\n⚠️ ISSUES DETECTED")
print("-" * 70)
critical = [i for i in report['issues'] if i['severity'] == 'critical']
warnings = [i for i in report['issues'] if i['severity'] == 'warning']
if critical:
print("❌ CRITICAL ISSUES:")
for issue in critical:
print(f"{issue['issue']}")
print(f" Impact: {issue['impact']}")
if warnings:
print("\n⚠️ WARNINGS:")
for issue in warnings:
print(f"{issue['issue']}")
print(f" Impact: {issue['impact']}")
else:
print("\n✅ NO ISSUES DETECTED")
print("\n" + "="*70)
def generate_data_quality_report_simple(df):
"""
Generate a simple data quality summary (quick check)
Args:
df: DataFrame to analyze
Returns:
str: Simple summary string
"""
summary_parts = []
summary_parts.append(f"Rows: {len(df):,}")
summary_parts.append(f"Columns: {len(df.columns)}")
if REVENUE_COLUMN in df.columns:
revenue = pd.to_numeric(df[REVENUE_COLUMN], errors='coerce')
valid = revenue.notna().sum()
summary_parts.append(f"Valid Revenue: {valid:,} ({valid/len(df)*100:.1f}%)")
if DATE_COLUMN in df.columns:
date_coverage = df[DATE_COLUMN].notna().sum()
summary_parts.append(f"Date Coverage: {date_coverage:,} ({date_coverage/len(df)*100:.1f}%)")
return " | ".join(summary_parts)
# ============================================================================
# STANDALONE DATA QUALITY CHECK
# ============================================================================
if __name__ == "__main__":
"""Run data quality check"""
from data_loader import load_sales_data
from config import get_data_path
print("Loading data for quality check...")
try:
df = load_sales_data(get_data_path())
report = generate_data_quality_report(df)
print_data_quality_report(report)
except Exception as e:
print(f"ERROR: {e}")

View File

@@ -0,0 +1,134 @@
"""
Example: Annual Revenue Trend Analysis
Simple example showing annual revenue with LTM support
This is a working example that demonstrates:
- Loading data using data_loader
- Calculating annual metrics with LTM
- Creating a revenue trend chart
- Following template best practices
"""
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
# Import utilities
from data_loader import load_sales_data, validate_data_structure
from validate_revenue import validate_revenue
from analysis_utils import (
get_ltm_period_config, calculate_annual_metrics,
setup_revenue_chart, save_chart,
format_currency, print_annual_summary, sort_mixed_years,
apply_exclusion_filters
)
from config import (
OUTPUT_DIR, ANALYSIS_YEARS, MAX_DATE,
CHART_SIZES, ensure_directories, get_data_path, COMPANY_NAME,
REVENUE_COLUMN, MIN_YEAR, DATE_COLUMN
)
# ============================================================================
# CONFIGURATION
# ============================================================================
ANALYSIS_NAME = "Annual Revenue Trend"
DESCRIPTION = "Simple annual revenue trend analysis with LTM support"
# ============================================================================
# MAIN ANALYSIS FUNCTION
# ============================================================================
def main():
"""Main analysis function"""
print(f"\n{'='*60}")
print(f"{ANALYSIS_NAME}")
print(f"{'='*60}\n")
# 1. Load data
print("Loading data...")
try:
df = load_sales_data(get_data_path())
print(f"Loaded {len(df):,} transactions")
except Exception as e:
print(f"ERROR loading data: {e}")
return
# 2. Validate data structure
is_valid, msg = validate_data_structure(df)
if not is_valid:
print(f"ERROR: {msg}")
return
print("Data validation passed")
# 3. Apply exclusion filters (if configured)
df = apply_exclusion_filters(df)
# 4. Filter by date range
df = df[df['Year'] >= MIN_YEAR]
if DATE_COLUMN in df.columns:
df = df[df[DATE_COLUMN] <= MAX_DATE]
# 5. Setup LTM period (if enabled)
ltm_start, ltm_end = get_ltm_period_config()
if ltm_start and ltm_end:
print(f"LTM period: {ltm_start} to {ltm_end}")
# 6. Calculate annual metrics
print("\nCalculating annual metrics...")
def calculate_metrics(year_data):
"""Calculate metrics for a single year"""
return {
'Revenue': year_data[REVENUE_COLUMN].sum(),
}
annual_df = calculate_annual_metrics(df, calculate_metrics, ltm_start, ltm_end)
# 7. Print summary
print_annual_summary(annual_df, 'Revenue', 'Revenue')
# 8. Create visualization
print("Generating chart...")
ensure_directories()
# Annual revenue trend chart
fig, ax = plt.subplots(figsize=CHART_SIZES['medium'])
# Prepare data for plotting (handle mixed types)
annual_df_sorted = sort_mixed_years(annual_df.reset_index(), 'Year')
years = annual_df_sorted['Year'].tolist()
revenue = annual_df_sorted['Revenue'].values / 1e6 # Convert to millions
# Create chart
ax.plot(range(len(years)), revenue, marker='o', linewidth=2, markersize=8, color='#2E86AB')
ax.set_xticks(range(len(years)))
ax.set_xticklabels(years, rotation=45, ha='right')
setup_revenue_chart(ax)
# Add LTM notation to title if applicable
title = f'Annual Revenue Trend - {COMPANY_NAME}'
if ltm_start and ltm_end:
from config import get_ltm_label
ltm_label = get_ltm_label()
if ltm_label:
title += f'\n({ltm_label})'
ax.set_title(title, fontsize=14, fontweight='bold')
plt.tight_layout()
save_chart(fig, 'annual_revenue_trend.png')
plt.close()
# 9. Validate revenue
print("\nValidating revenue...")
validate_revenue(df, ANALYSIS_NAME)
print(f"\n{ANALYSIS_NAME} complete!")
print(f"Chart saved to: {OUTPUT_DIR}")
# ============================================================================
# RUN ANALYSIS
# ============================================================================
if __name__ == "__main__":
main()

218
examples/cohort_analysis.py Normal file
View File

@@ -0,0 +1,218 @@
"""
Example: Cohort Analysis
Advanced example showing customer cohort retention analysis
This demonstrates:
- Cohort-based analysis
- Retention rate calculations
- Revenue retention metrics
- Advanced visualization
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from operator import attrgetter
# Import utilities
from data_loader import load_sales_data, validate_data_structure
from validate_revenue import validate_revenue
from analysis_utils import (
get_ltm_period_config, apply_exclusion_filters,
setup_revenue_chart, save_chart, format_currency
)
from config import (
OUTPUT_DIR, MAX_DATE, CHART_SIZES, ensure_directories,
get_data_path, COMPANY_NAME, REVENUE_COLUMN, CUSTOMER_COLUMN,
DATE_COLUMN, MIN_YEAR
)
# ============================================================================
# CONFIGURATION
# ============================================================================
ANALYSIS_NAME = "Cohort Analysis"
DESCRIPTION = "Customer cohort retention and revenue analysis"
# ============================================================================
# COHORT ANALYSIS FUNCTIONS
# ============================================================================
def create_cohorts(df):
"""
Create customer cohorts based on first purchase date
Args:
df: DataFrame with customer and date columns
Returns:
DataFrame: Original DataFrame with 'Cohort' and 'CohortPeriod' columns
"""
from config import CUSTOMER_COLUMN, DATE_COLUMN
# Get first purchase date for each customer
first_purchase = df.groupby(CUSTOMER_COLUMN)[DATE_COLUMN].min().reset_index()
first_purchase.columns = [CUSTOMER_COLUMN, 'FirstPurchaseDate']
# Extract cohort year-month
first_purchase['Cohort'] = first_purchase['FirstPurchaseDate'].dt.to_period('M')
# Merge back to original data
df_with_cohort = df.merge(first_purchase[[CUSTOMER_COLUMN, 'Cohort']], on=CUSTOMER_COLUMN)
# Calculate period number (months since first purchase)
df_with_cohort['Period'] = df_with_cohort[DATE_COLUMN].dt.to_period('M')
df_with_cohort['CohortPeriod'] = (df_with_cohort['Period'] - df_with_cohort['Cohort']).apply(attrgetter('n'))
return df_with_cohort
def calculate_cohort_metrics(df_with_cohort):
"""
Calculate cohort retention metrics
Args:
df_with_cohort: DataFrame with Cohort and CohortPeriod columns
Returns:
DataFrame: Cohort metrics by period
"""
from config import REVENUE_COLUMN, CUSTOMER_COLUMN
# Customer count by cohort and period
cohort_size = df_with_cohort.groupby('Cohort')[CUSTOMER_COLUMN].nunique()
# Revenue by cohort and period
cohort_revenue = df_with_cohort.groupby(['Cohort', 'CohortPeriod']).agg({
CUSTOMER_COLUMN: 'nunique',
REVENUE_COLUMN: 'sum'
}).reset_index()
cohort_revenue.columns = ['Cohort', 'Period', 'Customers', 'Revenue']
# Calculate retention rates
cohort_retention = []
for cohort in cohort_revenue['Cohort'].unique():
cohort_data = cohort_revenue[cohort_revenue['Cohort'] == cohort].copy()
initial_customers = cohort_data[cohort_data['Period'] == 0]['Customers'].values[0]
cohort_data['Retention_Rate'] = (cohort_data['Customers'] / initial_customers) * 100
cohort_data['Revenue_Retention'] = cohort_data['Revenue'] / cohort_data[cohort_data['Period'] == 0]['Revenue'].values[0] * 100
cohort_retention.append(cohort_data)
return pd.concat(cohort_retention, ignore_index=True)
# ============================================================================
# MAIN ANALYSIS FUNCTION
# ============================================================================
def main():
"""Main analysis function"""
print(f"\n{'='*60}")
print(f"{ANALYSIS_NAME}")
print(f"{'='*60}\n")
# 1. Load data
print("Loading data...")
try:
df = load_sales_data(get_data_path())
print(f"Loaded {len(df):,} transactions")
except Exception as e:
print(f"ERROR loading data: {e}")
return
# 2. Validate
is_valid, msg = validate_data_structure(df)
if not is_valid:
print(f"ERROR: {msg}")
return
if CUSTOMER_COLUMN not in df.columns:
print(f"ERROR: Customer column '{CUSTOMER_COLUMN}' not found")
return
# 3. Apply filters
df = apply_exclusion_filters(df)
df = df[df['Year'] >= MIN_YEAR]
if DATE_COLUMN in df.columns:
df = df[df[DATE_COLUMN] <= MAX_DATE]
# 4. Create cohorts
print("\nCreating customer cohorts...")
df_cohort = create_cohorts(df)
# 5. Calculate cohort metrics
print("Calculating cohort metrics...")
cohort_metrics = calculate_cohort_metrics(df_cohort)
# 6. Print summary
print("\nCohort Summary:")
print("-" * 60)
for cohort in sorted(cohort_metrics['Cohort'].unique())[:5]: # Show top 5 cohorts
cohort_data = cohort_metrics[cohort_metrics['Cohort'] == cohort]
period_0 = cohort_data[cohort_data['Period'] == 0]
if len(period_0) > 0:
initial_customers = period_0['Customers'].values[0]
initial_revenue = period_0['Revenue'].values[0]
print(f"\n{cohort}:")
print(f" Initial: {initial_customers:,} customers, {format_currency(initial_revenue)}")
# Show retention at period 12
period_12 = cohort_data[cohort_data['Period'] == 12]
if len(period_12) > 0:
retention = period_12['Retention_Rate'].values[0]
revenue_ret = period_12['Revenue_Retention'].values[0]
print(f" Period 12: {retention:.1f}% customer retention, {revenue_ret:.1f}% revenue retention")
# 7. Create visualizations
print("\nGenerating charts...")
ensure_directories()
# Heatmap: Customer retention
pivot_retention = cohort_metrics.pivot_table(
index='Cohort',
columns='Period',
values='Retention_Rate',
aggfunc='mean'
)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=CHART_SIZES['wide'])
# Retention heatmap
sns.heatmap(pivot_retention, annot=True, fmt='.0f', cmap='YlOrRd', ax=ax1, cbar_kws={'label': 'Retention %'})
ax1.set_title('Customer Retention by Cohort\n(Period 0 = 100%)', fontsize=12, fontweight='bold')
ax1.set_xlabel('Months Since First Purchase')
ax1.set_ylabel('Cohort')
# Revenue retention heatmap
pivot_revenue = cohort_metrics.pivot_table(
index='Cohort',
columns='Period',
values='Revenue_Retention',
aggfunc='mean'
)
sns.heatmap(pivot_revenue, annot=True, fmt='.0f', cmap='YlGnBu', ax=ax2, cbar_kws={'label': 'Revenue Retention %'})
ax2.set_title('Revenue Retention by Cohort\n(Period 0 = 100%)', fontsize=12, fontweight='bold')
ax2.set_xlabel('Months Since First Purchase')
ax2.set_ylabel('Cohort')
plt.suptitle(f'Cohort Analysis - {COMPANY_NAME}', fontsize=14, fontweight='bold', y=1.02)
plt.tight_layout()
save_chart(fig, 'cohort_analysis.png')
plt.close()
# 8. Validate
print("\nValidating revenue...")
validate_revenue(df, ANALYSIS_NAME)
print(f"\n{ANALYSIS_NAME} complete!")
print(f"Charts saved to: {OUTPUT_DIR}")
# ============================================================================
# RUN ANALYSIS
# ============================================================================
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,213 @@
"""
Example: Customer Segmentation (RFM) Analysis
Example showing customer segmentation using RFM methodology
This example demonstrates:
- Customer-level aggregation
- RFM segmentation (Recency, Frequency, Monetary)
- Segment analysis and visualization
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
# Import utilities
from data_loader import load_sales_data, validate_data_structure
from validate_revenue import validate_revenue
from analysis_utils import (
get_ltm_period_config, apply_exclusion_filters,
setup_revenue_chart, save_chart, format_currency
)
from config import (
OUTPUT_DIR, MAX_DATE, CHART_SIZES, ensure_directories,
get_data_path, COMPANY_NAME, REVENUE_COLUMN, CUSTOMER_COLUMN,
DATE_COLUMN, MIN_YEAR
)
# ============================================================================
# CONFIGURATION
# ============================================================================
ANALYSIS_NAME = "Customer Segmentation (RFM)"
DESCRIPTION = "Customer segmentation using RFM methodology"
# ============================================================================
# RFM SEGMENTATION FUNCTIONS
# ============================================================================
def calculate_rfm_scores(df, analysis_date=None):
"""
Calculate RFM scores for each customer
Args:
df: DataFrame with customer, date, and revenue columns
analysis_date: Reference date for recency calculation (defaults to max date)
Returns:
DataFrame with RFM scores and segment assignment
"""
if analysis_date is None:
analysis_date = df[DATE_COLUMN].max()
# Calculate customer-level metrics
customer_metrics = df.groupby(CUSTOMER_COLUMN).agg({
DATE_COLUMN: ['max', 'count'],
REVENUE_COLUMN: 'sum'
}).reset_index()
customer_metrics.columns = [CUSTOMER_COLUMN, 'LastPurchaseDate', 'Frequency', 'Monetary']
# Calculate Recency (days since last purchase)
customer_metrics['Recency'] = (analysis_date - customer_metrics['LastPurchaseDate']).dt.days
# Score each dimension (1-5 scale, 5 = best)
customer_metrics['R_Score'] = pd.qcut(
customer_metrics['Recency'].rank(method='first'),
q=5, labels=[5, 4, 3, 2, 1], duplicates='drop'
).astype(int)
customer_metrics['F_Score'] = pd.qcut(
customer_metrics['Frequency'].rank(method='first'),
q=5, labels=[1, 2, 3, 4, 5], duplicates='drop'
).astype(int)
customer_metrics['M_Score'] = pd.qcut(
customer_metrics['Monetary'].rank(method='first'),
q=5, labels=[1, 2, 3, 4, 5], duplicates='drop'
).astype(int)
# Calculate RFM score (sum of R, F, M)
customer_metrics['RFM_Score'] = (
customer_metrics['R_Score'] +
customer_metrics['F_Score'] +
customer_metrics['M_Score']
)
# Assign segments
def assign_segment(row):
r, f, m = row['R_Score'], row['F_Score'], row['M_Score']
if r >= 4 and f >= 4 and m >= 4:
return 'Champions'
elif r >= 3 and f >= 3 and m >= 4:
return 'Loyal Customers'
elif r >= 4 and f <= 2:
return 'At Risk'
elif r <= 2:
return 'Hibernating'
elif r >= 3 and f >= 3 and m <= 2:
return 'Potential Loyalists'
else:
return 'Need Attention'
customer_metrics['Segment'] = customer_metrics.apply(assign_segment, axis=1)
return customer_metrics
# ============================================================================
# MAIN ANALYSIS FUNCTION
# ============================================================================
def main():
"""Main analysis function"""
print(f"\n{'='*60}")
print(f"{ANALYSIS_NAME}")
print(f"{'='*60}\n")
# 1. Load data
print("Loading data...")
try:
df = load_sales_data(get_data_path())
print(f"Loaded {len(df):,} transactions")
except Exception as e:
print(f"ERROR loading data: {e}")
return
# 2. Validate data structure
is_valid, msg = validate_data_structure(df)
if not is_valid:
print(f"ERROR: {msg}")
return
if CUSTOMER_COLUMN not in df.columns:
print(f"ERROR: Customer column '{CUSTOMER_COLUMN}' not found in data")
return
print("Data validation passed")
# 3. Apply exclusion filters
df = apply_exclusion_filters(df)
# 4. Filter by date range
df = df[df['Year'] >= MIN_YEAR]
if DATE_COLUMN in df.columns:
df = df[df[DATE_COLUMN] <= MAX_DATE]
# 5. Calculate RFM scores
print("\nCalculating RFM scores...")
rfm_df = calculate_rfm_scores(df)
# 6. Segment summary
print("\nCustomer Segmentation Summary:")
print("-" * 60)
segment_summary = rfm_df.groupby('Segment').agg({
CUSTOMER_COLUMN: 'count',
'Monetary': 'sum'
}).reset_index()
segment_summary.columns = ['Segment', 'Customer Count', 'Total Revenue']
segment_summary = segment_summary.sort_values('Total Revenue', ascending=False)
for _, row in segment_summary.iterrows():
pct_customers = (row['Customer Count'] / len(rfm_df)) * 100
pct_revenue = (row['Total Revenue'] / rfm_df['Monetary'].sum()) * 100
print(f"{row['Segment']:20s}: {row['Customer Count']:5d} customers ({pct_customers:5.1f}%), "
f"{format_currency(row['Total Revenue'])} ({pct_revenue:5.1f}% of revenue)")
# 7. Create visualizations
print("\nGenerating charts...")
ensure_directories()
# Chart 1: Revenue by Segment
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=CHART_SIZES['wide'])
segment_summary_sorted = segment_summary.sort_values('Total Revenue', ascending=True)
revenue_millions = segment_summary_sorted['Total Revenue'].values / 1e6
ax1.barh(range(len(segment_summary_sorted)), revenue_millions, color='#2E86AB')
ax1.set_yticks(range(len(segment_summary_sorted)))
ax1.set_yticklabels(segment_summary_sorted['Segment'].values)
ax1.set_xlabel('Revenue (Millions USD)')
ax1.set_title('Revenue by Customer Segment', fontsize=12, fontweight='bold')
setup_revenue_chart(ax1)
ax1.set_ylabel('')
# Chart 2: Customer Count by Segment
customer_counts = segment_summary_sorted['Customer Count'].values
ax2.barh(range(len(segment_summary_sorted)), customer_counts, color='#A23B72')
ax2.set_yticks(range(len(segment_summary_sorted)))
ax2.set_yticklabels(segment_summary_sorted['Segment'].values)
ax2.set_xlabel('Number of Customers')
ax2.set_title('Customer Count by Segment', fontsize=12, fontweight='bold')
ax2.set_ylabel('')
ax2.grid(True, alpha=0.3)
plt.suptitle(f'Customer Segmentation Analysis - {COMPANY_NAME}',
fontsize=14, fontweight='bold', y=1.02)
plt.tight_layout()
save_chart(fig, 'customer_segmentation.png')
plt.close()
# 8. Validate revenue
print("\nValidating revenue...")
validate_revenue(df, ANALYSIS_NAME)
print(f"\n{ANALYSIS_NAME} complete!")
print(f"Charts saved to: {OUTPUT_DIR}")
# ============================================================================
# RUN ANALYSIS
# ============================================================================
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,203 @@
"""
Example: Product Performance Analysis
Example showing product mix and performance analysis
This example demonstrates:
- Product-level aggregation
- Product performance metrics
- Product mix visualization
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
# Import utilities
from data_loader import load_sales_data, validate_data_structure
from validate_revenue import validate_revenue
from analysis_utils import (
get_ltm_period_config, calculate_annual_metrics,
apply_exclusion_filters, setup_revenue_chart, save_chart,
format_currency, sort_mixed_years
)
from config import (
OUTPUT_DIR, MAX_DATE, CHART_SIZES, ensure_directories,
get_data_path, COMPANY_NAME, REVENUE_COLUMN, ITEM_COLUMN,
DATE_COLUMN, MIN_YEAR, QUANTITY_COLUMN
)
# ============================================================================
# CONFIGURATION
# ============================================================================
ANALYSIS_NAME = "Product Performance Analysis"
DESCRIPTION = "Product mix and performance analysis"
# ============================================================================
# MAIN ANALYSIS FUNCTION
# ============================================================================
def main():
"""Main analysis function"""
print(f"\n{'='*60}")
print(f"{ANALYSIS_NAME}")
print(f"{'='*60}\n")
# 1. Load data
print("Loading data...")
try:
df = load_sales_data(get_data_path())
print(f"Loaded {len(df):,} transactions")
except Exception as e:
print(f"ERROR loading data: {e}")
return
# 2. Validate data structure
is_valid, msg = validate_data_structure(df)
if not is_valid:
print(f"ERROR: {msg}")
return
if ITEM_COLUMN not in df.columns:
print(f"WARNING: Item column '{ITEM_COLUMN}' not found. Using transaction-level analysis.")
# Create a dummy item column for demonstration
df[ITEM_COLUMN] = 'All Products'
print("Data validation passed")
# 3. Apply exclusion filters
df = apply_exclusion_filters(df)
# 4. Filter by date range
df = df[df['Year'] >= MIN_YEAR]
if DATE_COLUMN in df.columns:
df = df[df[DATE_COLUMN] <= MAX_DATE]
# 5. Setup LTM period
ltm_start, ltm_end = get_ltm_period_config()
# 6. Product performance summary
print("\nCalculating product performance...")
# Get most recent period data
if ltm_start and ltm_end and 'YearMonth' in df.columns:
recent_data = df[(df['YearMonth'] >= ltm_start) & (df['YearMonth'] <= ltm_end)]
period_label = f"LTM {ltm_end}"
else:
recent_year = df['Year'].max()
recent_data = df[df['Year'] == recent_year]
period_label = str(recent_year)
# Product-level metrics
product_metrics = recent_data.groupby(ITEM_COLUMN).agg({
REVENUE_COLUMN: ['sum', 'count'],
QUANTITY_COLUMN: 'sum' if QUANTITY_COLUMN in df.columns else 'count'
}).reset_index()
product_metrics.columns = [ITEM_COLUMN, 'Revenue', 'Transaction_Count', 'Quantity']
# Calculate average price per unit if quantity available
if QUANTITY_COLUMN in df.columns:
product_metrics['Avg_Price'] = product_metrics['Revenue'] / product_metrics['Quantity'].replace(0, np.nan)
else:
product_metrics['Avg_Price'] = product_metrics['Revenue'] / product_metrics['Transaction_Count']
# Sort by revenue
product_metrics = product_metrics.sort_values('Revenue', ascending=False)
# Top products summary
print(f"\nTop 10 Products by Revenue ({period_label}):")
print("-" * 80)
top_10 = product_metrics.head(10)
total_revenue = product_metrics['Revenue'].sum()
for idx, row in top_10.iterrows():
pct = (row['Revenue'] / total_revenue) * 100
print(f"{row[ITEM_COLUMN]:30s}: {format_currency(row['Revenue']):>12s} ({pct:5.1f}%)")
# 7. Annual product trends (if multiple years available)
if len(df['Year'].unique()) > 1:
print("\nCalculating annual product trends...")
def calculate_product_metrics(year_data):
"""Calculate product metrics for a year"""
product_revenue = year_data.groupby(ITEM_COLUMN)[REVENUE_COLUMN].sum()
# Get top 5 products
top_5 = product_revenue.nlargest(5)
return dict(top_5)
annual_product_df = calculate_annual_metrics(df, calculate_product_metrics, ltm_start, ltm_end)
# 8. Create visualizations
print("\nGenerating charts...")
ensure_directories()
# Chart 1: Top Products Revenue (Bar Chart)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=CHART_SIZES['wide'])
top_10_revenue = top_10['Revenue'].values / 1e6
top_10_names = top_10[ITEM_COLUMN].values
ax1.barh(range(len(top_10)), top_10_revenue, color='#2E86AB')
ax1.set_yticks(range(len(top_10)))
ax1.set_yticklabels([name[:30] + '...' if len(name) > 30 else name for name in top_10_names])
ax1.set_xlabel('Revenue (Millions USD)')
ax1.set_title(f'Top 10 Products by Revenue\n({period_label})', fontsize=12, fontweight='bold')
setup_revenue_chart(ax1)
ax1.set_ylabel('')
# Chart 2: Revenue Distribution (Pie Chart for top 10)
if len(product_metrics) > 10:
other_revenue = product_metrics.iloc[10:]['Revenue'].sum()
pie_data = list(top_10['Revenue'].values) + [other_revenue]
pie_labels = list(top_10[ITEM_COLUMN].values) + ['Other']
else:
pie_data = product_metrics['Revenue'].values
pie_labels = product_metrics[ITEM_COLUMN].values
pie_data_millions = [x / 1e6 for x in pie_data]
ax2.pie(pie_data_millions, labels=pie_labels, autopct='%1.1f%%', startangle=90)
ax2.set_title('Revenue Distribution\n(Top Products)', fontsize=12, fontweight='bold')
plt.suptitle(f'Product Performance Analysis - {COMPANY_NAME}',
fontsize=14, fontweight='bold', y=1.02)
plt.tight_layout()
save_chart(fig, 'product_performance.png')
plt.close()
else:
# Single chart if only one year
print("\nGenerating chart...")
ensure_directories()
fig, ax = plt.subplots(figsize=CHART_SIZES['medium'])
top_10_revenue = top_10['Revenue'].values / 1e6
top_10_names = top_10[ITEM_COLUMN].values
ax.barh(range(len(top_10)), top_10_revenue, color='#2E86AB')
ax.set_yticks(range(len(top_10)))
ax.set_yticklabels([name[:40] + '...' if len(name) > 40 else name for name in top_10_names])
ax.set_xlabel('Revenue (Millions USD)')
ax.set_title(f'Top 10 Products by Revenue - {COMPANY_NAME}\n({period_label})',
fontsize=14, fontweight='bold')
setup_revenue_chart(ax)
ax.set_ylabel('')
plt.tight_layout()
save_chart(fig, 'product_performance.png')
plt.close()
# 9. Validate revenue
print("\nValidating revenue...")
validate_revenue(df, ANALYSIS_NAME)
print(f"\n{ANALYSIS_NAME} complete!")
print(f"Charts saved to: {OUTPUT_DIR}")
# ============================================================================
# RUN ANALYSIS
# ============================================================================
if __name__ == "__main__":
main()

238
export_utils.py Normal file
View File

@@ -0,0 +1,238 @@
"""
Export utilities for analysis results
Provides functions to export DataFrames and summary data to CSV and Excel
Usage:
from export_utils import export_to_csv, export_to_excel, export_summary_table
# Export DataFrame to CSV
export_to_csv(df, 'results.csv')
# Export DataFrame to Excel
export_to_excel(df, 'results.xlsx', sheet_name='Data')
# Export summary table
export_summary_table({'Metric1': 100, 'Metric2': 200}, 'summary.xlsx')
"""
import pandas as pd
from pathlib import Path
from config import REPORTS_DIR, ensure_directories
def export_to_csv(df, filename, output_dir=None, index=True):
"""
Export DataFrame to CSV with proper formatting
Args:
df: DataFrame to export
filename: Output filename (e.g., 'results.csv')
output_dir: Output directory (defaults to config.REPORTS_DIR)
index: Whether to include index in export (default: True)
Returns:
Path to exported file
"""
if output_dir is None:
output_dir = REPORTS_DIR
else:
output_dir = Path(output_dir)
ensure_directories()
output_dir.mkdir(exist_ok=True)
filepath = output_dir / filename
df.to_csv(filepath, index=index, encoding='utf-8-sig')
print(f"Exported to CSV: {filepath}")
return filepath
def export_to_excel(df, filename, sheet_name='Data', output_dir=None, index=True):
"""
Export DataFrame to Excel with formatting
Args:
df: DataFrame to export
filename: Output filename (e.g., 'results.xlsx')
sheet_name: Excel sheet name (default: 'Data')
output_dir: Output directory (defaults to config.REPORTS_DIR)
index: Whether to include index in export (default: True)
Returns:
Path to exported file
Raises:
ImportError: If openpyxl is not installed
"""
try:
import openpyxl
except ImportError:
raise ImportError(
"openpyxl is required for Excel export. Install with: pip install openpyxl"
)
if output_dir is None:
output_dir = REPORTS_DIR
else:
output_dir = Path(output_dir)
ensure_directories()
output_dir.mkdir(exist_ok=True)
filepath = output_dir / filename
# Create Excel writer
with pd.ExcelWriter(filepath, engine='openpyxl') as writer:
df.to_excel(writer, sheet_name=sheet_name, index=index)
# Auto-adjust column widths
worksheet = writer.sheets[sheet_name]
for idx, col in enumerate(df.columns, 1):
max_length = max(
df[col].astype(str).map(len).max(),
len(str(col))
)
# Cap at 50 characters for readability
adjusted_width = min(max_length + 2, 50)
worksheet.column_dimensions[chr(64 + idx)].width = adjusted_width
print(f"Exported to Excel: {filepath}")
return filepath
def export_summary_table(data_dict, filename, output_dir=None, title=None):
"""
Export summary statistics to formatted table (Excel)
Args:
data_dict: Dictionary of {metric_name: value} pairs
filename: Output filename (e.g., 'summary.xlsx')
output_dir: Output directory (defaults to config.REPORTS_DIR)
title: Optional title for the summary table
Returns:
Path to exported file
Example:
export_summary_table({
'Total Revenue': 1000000,
'Customer Count': 500,
'Average Order Value': 2000
}, 'summary.xlsx')
"""
try:
import openpyxl
except ImportError:
raise ImportError(
"openpyxl is required for Excel export. Install with: pip install openpyxl"
)
if output_dir is None:
output_dir = REPORTS_DIR
else:
output_dir = Path(output_dir)
ensure_directories()
output_dir.mkdir(exist_ok=True)
filepath = output_dir / filename
# Create DataFrame from dictionary
df = pd.DataFrame({
'Metric': list(data_dict.keys()),
'Value': list(data_dict.values())
})
# Format numeric values
def format_value(val):
if isinstance(val, (int, float)):
if abs(val) >= 1e6:
return f"${val / 1e6:.2f}m"
elif abs(val) >= 1e3:
return f"${val / 1e3:.2f}k"
else:
return f"${val:.2f}"
return str(val)
df['Formatted_Value'] = df['Value'].apply(format_value)
# Create Excel writer
with pd.ExcelWriter(filepath, engine='openpyxl') as writer:
df.to_excel(writer, sheet_name='Summary', index=False)
# Format worksheet
worksheet = writer.sheets['Summary']
# Set column widths
worksheet.column_dimensions['A'].width = 30
worksheet.column_dimensions['B'].width = 20
worksheet.column_dimensions['C'].width = 20
# Add title if provided
if title:
worksheet.insert_rows(1)
worksheet.merge_cells('A1:C1')
worksheet['A1'] = title
worksheet['A1'].font = openpyxl.styles.Font(bold=True, size=14)
worksheet['A1'].alignment = openpyxl.styles.Alignment(horizontal='center')
print(f"Exported summary table to Excel: {filepath}")
return filepath
def export_multiple_sheets(data_dict, filename, output_dir=None):
"""
Export multiple DataFrames to Excel with multiple sheets
Args:
data_dict: Dictionary of {sheet_name: DataFrame} pairs
filename: Output filename (e.g., 'results.xlsx')
output_dir: Output directory (defaults to config.REPORTS_DIR)
Returns:
Path to exported file
Example:
export_multiple_sheets({
'Revenue': revenue_df,
'Customers': customer_df,
'Products': product_df
}, 'analysis_results.xlsx')
"""
try:
import openpyxl
except ImportError:
raise ImportError(
"openpyxl is required for Excel export. Install with: pip install openpyxl"
)
if output_dir is None:
output_dir = REPORTS_DIR
else:
output_dir = Path(output_dir)
ensure_directories()
output_dir.mkdir(exist_ok=True)
filepath = output_dir / filename
# Create Excel writer
with pd.ExcelWriter(filepath, engine='openpyxl') as writer:
for sheet_name, df in data_dict.items():
# Truncate sheet name to 31 characters (Excel limit)
safe_sheet_name = sheet_name[:31]
df.to_excel(writer, sheet_name=safe_sheet_name, index=True)
# Auto-adjust column widths
worksheet = writer.sheets[safe_sheet_name]
for idx, col in enumerate(df.columns, 1):
max_length = max(
df[col].astype(str).map(len).max(),
len(str(col))
)
adjusted_width = min(max_length + 2, 50)
col_letter = openpyxl.utils.get_column_letter(idx)
worksheet.column_dimensions[col_letter].width = adjusted_width
print(f"Exported {len(data_dict)} sheets to Excel: {filepath}")
return filepath

184
generate_sample_data.py Normal file
View File

@@ -0,0 +1,184 @@
"""
Sample data generator for testing and demonstrations
Generates realistic sample sales data
Usage:
python generate_sample_data.py
# Or import and use programmatically:
from generate_sample_data import generate_sample_sales_data
df = generate_sample_sales_data(num_customers=100, num_products=50, years=[2021, 2022, 2023])
"""
import pandas as pd
import numpy as np
from pathlib import Path
from datetime import datetime, timedelta
import random
def generate_sample_sales_data(
num_customers=100,
num_products=50,
years=[2021, 2022, 2023, 2024, 2025],
transactions_per_month=500,
output_file='sample_sales_data.csv'
):
"""
Generate realistic sample sales data
Args:
num_customers: Number of unique customers
num_products: Number of unique products
years: List of years to generate data for
transactions_per_month: Average transactions per month
output_file: Output CSV filename
Returns:
DataFrame: Generated sales data
"""
print(f"Generating sample sales data...")
print(f" Customers: {num_customers}")
print(f" Products: {num_products}")
print(f" Years: {years}")
# Generate customer names
customer_names = [f"Customer_{i:04d}" for i in range(1, num_customers + 1)]
# Generate product names
product_names = [f"Product_{i:04d}" for i in range(1, num_products + 1)]
# Generate transactions
transactions = []
for year in years:
for month in range(1, 13):
# Skip future months
current_date = datetime.now()
if year > current_date.year or (year == current_date.year and month > current_date.month):
continue
# Generate transactions for this month
num_transactions = int(np.random.normal(transactions_per_month, transactions_per_month * 0.2))
num_transactions = max(10, num_transactions) # At least 10 transactions
for _ in range(num_transactions):
# Random date within month
if month == 2:
max_day = 28
elif month in [4, 6, 9, 11]:
max_day = 30
else:
max_day = 31
day = random.randint(1, max_day)
invoice_date = datetime(year, month, day)
# Random customer and product
customer = random.choice(customer_names)
product = random.choice(product_names)
# Generate quantity (most transactions are small)
quantity = int(np.random.lognormal(mean=1.5, sigma=1.0))
quantity = max(1, min(quantity, 100)) # Cap at 100
# Generate revenue (with some correlation to quantity)
base_price = np.random.lognormal(mean=5, sigma=1.5)
revenue = base_price * quantity
# Add some variation
revenue *= np.random.uniform(0.8, 1.2)
revenue = round(revenue, 2)
transactions.append({
'InvoiceDate': invoice_date,
'Customer': customer,
'Item': product,
'Quantity': quantity,
'USD': revenue,
'Year': year,
'Month': month
})
# Create DataFrame
df = pd.DataFrame(transactions)
# Sort by date
df = df.sort_values('InvoiceDate').reset_index(drop=True)
# Add some missing dates (realistic data quality issue)
missing_date_pct = 0.05 # 5% missing dates
num_missing = int(len(df) * missing_date_pct)
missing_indices = np.random.choice(df.index, size=num_missing, replace=False)
df.loc[missing_indices, 'InvoiceDate'] = pd.NaT
# Save to CSV
output_path = Path(output_file)
df.to_csv(output_path, index=False)
print(f"\n✅ Sample data generated: {output_path}")
print(f" Rows: {len(df):,}")
print(f" Date range: {df['InvoiceDate'].min()} to {df['InvoiceDate'].max()}")
print(f" Total revenue: ${df['USD'].sum() / 1e6:.2f}m")
return df
def generate_sample_data_for_template():
"""
Generate sample data matching template's expected structure
Uses config.py column names
"""
from config import (
REVENUE_COLUMN, DATE_COLUMN, CUSTOMER_COLUMN, ITEM_COLUMN,
QUANTITY_COLUMN, ANALYSIS_YEARS
)
print("Generating sample data for template...")
df = generate_sample_sales_data(
num_customers=200,
num_products=100,
years=ANALYSIS_YEARS,
transactions_per_month=1000,
output_file='sample_sales_data.csv'
)
# Rename columns to match config (if different)
column_mapping = {
'USD': REVENUE_COLUMN,
'InvoiceDate': DATE_COLUMN,
'Customer': CUSTOMER_COLUMN,
'Item': ITEM_COLUMN,
'Quantity': QUANTITY_COLUMN
}
# Only rename if different
for old_name, new_name in column_mapping.items():
if old_name in df.columns and old_name != new_name:
df = df.rename(columns={old_name: new_name})
# Save
output_path = Path('sample_sales_data.csv')
df.to_csv(output_path, index=False)
print(f"\n✅ Sample data saved to: {output_path}")
print(f" Ready to use with sales_analysis_template")
return df
# ============================================================================
# MAIN
# ============================================================================
if __name__ == "__main__":
"""Generate sample data"""
import sys
if len(sys.argv) > 1:
# Custom generation
num_customers = int(sys.argv[1]) if len(sys.argv) > 1 else 100
num_products = int(sys.argv[2]) if len(sys.argv) > 2 else 50
generate_sample_sales_data(
num_customers=num_customers,
num_products=num_products
)
else:
# Generate for template
generate_sample_data_for_template()

197
logger_config.py Normal file
View File

@@ -0,0 +1,197 @@
"""
Logging configuration for analysis scripts
Provides structured logging with file and console output
Usage:
from logger_config import get_logger
logger = get_logger('my_analysis')
logger.info("Analysis started")
logger.warning("Low data quality detected")
logger.error("Failed to load data")
"""
import logging
import sys
from pathlib import Path
from datetime import datetime
from config import COMPANY_NAME, OUTPUT_DIR
# Global logger instance
_logger = None
def setup_logging(log_level=logging.INFO, log_file=None, analysis_name=None):
"""
Setup logging configuration
Args:
log_level: Logging level (DEBUG, INFO, WARNING, ERROR)
log_file: Path to log file (defaults to logs/analysis_YYYYMMDD_HHMMSS.log)
analysis_name: Name of analysis for log file naming
Returns:
logging.Logger: Configured logger instance
"""
global _logger
# Create logs directory
logs_dir = Path('logs')
logs_dir.mkdir(exist_ok=True)
# Default log file name
if log_file is None:
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
if analysis_name:
safe_name = analysis_name.lower().replace(' ', '_').replace('/', '_')
log_file = logs_dir / f"{safe_name}_{timestamp}.log"
else:
log_file = logs_dir / f"analysis_{timestamp}.log"
else:
log_file = Path(log_file)
log_file.parent.mkdir(parents=True, exist_ok=True)
# Create logger
logger = logging.getLogger(analysis_name or 'analysis')
logger.setLevel(log_level)
# Remove existing handlers to avoid duplicates
logger.handlers = []
# Create formatters
detailed_formatter = logging.Formatter(
'%(asctime)s - %(name)s - %(levelname)s - %(message)s',
datefmt='%Y-%m-%d %H:%M:%S'
)
console_formatter = logging.Formatter(
'%(levelname)s - %(message)s'
)
# File handler (detailed)
file_handler = logging.FileHandler(log_file, encoding='utf-8')
file_handler.setLevel(log_level)
file_handler.setFormatter(detailed_formatter)
logger.addHandler(file_handler)
# Console handler (simpler)
console_handler = logging.StreamHandler(sys.stdout)
console_handler.setLevel(log_level)
console_handler.setFormatter(console_formatter)
logger.addHandler(console_handler)
# Log startup message
logger.info(f"="*60)
logger.info(f"Analysis: {analysis_name or 'Unknown'}")
logger.info(f"Company: {COMPANY_NAME}")
logger.info(f"Log File: {log_file}")
logger.info(f"="*60)
_logger = logger
return logger
def get_logger(analysis_name=None, log_level=logging.INFO):
"""
Get or create logger instance
Args:
analysis_name: Name of analysis
log_level: Logging level (default: INFO)
Returns:
logging.Logger: Logger instance
"""
global _logger
if _logger is None:
_logger = setup_logging(log_level=log_level, analysis_name=analysis_name)
return _logger
def log_analysis_start(analysis_name, logger=None):
"""
Log analysis start
Args:
analysis_name: Name of analysis
logger: Logger instance (creates one if None)
"""
if logger is None:
logger = get_logger(analysis_name)
logger.info(f"Starting analysis: {analysis_name}")
logger.info(f"Timestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
def log_analysis_end(analysis_name, success=True, logger=None):
"""
Log analysis completion
Args:
analysis_name: Name of analysis
success: Whether analysis completed successfully
logger: Logger instance (creates one if None)
"""
if logger is None:
logger = get_logger(analysis_name)
if success:
logger.info(f"Analysis completed successfully: {analysis_name}")
else:
logger.error(f"Analysis failed: {analysis_name}")
logger.info(f"Timestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
logger.info("="*60)
def log_data_loading(df, logger=None):
"""
Log data loading summary
Args:
df: Loaded DataFrame
logger: Logger instance (creates one if None)
"""
if logger is None:
logger = get_logger()
logger.info(f"Data loaded: {len(df):,} rows, {len(df.columns)} columns")
from config import REVENUE_COLUMN, DATE_COLUMN
if REVENUE_COLUMN in df.columns:
revenue = df[REVENUE_COLUMN].sum()
logger.info(f"Total revenue: ${revenue / 1e6:.2f}m")
if DATE_COLUMN in df.columns:
date_coverage = df[DATE_COLUMN].notna().sum() / len(df) * 100
logger.info(f"Date coverage: {date_coverage:.1f}%")
def log_error(error, logger=None, context=None):
"""
Log error with context
Args:
error: Exception or error message
logger: Logger instance (creates one if None)
context: Additional context string
"""
if logger is None:
logger = get_logger()
error_msg = str(error)
if context:
error_msg = f"{context}: {error_msg}"
logger.error(error_msg, exc_info=True)
# ============================================================================
# EXAMPLE USAGE
# ============================================================================
if __name__ == "__main__":
"""Example usage"""
logger = setup_logging(log_level=logging.DEBUG, analysis_name="Example Analysis")
logger.debug("This is a debug message")
logger.info("This is an info message")
logger.warning("This is a warning message")
logger.error("This is an error message")
log_analysis_start("Example Analysis", logger)
log_analysis_end("Example Analysis", success=True, logger)

228
report_generator.py Normal file
View File

@@ -0,0 +1,228 @@
"""
Report generation utility
Combines multiple charts and data into a PDF report
Usage:
from report_generator import generate_pdf_report
# Generate PDF report
generate_pdf_report(
charts=['chart1.png', 'chart2.png'],
title='Sales Analysis Report',
summary_data={'Total Revenue': 1000000}
)
"""
from pathlib import Path
from datetime import datetime
from config import COMPANY_NAME, OUTPUT_DIR, REPORTS_DIR, ensure_directories
def generate_pdf_report(
charts,
title=None,
summary_data=None,
output_filename=None,
output_dir=None
):
"""
Generate PDF report from charts and summary data
Args:
charts: List of chart file paths (PNG files)
title: Report title (defaults to company name + date)
summary_data: Dictionary of summary metrics
output_filename: Output PDF filename (defaults to report_YYYYMMDD_HHMMSS.pdf)
output_dir: Output directory (defaults to config.REPORTS_DIR)
Returns:
Path: Path to generated PDF file
Raises:
ImportError: If reportlab is not installed
"""
try:
from reportlab.lib.pagesizes import letter, A4
from reportlab.lib.units import inch
from reportlab.lib import colors
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Image, Table, TableStyle, PageBreak
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.enums import TA_CENTER, TA_LEFT
except ImportError:
raise ImportError(
"reportlab is required for PDF generation. Install with: pip install reportlab"
)
if output_dir is None:
output_dir = REPORTS_DIR
else:
output_dir = Path(output_dir)
ensure_directories()
output_dir.mkdir(exist_ok=True)
# Default filename
if output_filename is None:
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
output_filename = f"report_{timestamp}.pdf"
output_path = output_dir / output_filename
# Create PDF document
doc = SimpleDocTemplate(
str(output_path),
pagesize=letter,
rightMargin=0.75*inch,
leftMargin=0.75*inch,
topMargin=0.75*inch,
bottomMargin=0.75*inch
)
# Container for PDF elements
story = []
# Styles
styles = getSampleStyleSheet()
title_style = ParagraphStyle(
'CustomTitle',
parent=styles['Heading1'],
fontSize=20,
textColor=colors.HexColor('#2E86AB'),
spaceAfter=30,
alignment=TA_CENTER
)
heading_style = ParagraphStyle(
'CustomHeading',
parent=styles['Heading2'],
fontSize=14,
textColor=colors.HexColor('#2E86AB'),
spaceAfter=12
)
# Title
if title is None:
title = f"{COMPANY_NAME} Sales Analysis Report"
story.append(Paragraph(title, title_style))
story.append(Spacer(1, 0.2*inch))
# Report metadata
metadata_text = f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
story.append(Paragraph(metadata_text, styles['Normal']))
story.append(Spacer(1, 0.3*inch))
# Summary data table
if summary_data:
story.append(Paragraph("Summary", heading_style))
# Create table
table_data = [['Metric', 'Value']]
for key, value in summary_data.items():
# Format value
if isinstance(value, (int, float)):
if abs(value) >= 1e6:
formatted_value = f"${value / 1e6:.2f}m"
elif abs(value) >= 1e3:
formatted_value = f"${value / 1e3:.2f}k"
else:
formatted_value = f"${value:.2f}"
else:
formatted_value = str(value)
table_data.append([key, formatted_value])
table = Table(table_data, colWidths=[3*inch, 2*inch])
table.setStyle(TableStyle([
('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#2E86AB')),
('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
('ALIGN', (0, 0), (-1, -1), 'LEFT'),
('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
('FONTSIZE', (0, 0), (-1, 0), 12),
('BOTTOMPADDING', (0, 0), (-1, 0), 12),
('BACKGROUND', (0, 1), (-1, -1), colors.beige),
('GRID', (0, 0), (-1, -1), 1, colors.black),
('ROWBACKGROUNDS', (0, 1), (-1, -1), [colors.white, colors.lightgrey])
]))
story.append(table)
story.append(Spacer(1, 0.3*inch))
# Add charts
if charts:
story.append(Paragraph("Charts", heading_style))
for i, chart_path in enumerate(charts, 1):
chart_path = Path(chart_path)
if not chart_path.exists():
print(f"Warning: Chart not found: {chart_path}")
continue
# Add chart title
chart_title = f"Chart {i}: {chart_path.stem.replace('_', ' ').title()}"
story.append(Paragraph(chart_title, styles['Heading3']))
story.append(Spacer(1, 0.1*inch))
# Add image
try:
img = Image(str(chart_path), width=6*inch, height=4*inch)
story.append(img)
except Exception as e:
error_msg = f"Error loading chart: {e}"
story.append(Paragraph(error_msg, styles['Normal']))
# Add page break between charts (except last one)
if i < len(charts):
story.append(PageBreak())
# Build PDF
doc.build(story)
print(f"PDF report generated: {output_path}")
return output_path
def generate_simple_report(charts, title=None, output_filename=None):
"""
Generate a simple PDF report (wrapper with defaults)
Args:
charts: List of chart file paths
title: Report title
output_filename: Output filename
Returns:
Path: Path to generated PDF
"""
return generate_pdf_report(
charts=charts,
title=title,
output_filename=output_filename
)
# ============================================================================
# EXAMPLE USAGE
# ============================================================================
if __name__ == "__main__":
"""Example usage"""
from config import OUTPUT_DIR
# Find charts in output directory
chart_files = list(OUTPUT_DIR.glob('*.png'))
if chart_files:
print(f"Found {len(chart_files)} charts")
# Generate report
report_path = generate_pdf_report(
charts=[str(f) for f in chart_files[:5]], # Limit to 5 charts
title="Sales Analysis Report",
summary_data={
'Total Charts': len(chart_files),
'Report Date': datetime.now().strftime('%Y-%m-%d')
}
)
print(f"Report saved to: {report_path}")
else:
print("No charts found in output directory")

30
requirements.txt Normal file
View File

@@ -0,0 +1,30 @@
# Python dependencies for Sales Analysis Template
# Install with: pip install -r requirements.txt
# Core data analysis
pandas>=2.0.0
numpy>=1.24.0
# Visualization
matplotlib>=3.7.0
seaborn>=0.12.0
# Export utilities (optional - uncomment if needed)
# openpyxl>=3.1.0 # For Excel export (export_utils.py)
# Interactive visualizations (optional - uncomment if needed)
# plotly>=5.17.0 # For interactive charts (analysis_utils.py)
# Report generation (optional - uncomment if needed)
# reportlab>=4.0.0 # For PDF reports (report_generator.py)
# Statistical analysis (optional - uncomment if needed)
# scipy>=1.10.0 # For statistical analysis, product lifecycle (statistical_utils.py)
# Testing (optional - uncomment if needed)
# pytest>=7.4.0 # For unit tests
# Advanced analysis (optional - uncomment if needed)
# pmdarima>=2.0.0 # For time series forecasting
# mlxtend>=0.22.0 # For market basket analysis
# scikit-learn>=1.3.0 # For machine learning analyses

185
run_all_analyses.py Normal file
View File

@@ -0,0 +1,185 @@
"""
Batch runner for all analysis scripts
Runs all analyses in sequence and generates a summary report
To use:
1. Add your analysis scripts to the ANALYSIS_SCRIPTS list below
2. Run: python run_all_analyses.py
"""
import subprocess
import sys
from pathlib import Path
from datetime import datetime
import time
# ============================================================================
# CONFIGURATION
# ============================================================================
# List of analysis scripts to run
# TODO: Add your analysis scripts here
ANALYSIS_SCRIPTS = [
# Example structure - customize for your analyses:
# 'check_annual_revenue.py',
# 'revenue_analysis.py',
# 'geographic_analysis.py',
# 'customer_segmentation.py',
# 'product_analysis.py',
# Add your analysis scripts here...
]
# Timeout per script (in seconds)
SCRIPT_TIMEOUT = 600 # 10 minutes
# ============================================================================
# HELPER FUNCTIONS
# ============================================================================
def run_script(script_path):
"""Run a single analysis script"""
script_name = Path(script_path).name
print(f"\n{'='*60}")
print(f"Running: {script_name}")
print(f"{'='*60}")
start_time = time.time()
try:
result = subprocess.run(
[sys.executable, script_path],
capture_output=True,
text=True,
timeout=SCRIPT_TIMEOUT
)
elapsed = time.time() - start_time
if result.returncode == 0:
print(f"{script_name} completed successfully ({elapsed:.1f}s)")
if result.stdout:
# Print last 10 lines of output
lines = result.stdout.strip().split('\n')
if len(lines) > 10:
print(" ... (output truncated)")
for line in lines[-10:]:
print(f" {line}")
else:
for line in lines:
print(f" {line}")
return True, elapsed, None
else:
print(f"{script_name} failed ({elapsed:.1f}s)")
if result.stderr:
print(f" Error: {result.stderr[:500]}")
return False, elapsed, result.stderr
except subprocess.TimeoutExpired:
elapsed = time.time() - start_time
print(f"⏱️ {script_name} timed out after {elapsed:.1f}s")
return False, elapsed, "Timeout"
except Exception as e:
elapsed = time.time() - start_time
print(f"{script_name} error: {str(e)}")
return False, elapsed, str(e)
# ============================================================================
# MAIN FUNCTION
# ============================================================================
def main():
"""Run all analysis scripts"""
from config import COMPANY_NAME
print(f"\n{'='*60}")
print(f"{COMPANY_NAME} Sales Analysis - Batch Runner")
print(f"Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"{'='*60}\n")
# Check which scripts exist
existing_scripts = []
missing_scripts = []
for script in ANALYSIS_SCRIPTS:
script_path = Path(script)
if script_path.exists():
existing_scripts.append(script)
else:
missing_scripts.append(script)
if missing_scripts:
print(f"⚠️ Warning: {len(missing_scripts)} scripts not found:")
for script in missing_scripts:
print(f" - {script}")
print()
if not existing_scripts:
print("❌ No analysis scripts found!")
print(" Please add analysis scripts to ANALYSIS_SCRIPTS list in run_all_analyses.py")
return
print(f"Found {len(existing_scripts)} analysis scripts to run\n")
# Run scripts
results = []
total_start = time.time()
for script in existing_scripts:
success, elapsed, error = run_script(script)
results.append({
'script': script,
'success': success,
'elapsed': elapsed,
'error': error
})
total_elapsed = time.time() - total_start
# Print summary
print(f"\n{'='*60}")
print("Batch Run Summary")
print(f"{'='*60}\n")
successful = [r for r in results if r['success']]
failed = [r for r in results if not r['success']]
print(f"Total scripts: {len(results)}")
print(f"✅ Successful: {len(successful)}")
print(f"❌ Failed: {len(failed)}")
print(f"⏱️ Total time: {total_elapsed/60:.1f} minutes\n")
if failed:
print("Failed scripts:")
for r in failed:
print(f"{r['script']} ({r['elapsed']:.1f}s)")
if r['error']:
print(f" Error: {r['error'][:100]}")
print()
# Save summary to file
summary_file = Path('analysis_run_summary.txt')
with open(summary_file, 'w') as f:
f.write(f"{COMPANY_NAME} Sales Analysis - Batch Run Summary\n")
f.write(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
f.write(f"{'='*60}\n\n")
f.write(f"Total scripts: {len(results)}\n")
f.write(f"Successful: {len(successful)}\n")
f.write(f"Failed: {len(failed)}\n")
f.write(f"Total time: {total_elapsed/60:.1f} minutes\n\n")
if successful:
f.write("Successful scripts:\n")
for r in successful:
f.write(f"{r['script']} ({r['elapsed']:.1f}s)\n")
f.write("\n")
if failed:
f.write("Failed scripts:\n")
for r in failed:
f.write(f"{r['script']} ({r['elapsed']:.1f}s)\n")
if r['error']:
f.write(f" Error: {r['error']}\n")
print(f"Summary saved to: {summary_file}")
if __name__ == "__main__":
main()

240
setup_wizard.py Normal file
View File

@@ -0,0 +1,240 @@
"""
Interactive setup wizard for configuring the sales analysis template
Asks clarifying questions to configure config.py for your specific company and data
"""
import os
import sys
from pathlib import Path
def print_header(text):
"""Print a formatted header"""
print("\n" + "="*70)
print(f" {text}")
print("="*70 + "\n")
def ask_question(prompt, default=None, validator=None):
"""
Ask a question and return the answer
Args:
prompt: Question to ask
default: Default value if user just presses Enter
validator: Optional function to validate input
Returns:
User's answer (or default)
"""
if default:
full_prompt = f"{prompt} [{default}]: "
else:
full_prompt = f"{prompt}: "
while True:
answer = input(full_prompt).strip()
if not answer and default:
return default
elif not answer:
print(" Please provide an answer.")
continue
if validator:
try:
return validator(answer)
except Exception as e:
print(f" Invalid input: {e}")
continue
return answer
def validate_yes_no(answer):
"""Validate yes/no answer"""
answer_lower = answer.lower()
if answer_lower in ['y', 'yes', 'true', '1']:
return True
elif answer_lower in ['n', 'no', 'false', '0']:
return False
else:
raise ValueError("Please answer 'yes' or 'no'")
def validate_int(answer):
"""Validate integer answer"""
return int(answer)
def validate_file_exists(answer):
"""Validate that file exists"""
if not Path(answer).exists():
raise ValueError(f"File not found: {answer}")
return answer
def main():
"""Run the setup wizard"""
print_header("Sales Analysis Template - Setup Wizard")
print("This wizard will help you configure the template for your company's data.")
print("You can press Enter to accept defaults (shown in brackets).\n")
responses = {}
# Company Information
print_header("Company Information")
responses['company_name'] = ask_question("Company Name", default="Your Company Name")
responses['analysis_date'] = ask_question("Analysis Date (YYYY-MM-DD)", default="2026-01-12")
# Data File
print_header("Data File Configuration")
print("Where is your sales data CSV file located?")
data_file = ask_question("Data file name (e.g., sales_data.csv)", default="sales_data.csv")
# Check if file exists
if Path(data_file).exists():
print(f" ✓ Found: {data_file}")
else:
print(f" ⚠ Warning: {data_file} not found. Make sure to place it in the template directory.")
responses['data_file'] = data_file
# Column Mapping
print_header("Column Mapping")
print("What are the column names in your CSV file?")
print("(Press Enter to accept defaults if your columns match common names)\n")
responses['revenue_column'] = ask_question("Revenue/Amount column name", default="USD")
responses['date_column'] = ask_question("Primary date column name", default="InvoiceDate")
has_fallback = ask_question("Do you have fallback date columns (Month, Year)?", default="yes", validator=validate_yes_no)
if has_fallback:
fallback_str = ask_question("Fallback date columns (comma-separated)", default="Month, Year")
responses['date_fallback'] = [col.strip() for col in fallback_str.split(',')]
else:
responses['date_fallback'] = []
responses['customer_column'] = ask_question("Customer/Account column name", default="Customer")
responses['item_column'] = ask_question("Item/Product column name", default="Item")
has_quantity = ask_question("Do you have a Quantity column?", default="yes", validator=validate_yes_no)
if has_quantity:
responses['quantity_column'] = ask_question("Quantity column name", default="Quantity")
else:
responses['quantity_column'] = None
# Date Range
print_header("Date Range Configuration")
responses['min_year'] = ask_question("Minimum year to include in analysis", default="2021", validator=validate_int)
responses['max_date'] = ask_question("Maximum date (YYYY-MM-DD)", default="2025-09-30")
years_str = ask_question("Analysis years (comma-separated, e.g., 2021,2022,2023,2024,2025)", default="2021,2022,2023,2024,2025")
responses['analysis_years'] = [int(y.strip()) for y in years_str.split(',')]
# LTM Configuration
print_header("LTM (Last Twelve Months) Configuration")
print("LTM is used for the most recent partial year to enable apples-to-apples comparison.")
print("Example: If your latest data is through September 2025, use Oct 2024 - Sep 2025.\n")
use_ltm = ask_question("Do you need LTM for the most recent year?", default="yes", validator=validate_yes_no)
responses['ltm_enabled'] = use_ltm
if use_ltm:
responses['ltm_start_month'] = ask_question("LTM start month (1-12)", default="10", validator=validate_int)
responses['ltm_start_year'] = ask_question("LTM start year", default="2024", validator=validate_int)
responses['ltm_end_month'] = ask_question("LTM end month (1-12)", default="9", validator=validate_int)
responses['ltm_end_year'] = ask_question("LTM end year", default="2025", validator=validate_int)
else:
responses['ltm_start_month'] = 10
responses['ltm_start_year'] = 2024
responses['ltm_end_month'] = 9
responses['ltm_end_year'] = 2025
# Exclusion Filters
print_header("Exclusion Filters (Optional)")
use_exclusions = ask_question("Do you need to exclude specific segments (e.g., test accounts, business units)?", default="no", validator=validate_yes_no)
responses['exclusions_enabled'] = use_exclusions
if use_exclusions:
responses['exclude_column'] = ask_question("Column name to filter on", default="Country")
exclude_values_str = ask_question("Values to exclude (comma-separated)", default="")
responses['exclude_values'] = [v.strip() for v in exclude_values_str.split(',') if v.strip()]
else:
responses['exclude_column'] = None
responses['exclude_values'] = []
# Generate config.py
print_header("Generating Configuration")
print("Updating config.py with your settings...")
# Read current config.py
config_path = Path('config.py')
if not config_path.exists():
print("ERROR: config.py not found!")
return
with open(config_path, 'r', encoding='utf-8') as f:
config_content = f.read()
# Replace values
replacements = {
"COMPANY_NAME = \"Your Company Name\"": f"COMPANY_NAME = \"{responses['company_name']}\"",
"ANALYSIS_DATE = \"2026-01-12\"": f"ANALYSIS_DATE = \"{responses['analysis_date']}\"",
"DATA_FILE = 'sales_data.csv'": f"DATA_FILE = '{responses['data_file']}'",
"REVENUE_COLUMN = 'USD'": f"REVENUE_COLUMN = '{responses['revenue_column']}'",
"DATE_COLUMN = 'InvoiceDate'": f"DATE_COLUMN = '{responses['date_column']}'",
"DATE_FALLBACK_COLUMNS = ['Month', 'Year']": f"DATE_FALLBACK_COLUMNS = {responses['date_fallback']}",
"CUSTOMER_COLUMN = 'Customer'": f"CUSTOMER_COLUMN = '{responses['customer_column']}'",
"ITEM_COLUMN = 'Item'": f"ITEM_COLUMN = '{responses['item_column']}'",
"QUANTITY_COLUMN = 'Quantity'": f"QUANTITY_COLUMN = '{responses['quantity_column']}'" if responses['quantity_column'] else "QUANTITY_COLUMN = None",
"MIN_YEAR = 2021": f"MIN_YEAR = {responses['min_year']}",
"MAX_DATE = pd.Timestamp('2025-09-30')": f"MAX_DATE = pd.Timestamp('{responses['max_date']}')",
"ANALYSIS_YEARS = [2021, 2022, 2023, 2024, 2025]": f"ANALYSIS_YEARS = {responses['analysis_years']}",
"LTM_ENABLED = True": f"LTM_ENABLED = {responses['ltm_enabled']}",
"LTM_START_MONTH = 10": f"LTM_START_MONTH = {responses['ltm_start_month']}",
"LTM_START_YEAR = 2024": f"LTM_START_YEAR = {responses['ltm_start_year']}",
"LTM_END_MONTH = 9": f"LTM_END_MONTH = {responses['ltm_end_month']}",
"LTM_END_YEAR = 2025": f"LTM_END_YEAR = {responses['ltm_end_year']}",
}
# Handle exclusions
if responses['exclusions_enabled']:
exclusions_config = f"""EXCLUSION_FILTERS = {{
'enabled': True,
'exclude_by_column': '{responses['exclude_column']}',
'exclude_values': {responses['exclude_values']}
}}"""
# Replace the exclusion filters section
import re
pattern = r"EXCLUSION_FILTERS = \{.*?\}"
config_content = re.sub(pattern, exclusions_config, config_content, flags=re.DOTALL)
else:
exclusions_config = """EXCLUSION_FILTERS = {
'enabled': False,
'exclude_by_column': None,
'exclude_values': []
}"""
import re
pattern = r"EXCLUSION_FILTERS = \{.*?\}"
config_content = re.sub(pattern, exclusions_config, config_content, flags=re.DOTALL)
# Apply replacements
for old, new in replacements.items():
if old in config_content:
config_content = config_content.replace(old, new)
# Write updated config
with open(config_path, 'w', encoding='utf-8') as f:
f.write(config_content)
print(" ✓ Configuration updated successfully!")
# Summary
print_header("Setup Complete")
print("Your configuration has been saved to config.py")
print("\nNext steps:")
print("1. Place your data file in the template directory (if not already there)")
print("2. Test data loading: python -c \"from data_loader import load_sales_data; from config import get_data_path; df = load_sales_data(get_data_path()); print(f'Loaded {len(df):,} rows')\"")
print("3. Review config.py and adjust any settings as needed")
print("4. Start creating your analysis scripts using analysis_template.py")
print("\nFor help, see README.md")
if __name__ == "__main__":
try:
main()
except KeyboardInterrupt:
print("\n\nSetup cancelled by user.")
sys.exit(0)

321
statistical_utils.py Normal file
View File

@@ -0,0 +1,321 @@
"""
Statistical analysis utilities
Common statistical operations for sales analysis
Usage:
from statistical_utils import calculate_yoy_growth, calculate_cagr, calculate_correlation
# Calculate year-over-year growth
growth = calculate_yoy_growth(current_value=100, previous_value=90)
# Calculate CAGR
cagr = calculate_cagr(start_value=100, end_value=150, periods=3)
"""
import pandas as pd
import numpy as np
from scipy import stats
def calculate_yoy_growth(current, previous):
"""
Calculate year-over-year growth percentage
Args:
current: Current period value
previous: Previous period value
Returns:
float: Growth percentage (can be negative)
Example:
calculate_yoy_growth(110, 100) # Returns 10.0
calculate_yoy_growth(90, 100) # Returns -10.0
"""
if previous == 0:
return np.nan if current == 0 else np.inf
return ((current - previous) / previous) * 100
def calculate_cagr(start_value, end_value, periods):
"""
Calculate Compound Annual Growth Rate (CAGR)
Args:
start_value: Starting value
end_value: Ending value
periods: Number of periods (years)
Returns:
float: CAGR as percentage
Example:
calculate_cagr(100, 150, 3) # Returns ~14.47%
"""
if start_value <= 0 or periods <= 0:
return np.nan
if end_value <= 0:
return np.nan
cagr = ((end_value / start_value) ** (1 / periods) - 1) * 100
return cagr
def calculate_correlation(df, col1, col2):
"""
Calculate correlation between two columns
Args:
df: DataFrame
col1: First column name
col2: Second column name
Returns:
float: Correlation coefficient (-1 to 1)
"""
if col1 not in df.columns or col2 not in df.columns:
return np.nan
# Convert to numeric
series1 = pd.to_numeric(df[col1], errors='coerce')
series2 = pd.to_numeric(df[col2], errors='coerce')
# Remove NaN pairs
valid_mask = series1.notna() & series2.notna()
if valid_mask.sum() < 2:
return np.nan
correlation = series1[valid_mask].corr(series2[valid_mask])
return correlation
def calculate_trend_slope(y_values):
"""
Calculate linear trend slope
Args:
y_values: Array-like of y values
Returns:
float: Slope of linear trend
"""
if len(y_values) < 2:
return np.nan
x_values = np.arange(len(y_values))
# Remove NaN values
valid_mask = ~np.isnan(y_values)
if valid_mask.sum() < 2:
return np.nan
x_valid = x_values[valid_mask]
y_valid = y_values[valid_mask]
slope, intercept, r_value, p_value, std_err = stats.linregress(x_valid, y_valid)
return slope
def calculate_percent_change(series, periods=1):
"""
Calculate percent change over periods
Args:
series: Pandas Series
periods: Number of periods to shift (default: 1)
Returns:
Series: Percent change
"""
return series.pct_change(periods=periods) * 100
def calculate_moving_average(series, window=3):
"""
Calculate moving average
Args:
series: Pandas Series
window: Window size for moving average
Returns:
Series: Moving average
"""
return series.rolling(window=window, center=False).mean()
def calculate_volatility(series, window=12):
"""
Calculate rolling volatility (standard deviation)
Args:
series: Pandas Series
window: Window size for rolling calculation
Returns:
Series: Rolling volatility
"""
return series.rolling(window=window, center=False).std()
def calculate_z_score(value, mean, std):
"""
Calculate z-score
Args:
value: Value to score
mean: Mean of distribution
std: Standard deviation of distribution
Returns:
float: Z-score
"""
if std == 0:
return np.nan
return (value - mean) / std
def test_statistical_significance(group1, group2, alpha=0.05):
"""
Test statistical significance between two groups (t-test)
Args:
group1: First group (array-like)
group2: Second group (array-like)
alpha: Significance level (default: 0.05)
Returns:
dict: Test results with p-value, significant flag, etc.
"""
group1 = np.array(group1)
group2 = np.array(group2)
# Remove NaN values
group1 = group1[~np.isnan(group1)]
group2 = group2[~np.isnan(group2)]
if len(group1) < 2 or len(group2) < 2:
return {
'p_value': np.nan,
'significant': False,
'test_statistic': np.nan,
'error': 'Insufficient data'
}
# Perform t-test
t_statistic, p_value = stats.ttest_ind(group1, group2)
return {
'p_value': float(p_value),
'significant': p_value < alpha,
'test_statistic': float(t_statistic),
'alpha': alpha,
'group1_mean': float(np.mean(group1)),
'group2_mean': float(np.mean(group2)),
'group1_std': float(np.std(group1)),
'group2_std': float(np.std(group2))
}
def calculate_confidence_interval(series, confidence=0.95):
"""
Calculate confidence interval for a series
Args:
series: Pandas Series
confidence: Confidence level (default: 0.95 for 95%)
Returns:
dict: Mean, lower bound, upper bound
"""
series_clean = series.dropna()
if len(series_clean) == 0:
return {
'mean': np.nan,
'lower': np.nan,
'upper': np.nan,
'confidence': confidence
}
mean = series_clean.mean()
std = series_clean.std()
n = len(series_clean)
# Calculate standard error
se = std / np.sqrt(n)
# Calculate critical value (z-score for normal distribution)
alpha = 1 - confidence
z_critical = stats.norm.ppf(1 - alpha/2)
margin = z_critical * se
return {
'mean': float(mean),
'lower': float(mean - margin),
'upper': float(mean + margin),
'confidence': confidence,
'margin': float(margin)
}
def calculate_annual_growth_rates(values, years):
"""
Calculate year-over-year growth rates for annual data
Args:
values: Array-like of annual values
years: Array-like of corresponding years
Returns:
DataFrame: Years, values, and growth rates
"""
df = pd.DataFrame({
'Year': years,
'Value': values
})
df['YoY_Growth'] = calculate_percent_change(df['Value'])
df['YoY_Change'] = df['Value'].diff()
return df
def calculate_seasonality_index(monthly_series):
"""
Calculate seasonality index for monthly data
Args:
monthly_series: Series with datetime index (monthly frequency)
Returns:
Series: Seasonality index (1.0 = average, >1.0 = above average, <1.0 = below average)
"""
if not isinstance(monthly_series.index, pd.DatetimeIndex):
raise ValueError("Series must have DatetimeIndex")
# Extract month
monthly_series = monthly_series.copy()
monthly_series['Month'] = monthly_series.index.month
# Calculate average by month
monthly_avg = monthly_series.groupby('Month').mean()
overall_avg = monthly_series.mean()
# Calculate seasonality index
seasonality = monthly_avg / overall_avg
return seasonality
# ============================================================================
# EXAMPLE USAGE
# ============================================================================
if __name__ == "__main__":
"""Example usage"""
# YoY Growth
growth = calculate_yoy_growth(110, 100)
print(f"Year-over-year growth: {growth:.2f}%")
# CAGR
cagr = calculate_cagr(100, 150, 3)
print(f"CAGR: {cagr:.2f}%")
# Sample data for correlation
df = pd.DataFrame({
'Revenue': [100, 110, 120, 130, 140],
'Quantity': [10, 11, 12, 13, 14]
})
corr = calculate_correlation(df, 'Revenue', 'Quantity')
print(f"Correlation: {corr:.2f}")

View File

@@ -0,0 +1,85 @@
"""
Unit tests for analysis_utils.py
"""
import pytest
import pandas as pd
import numpy as np
from pathlib import Path
import sys
# Add parent directory to path
sys.path.insert(0, str(Path(__file__).parent.parent))
from analysis_utils import (
millions_formatter, thousands_formatter,
get_millions_formatter, get_thousands_formatter,
format_currency, calculate_price_per_unit,
sort_mixed_years, safe_year_labels
)
class TestFormatters:
"""Test formatting functions"""
def test_millions_formatter(self):
"""Test millions formatter"""
assert millions_formatter(10.5, None) == '$10.5m'
assert millions_formatter(0, None) == '$0.0m'
assert millions_formatter(100.0, None) == '$100.0m'
def test_thousands_formatter(self):
"""Test thousands formatter"""
assert thousands_formatter(10.5, None) == '$10.5k'
assert thousands_formatter(0, None) == '$0.0k'
def test_format_currency(self):
"""Test currency formatting"""
assert format_currency(1000000) == '$1.00m'
assert format_currency(1000, millions=False) == '$1.00k'
assert format_currency(np.nan) == 'N/A'
class TestPriceCalculation:
"""Test price calculation functions"""
def test_calculate_price_per_unit(self):
"""Test price per unit calculation"""
df = pd.DataFrame({
'Quantity': [10, 20, 30],
'Revenue': [100, 200, 300]
})
price = calculate_price_per_unit(df, 'Quantity', 'Revenue')
assert price == 10.0 # (100+200+300) / (10+20+30)
def test_calculate_price_per_unit_with_outliers(self):
"""Test price calculation excludes outliers"""
df = pd.DataFrame({
'Quantity': [10, 20, 30, 2000], # 2000 is outlier
'Revenue': [100, 200, 300, 10000]
})
# Should exclude quantity > 1000 by default
price = calculate_price_per_unit(df, 'Quantity', 'Revenue')
assert price == 10.0 # Only first 3 rows
class TestYearHandling:
"""Test year handling functions"""
def test_sort_mixed_years(self):
"""Test sorting mixed int/str years"""
df = pd.DataFrame({
'Year': [2023, '2025 (LTM)', 2024, 2022],
'Value': [100, 150, 120, 90]
})
sorted_df = sort_mixed_years(df, 'Year')
assert sorted_df['Year'].iloc[0] == 2022
assert sorted_df['Year'].iloc[-1] == '2025 (LTM)'
def test_safe_year_labels(self):
"""Test year label conversion"""
years = [2021, 2022, '2025 (LTM)']
labels = safe_year_labels(years)
assert labels == ['2021', '2022', '2025 (LTM)']
if __name__ == "__main__":
pytest.main([__file__, '-v'])

View File

@@ -0,0 +1,45 @@
"""
Unit tests for config_validator.py
"""
import pytest
import pandas as pd
from pathlib import Path
import sys
# Add parent directory to path
sys.path.insert(0, str(Path(__file__).parent.parent))
from config_validator import validate_config
class TestConfigValidator:
"""Test configuration validation"""
def test_validate_config_missing_column(self):
"""Test validation catches missing columns"""
df = pd.DataFrame({
'SomeColumn': [1, 2, 3]
})
errors, warnings = validate_config(df)
# Should have errors for missing required columns
assert len(errors) > 0
assert any('not found' in error.lower() for error in errors)
def test_validate_config_valid_data(self):
"""Test validation with valid data"""
df = pd.DataFrame({
'InvoiceDate': pd.to_datetime(['2023-01-01', '2023-02-01']),
'USD': [100.0, 200.0],
'Year': [2023, 2023]
})
errors, warnings = validate_config(df)
# Should have minimal errors (may have warnings about missing optional columns)
# But should not have critical errors if basic structure is correct
critical_errors = [e for e in errors if 'not found' in e.lower() and 'USD' in e or 'InvoiceDate' in e]
assert len(critical_errors) == 0
if __name__ == "__main__":
pytest.main([__file__, '-v'])

68
tests/test_data_loader.py Normal file
View File

@@ -0,0 +1,68 @@
"""
Integration tests for data_loader.py
"""
import pytest
import pandas as pd
import numpy as np
from pathlib import Path
import sys
import tempfile
import os
# Add parent directory to path
sys.path.insert(0, str(Path(__file__).parent.parent))
from data_loader import load_sales_data, validate_data_structure
class TestDataLoader:
"""Test data loading functions"""
def test_load_sales_data_basic(self):
"""Test basic data loading"""
# Create temporary CSV
with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False) as f:
f.write('InvoiceDate,USD,Customer\n')
f.write('2023-01-01,100.0,Customer1\n')
f.write('2023-02-01,200.0,Customer2\n')
temp_path = f.name
try:
# Temporarily update config
import config
original_data_file = config.DATA_FILE
config.DATA_FILE = Path(temp_path).name
df = load_sales_data(Path(temp_path))
assert len(df) == 2
assert 'Year' in df.columns
assert 'YearMonth' in df.columns
# Restore config
config.DATA_FILE = original_data_file
finally:
os.unlink(temp_path)
def test_validate_data_structure(self):
"""Test data structure validation"""
# Valid DataFrame
df_valid = pd.DataFrame({
'InvoiceDate': pd.to_datetime(['2023-01-01', '2023-02-01']),
'USD': [100.0, 200.0]
})
is_valid, msg = validate_data_structure(df_valid)
assert is_valid
assert msg == "OK"
# Invalid DataFrame (missing column)
df_invalid = pd.DataFrame({
'InvoiceDate': pd.to_datetime(['2023-01-01'])
})
is_valid, msg = validate_data_structure(df_invalid)
assert not is_valid
assert 'Missing required column' in msg
if __name__ == "__main__":
pytest.main([__file__, '-v'])

95
validate_revenue.py Normal file
View File

@@ -0,0 +1,95 @@
"""
Revenue validation utility
Validates that revenue calculations are consistent across analyses
"""
import pandas as pd
from config import (
REVENUE_COLUMN, ANALYSIS_YEARS, VALIDATION_ENABLED,
EXPECTED_REVENUE, REVENUE_TOLERANCE_PCT, LTM_ENABLED,
get_ltm_period
)
from analysis_utils import get_annual_data
def validate_revenue(dataframe: pd.DataFrame, analysis_name: str = "Analysis") -> None:
"""
Print annual revenue summary for validation.
This function helps ensure that:
1. Data loading is working correctly
2. Revenue calculations are consistent
3. Filters are not accidentally excluding too much data
Args:
dataframe: DataFrame with revenue and date columns (should have REVENUE_COLUMN and Year)
analysis_name: Name of the analysis (for logging/display)
Example:
>>> validate_revenue(df, "Revenue Analysis")
>>> # Prints annual revenue summary by year
"""
df = dataframe.copy()
# Ensure date column is datetime
from config import DATE_COLUMN
if DATE_COLUMN in df.columns:
df[DATE_COLUMN] = pd.to_datetime(df[DATE_COLUMN], errors='coerce', format='mixed')
# Filter to analysis years
df = df[df['Year'].isin(ANALYSIS_YEARS)]
# Calculate annual revenue
annual_revenue = {}
ltm_start, ltm_end = get_ltm_period() if LTM_ENABLED else (None, None)
for year in sorted(ANALYSIS_YEARS):
if year in df['Year'].unique():
year_data, year_label = get_annual_data(df, year, ltm_start, ltm_end)
if len(year_data) > 0:
revenue = year_data[REVENUE_COLUMN].sum()
annual_revenue[year_label] = revenue
# Print summary
print(f"\n{'='*60}")
print(f"Annual Revenue Validation - {analysis_name}")
print(f"{'='*60}")
if annual_revenue:
for year_label, revenue in annual_revenue.items():
formatted = f"${revenue / 1e6:.2f}m"
print(f" {year_label}: {formatted}")
# Validation against expected values
if VALIDATION_ENABLED and EXPECTED_REVENUE:
print(f"\nValidation Check:")
all_valid = True
for year_label, actual_revenue in annual_revenue.items():
# Try to match year label to expected revenue
year_key = None
if isinstance(year_label, str):
# Extract year number from label (e.g., "2025 (LTM 9/2025)" -> 2025)
import re
year_match = re.search(r'(\d{4})', str(year_label))
if year_match:
year_key = int(year_match.group(1))
else:
year_key = year_label
if year_key in EXPECTED_REVENUE:
expected = EXPECTED_REVENUE[year_key]
tolerance = expected * REVENUE_TOLERANCE_PCT
diff = abs(actual_revenue - expected)
if diff <= tolerance:
print(f"{year_label}: Within tolerance ({diff/1e6:.2f}m difference)")
else:
print(f"{year_label}: Outside tolerance (expected ${expected/1e6:.2f}m, got ${actual_revenue/1e6:.2f}m, diff: ${diff/1e6:.2f}m)")
all_valid = False
if all_valid:
print(" All validations passed!")
else:
print(" WARNING: Some validations failed. Check data loading and filters.")
else:
print(" No revenue data found for analysis years")
print(f"{'='*60}\n")