Initial commit: sales analysis template
Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
307
.cursor/rules/advanced_analysis_patterns.md
Normal file
307
.cursor/rules/advanced_analysis_patterns.md
Normal file
@@ -0,0 +1,307 @@
|
|||||||
|
# Advanced Analysis Patterns
|
||||||
|
|
||||||
|
This document provides patterns for sophisticated, production-grade analyses that leverage the full capabilities of the template framework.
|
||||||
|
|
||||||
|
## ⭐ Using Cursor AI Effectively
|
||||||
|
|
||||||
|
When working in Cursor, you can ask the AI to:
|
||||||
|
- "Create a cohort analysis script using the template patterns"
|
||||||
|
- "Add statistical significance testing to this analysis"
|
||||||
|
- "Generate a multi-dimensional analysis with product, customer, and geography"
|
||||||
|
- "Create a forecasting analysis with confidence intervals"
|
||||||
|
|
||||||
|
The AI will automatically use these patterns and utilities.
|
||||||
|
|
||||||
|
## Advanced Analysis Types
|
||||||
|
|
||||||
|
### 1. Multi-Dimensional Analysis
|
||||||
|
|
||||||
|
**Pattern:** Analyze across multiple dimensions simultaneously (e.g., Product × Customer × Geography)
|
||||||
|
|
||||||
|
```python
|
||||||
|
from data_loader import load_sales_data
|
||||||
|
from analysis_utils import calculate_annual_metrics, get_ltm_period_config
|
||||||
|
from config import get_data_path, REVENUE_COLUMN, ITEM_COLUMN, CUSTOMER_COLUMN, REGION_COLUMN
|
||||||
|
|
||||||
|
df = load_sales_data(get_data_path())
|
||||||
|
|
||||||
|
# Multi-dimensional pivot
|
||||||
|
pivot = df.pivot_table(
|
||||||
|
index=[ITEM_COLUMN, CUSTOMER_COLUMN],
|
||||||
|
columns=REGION_COLUMN,
|
||||||
|
values=REVENUE_COLUMN,
|
||||||
|
aggfunc='sum',
|
||||||
|
fill_value=0
|
||||||
|
)
|
||||||
|
|
||||||
|
# Or use data_processing helper
|
||||||
|
from data_processing import create_pivot_table
|
||||||
|
pivot = create_pivot_table(
|
||||||
|
df,
|
||||||
|
index=[ITEM_COLUMN, CUSTOMER_COLUMN],
|
||||||
|
columns=REGION_COLUMN,
|
||||||
|
values=REVENUE_COLUMN
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. Cohort Analysis with Retention Metrics
|
||||||
|
|
||||||
|
**Pattern:** Track customer cohorts over time with retention and revenue metrics
|
||||||
|
|
||||||
|
```python
|
||||||
|
from examples.cohort_analysis import create_cohorts, calculate_cohort_metrics
|
||||||
|
|
||||||
|
df_cohort = create_cohorts(df)
|
||||||
|
cohort_metrics = calculate_cohort_metrics(df_cohort)
|
||||||
|
|
||||||
|
# Calculate Net Revenue Retention (NRR)
|
||||||
|
nrr = cohort_metrics.groupby('Cohort').agg({
|
||||||
|
'Revenue_Retention': lambda x: x.iloc[-1] if len(x) > 0 else 0
|
||||||
|
})
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3. Statistical Significance Testing
|
||||||
|
|
||||||
|
**Pattern:** Compare segments with statistical tests
|
||||||
|
|
||||||
|
```python
|
||||||
|
from statistical_utils import test_statistical_significance
|
||||||
|
|
||||||
|
# Compare two groups
|
||||||
|
group1 = df[df['Segment'] == 'A'][REVENUE_COLUMN]
|
||||||
|
group2 = df[df['Segment'] == 'B'][REVENUE_COLUMN]
|
||||||
|
|
||||||
|
result = test_statistical_significance(group1, group2)
|
||||||
|
if result['significant']:
|
||||||
|
print(f"Significant difference (p={result['p_value']:.4f})")
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4. Price-Volume-Mix (PVM) Decomposition
|
||||||
|
|
||||||
|
**Pattern:** Decompose revenue changes into price, volume, and mix effects
|
||||||
|
|
||||||
|
```python
|
||||||
|
from config import QUANTITY_COLUMN, REVENUE_COLUMN
|
||||||
|
|
||||||
|
def pvm_decomposition(df_base, df_current):
|
||||||
|
"""Decompose revenue change into price, volume, mix effects"""
|
||||||
|
base_price = df_base[REVENUE_COLUMN].sum() / df_base[QUANTITY_COLUMN].sum()
|
||||||
|
current_price = df_current[REVENUE_COLUMN].sum() / df_current[QUANTITY_COLUMN].sum()
|
||||||
|
|
||||||
|
base_volume = df_base[QUANTITY_COLUMN].sum()
|
||||||
|
current_volume = df_current[QUANTITY_COLUMN].sum()
|
||||||
|
|
||||||
|
# Price effect
|
||||||
|
price_effect = (current_price - base_price) * base_volume
|
||||||
|
|
||||||
|
# Volume effect
|
||||||
|
volume_effect = (current_volume - base_volume) * base_price
|
||||||
|
|
||||||
|
# Mix effect (residual)
|
||||||
|
total_change = df_current[REVENUE_COLUMN].sum() - df_base[REVENUE_COLUMN].sum()
|
||||||
|
mix_effect = total_change - price_effect - volume_effect
|
||||||
|
|
||||||
|
return {
|
||||||
|
'price_effect': price_effect,
|
||||||
|
'volume_effect': volume_effect,
|
||||||
|
'mix_effect': mix_effect,
|
||||||
|
'total_change': total_change
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 5. Time Series Forecasting
|
||||||
|
|
||||||
|
**Pattern:** Forecast future revenue with confidence intervals
|
||||||
|
|
||||||
|
```python
|
||||||
|
from data_processing import prepare_time_series
|
||||||
|
from statistical_utils import calculate_confidence_interval
|
||||||
|
|
||||||
|
# Prepare time series
|
||||||
|
ts = prepare_time_series(df, freq='M')
|
||||||
|
|
||||||
|
# Simple forecast (extend trend)
|
||||||
|
import numpy as np
from scipy import stats
|
||||||
|
x = np.arange(len(ts))
|
||||||
|
slope, intercept, r_value, p_value, std_err = stats.linregress(x, ts.values)
|
||||||
|
|
||||||
|
# Forecast next 12 months
|
||||||
|
future_x = np.arange(len(ts), len(ts) + 12)
|
||||||
|
forecast = slope * future_x + intercept
|
||||||
|
|
||||||
|
# Calculate confidence intervals
|
||||||
|
ci = calculate_confidence_interval(ts, confidence=0.95)
|
||||||
|
```
|
||||||
|
|
||||||
|
### 6. Customer Lifetime Value (CLV) Analysis
|
||||||
|
|
||||||
|
**Pattern:** Calculate CLV using historical data
|
||||||
|
|
||||||
|
```python
|
||||||
|
from config import CUSTOMER_COLUMN, REVENUE_COLUMN, DATE_COLUMN
|
||||||
|
|
||||||
|
def calculate_clv(df, years=3):
|
||||||
|
"""Calculate customer lifetime value"""
|
||||||
|
customer_metrics = df.groupby(CUSTOMER_COLUMN).agg({
|
||||||
|
REVENUE_COLUMN: 'sum',
|
||||||
|
DATE_COLUMN: ['min', 'max', 'count']
|
||||||
|
}).reset_index()
|
||||||
|
|
||||||
|
customer_metrics.columns = [CUSTOMER_COLUMN, 'Total_Revenue', 'First_Purchase', 'Last_Purchase', 'Order_Count']
|
||||||
|
|
||||||
|
# Calculate customer age (years)
|
||||||
|
customer_metrics['Customer_Age_Years'] = (
|
||||||
|
(customer_metrics['Last_Purchase'] - customer_metrics['First_Purchase']).dt.days / 365.25
|
||||||
|
)
|
||||||
|
|
||||||
|
# Annual revenue
|
||||||
|
customer_metrics['Annual_Revenue'] = customer_metrics['Total_Revenue'] / customer_metrics['Customer_Age_Years'].replace(0, 1)
|
||||||
|
|
||||||
|
# Projected CLV
|
||||||
|
customer_metrics['CLV'] = customer_metrics['Annual_Revenue'] * years
|
||||||
|
|
||||||
|
return customer_metrics
|
||||||
|
```
|
||||||
|
|
||||||
|
### 7. Market Basket Analysis
|
||||||
|
|
||||||
|
**Pattern:** Find product associations and cross-sell opportunities
|
||||||
|
|
||||||
|
```python
|
||||||
|
from mlxtend.frequent_patterns import apriori, association_rules
|
||||||
|
from mlxtend.preprocessing import TransactionEncoder
import pandas as pd
from config import INVOICE_NUMBER_COLUMN, ITEM_COLUMN
|
||||||
|
|
||||||
|
# Prepare transaction data
|
||||||
|
transactions = df.groupby(INVOICE_NUMBER_COLUMN)[ITEM_COLUMN].apply(list).tolist()
|
||||||
|
|
||||||
|
# Encode transactions
|
||||||
|
te = TransactionEncoder()
|
||||||
|
te_ary = te.fit(transactions).transform(transactions)
|
||||||
|
df_encoded = pd.DataFrame(te_ary, columns=te.columns_)
|
||||||
|
|
||||||
|
# Find frequent itemsets
|
||||||
|
frequent_itemsets = apriori(df_encoded, min_support=0.01, use_colnames=True)
|
||||||
|
|
||||||
|
# Generate association rules
|
||||||
|
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5)
|
||||||
|
```
|
||||||
|
|
||||||
|
### 8. Segmentation with Machine Learning
|
||||||
|
|
||||||
|
**Pattern:** Advanced customer segmentation using clustering
|
||||||
|
|
||||||
|
```python
|
||||||
|
from sklearn.cluster import KMeans
|
||||||
|
from sklearn.preprocessing import StandardScaler
from config import CUSTOMER_COLUMN, REVENUE_COLUMN, DATE_COLUMN
|
||||||
|
|
||||||
|
# Prepare features
|
||||||
|
features = df.groupby(CUSTOMER_COLUMN).agg({
|
||||||
|
REVENUE_COLUMN: ['sum', 'mean', 'count'],
|
||||||
|
DATE_COLUMN: lambda x: (x.max() - x.min()).days
|
||||||
|
}).reset_index()
|
||||||
|
features.columns = [CUSTOMER_COLUMN, 'Total_Revenue', 'Avg_Order', 'Order_Count', 'Customer_Tenure']
|
||||||
|
|
||||||
|
# Scale features
|
||||||
|
scaler = StandardScaler()
|
||||||
|
features_scaled = scaler.fit_transform(features[['Total_Revenue', 'Avg_Order', 'Order_Count', 'Customer_Tenure']])
|
||||||
|
|
||||||
|
# Cluster
|
||||||
|
kmeans = KMeans(n_clusters=5, random_state=42)
|
||||||
|
features['Segment'] = kmeans.fit_predict(features_scaled)
|
||||||
|
```
|
||||||
|
|
||||||
|
### 9. Anomaly Detection
|
||||||
|
|
||||||
|
**Pattern:** Identify unusual patterns in data
|
||||||
|
|
||||||
|
```python
|
||||||
|
from statistical_utils import calculate_z_score
|
||||||
|
|
||||||
|
# Calculate z-scores for revenue
|
||||||
|
mean_revenue = df[REVENUE_COLUMN].mean()
|
||||||
|
std_revenue = df[REVENUE_COLUMN].std()
|
||||||
|
|
||||||
|
df['Revenue_Z_Score'] = df[REVENUE_COLUMN].apply(
|
||||||
|
lambda x: calculate_z_score(x, mean_revenue, std_revenue)
|
||||||
|
)
|
||||||
|
|
||||||
|
# Flag anomalies (|z| > 3)
|
||||||
|
df['Is_Anomaly'] = df['Revenue_Z_Score'].abs() > 3
|
||||||
|
```
|
||||||
|
|
||||||
|
### 10. Competitive Analysis Framework
|
||||||
|
|
||||||
|
**Pattern:** Compare performance across dimensions
|
||||||
|
|
||||||
|
```python
|
||||||
|
from statistical_utils import calculate_yoy_growth, calculate_cagr
|
||||||
|
|
||||||
|
def competitive_analysis(df, dimension_col):
|
||||||
|
"""Compare performance across dimension (e.g., products, regions)"""
|
||||||
|
analysis = df.groupby(dimension_col).agg({
|
||||||
|
REVENUE_COLUMN: ['sum', 'mean', 'count']
|
||||||
|
}).reset_index()
|
||||||
|
analysis.columns = [dimension_col, 'Total_Revenue', 'Avg_Order', 'Order_Count']
|
||||||
|
|
||||||
|
# Calculate growth rates
|
||||||
|
for year in sorted(df['Year'].unique())[1:]:
|
||||||
|
prev_year = year - 1
|
||||||
|
current = df[df['Year'] == year].groupby(dimension_col)[REVENUE_COLUMN].sum()
|
||||||
|
previous = df[df['Year'] == prev_year].groupby(dimension_col)[REVENUE_COLUMN].sum()
|
||||||
|
|
||||||
|
growth = calculate_yoy_growth(current, previous)
|
||||||
|
analysis[f'Growth_{year}'] = growth
|
||||||
|
|
||||||
|
return analysis
|
||||||
|
```
|
||||||
|
|
||||||
|
## Best Practices for Advanced Analyses
|
||||||
|
|
||||||
|
1. **Always validate data quality first:**
|
||||||
|
```python
|
||||||
|
from data_quality import generate_data_quality_report
|
||||||
|
report = generate_data_quality_report(df)
|
||||||
|
```
|
||||||
|
|
||||||
|
2. **Use logging for complex analyses:**
|
||||||
|
```python
|
||||||
|
from logger_config import get_logger
|
||||||
|
logger = get_logger('advanced_analysis')
|
||||||
|
logger.info("Starting complex analysis...")
|
||||||
|
```
|
||||||
|
|
||||||
|
3. **Export intermediate results:**
|
||||||
|
```python
|
||||||
|
from export_utils import export_to_excel
|
||||||
|
export_to_excel(intermediate_df, 'intermediate_results.xlsx')
|
||||||
|
```
|
||||||
|
|
||||||
|
4. **Generate comprehensive reports:**
|
||||||
|
```python
|
||||||
|
from report_generator import generate_pdf_report
|
||||||
|
generate_pdf_report(charts=['chart1.png', 'chart2.png'], summary_data=summary)
|
||||||
|
```
|
||||||
|
|
||||||
|
5. **Test statistical significance:**
|
||||||
|
```python
|
||||||
|
from statistical_utils import test_statistical_significance
|
||||||
|
# Always test before making conclusions
|
||||||
|
```
|
||||||
|
|
||||||
|
## Cursor AI Prompts for Advanced Analyses
|
||||||
|
|
||||||
|
When using Cursor, try these prompts:
|
||||||
|
|
||||||
|
- **"Create a cohort retention analysis with heatmaps"**
|
||||||
|
- **"Build a price-volume-mix decomposition analysis"**
|
||||||
|
- **"Generate a customer lifetime value analysis with segmentation"**
|
||||||
|
- **"Create a forecasting model with confidence intervals"**
|
||||||
|
- **"Build a multi-dimensional analysis across product, customer, and geography"**
|
||||||
|
- **"Create an anomaly detection analysis for unusual transactions"**
|
||||||
|
|
||||||
|
The AI will automatically use these patterns and the template utilities.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
**Last Updated:** January 2026
|
||||||
|
**For:** Advanced users and AI-assisted development
|
||||||
316
.cursor/rules/ai_assistant_guide.md
Normal file
316
.cursor/rules/ai_assistant_guide.md
Normal file
@@ -0,0 +1,316 @@
|
|||||||
|
# AI Assistant Guide for Sales Analysis Template
|
||||||
|
|
||||||
|
This guide helps you effectively use Cursor's AI assistant to create sophisticated sales analyses.
|
||||||
|
|
||||||
|
## 🎯 Quick Start with AI
|
||||||
|
|
||||||
|
### Basic Prompt Structure
|
||||||
|
|
||||||
|
When asking the AI to create an analysis, use this structure:
|
||||||
|
|
||||||
|
```
|
||||||
|
Create a [ANALYSIS_TYPE] analysis that:
|
||||||
|
1. [Specific requirement 1]
|
||||||
|
2. [Specific requirement 2]
|
||||||
|
3. Uses the sales_analysis_template patterns
|
||||||
|
4. Includes [specific visualizations/metrics]
|
||||||
|
```
|
||||||
|
|
||||||
|
### Example Prompts
|
||||||
|
|
||||||
|
**Simple Analysis:**
|
||||||
|
```
|
||||||
|
Create an annual revenue trend analysis using the template patterns,
|
||||||
|
with LTM support and proper chart formatting.
|
||||||
|
```
|
||||||
|
|
||||||
|
**Advanced Analysis:**
|
||||||
|
```
|
||||||
|
Create a customer cohort retention analysis that:
|
||||||
|
1. Groups customers by first purchase month
|
||||||
|
2. Calculates retention rates for 12 periods
|
||||||
|
3. Shows revenue retention metrics
|
||||||
|
4. Creates heatmap visualizations
|
||||||
|
5. Uses the template's cohort analysis patterns
|
||||||
|
```
|
||||||
|
|
||||||
|
**Multi-Dimensional Analysis:**
|
||||||
|
```
|
||||||
|
Create a product performance analysis across regions that:
|
||||||
|
1. Analyzes top products by revenue
|
||||||
|
2. Shows regional distribution
|
||||||
|
3. Calculates growth rates by region
|
||||||
|
4. Creates multi-panel visualizations
|
||||||
|
5. Exports results to Excel
|
||||||
|
```
|
||||||
|
|
||||||
|
## 📋 Template-Aware Prompts
|
||||||
|
|
||||||
|
The AI automatically knows about:
|
||||||
|
- `data_loader.py` - Always use this for loading data
|
||||||
|
- `analysis_utils.py` - Use utilities for formatting, LTM, etc.
|
||||||
|
- `config.py` - Use config values, never hardcode
|
||||||
|
- Template patterns - Follows best practices automatically
|
||||||
|
|
||||||
|
### What the AI Knows
|
||||||
|
|
||||||
|
When you mention the template, the AI will:
|
||||||
|
- ✅ Use `load_sales_data()` instead of `pd.read_csv()`
|
||||||
|
- ✅ Use `setup_revenue_chart()` for charts
|
||||||
|
- ✅ Divide revenue by 1e6 before plotting
|
||||||
|
- ✅ Use config values from `config.py`
|
||||||
|
- ✅ Apply exclusion filters if configured
|
||||||
|
- ✅ Validate data after loading
|
||||||
|
- ✅ Use LTM patterns correctly
|
||||||
|
|
||||||
|
## 🔧 Common AI Tasks
|
||||||
|
|
||||||
|
### 1. Create New Analysis Script
|
||||||
|
|
||||||
|
**Prompt:**
|
||||||
|
```
|
||||||
|
Create a new analysis script called [name].py that:
|
||||||
|
- Follows the template structure
|
||||||
|
- Analyzes [specific metric/dimension]
|
||||||
|
- Creates [type of visualization]
|
||||||
|
- Uses template utilities
|
||||||
|
```
|
||||||
|
|
||||||
|
**AI will:**
|
||||||
|
- Copy structure from `analysis_template.py`
|
||||||
|
- Use proper imports
|
||||||
|
- Follow template patterns
|
||||||
|
- Include validation
|
||||||
|
|
||||||
|
### 2. Add Advanced Features
|
||||||
|
|
||||||
|
**Prompt:**
|
||||||
|
```
|
||||||
|
Add statistical significance testing to [analysis].py:
|
||||||
|
- Compare [group1] vs [group2]
|
||||||
|
- Show p-values and confidence intervals
|
||||||
|
- Use statistical_utils functions
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3. Fix Common Issues
|
||||||
|
|
||||||
|
**Prompt:**
|
||||||
|
```
|
||||||
|
Fix the chart formatting in [analysis].py - it's showing scientific notation.
|
||||||
|
```
|
||||||
|
|
||||||
|
**AI will:**
|
||||||
|
- Add `data / 1e6` conversion
|
||||||
|
- Use `setup_revenue_chart()`
|
||||||
|
- Fix formatting issues
|
||||||
|
|
||||||
|
### 4. Enhance Existing Analysis
|
||||||
|
|
||||||
|
**Prompt:**
|
||||||
|
```
|
||||||
|
Enhance [analysis].py to:
|
||||||
|
- Add export to Excel functionality
|
||||||
|
- Include data quality checks
|
||||||
|
- Add logging
|
||||||
|
- Generate PDF report
|
||||||
|
```
|
||||||
|
|
||||||
|
## 🚀 Advanced AI Prompts
|
||||||
|
|
||||||
|
### Multi-Step Analysis
|
||||||
|
|
||||||
|
```
|
||||||
|
Create a comprehensive customer analysis that:
|
||||||
|
1. Segments customers using RFM
|
||||||
|
2. Calculates CLV for each segment
|
||||||
|
3. Identifies at-risk customers
|
||||||
|
4. Creates cohort retention analysis
|
||||||
|
5. Generates PDF report with all charts
|
||||||
|
```
|
||||||
|
|
||||||
|
### Data Quality First
|
||||||
|
|
||||||
|
```
|
||||||
|
Before running the analysis, check data quality:
|
||||||
|
1. Run data quality report
|
||||||
|
2. Fix any critical issues
|
||||||
|
3. Validate configuration
|
||||||
|
4. Then proceed with analysis
|
||||||
|
```
|
||||||
|
|
||||||
|
### Statistical Analysis
|
||||||
|
|
||||||
|
```
|
||||||
|
Add statistical analysis to [analysis].py:
|
||||||
|
- Calculate year-over-year growth with significance testing
|
||||||
|
- Show confidence intervals for forecasts
|
||||||
|
- Test differences between segments
|
||||||
|
- Use statistical_utils functions
|
||||||
|
```
|
||||||
|
|
||||||
|
## 💡 Pro Tips
|
||||||
|
|
||||||
|
### 1. Reference Existing Examples
|
||||||
|
|
||||||
|
```
|
||||||
|
Create an analysis similar to examples/customer_segmentation.py
|
||||||
|
but for product segmentation instead.
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. Use Template Utilities
|
||||||
|
|
||||||
|
```
|
||||||
|
Use the template's export_utils to save results to Excel,
|
||||||
|
and report_generator to create a PDF report.
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3. Leverage Cursor Rules
|
||||||
|
|
||||||
|
The AI automatically reads `.cursor/rules/` files, so you can say:
|
||||||
|
```
|
||||||
|
Follow the advanced_analysis_patterns.md guide to create
|
||||||
|
a price-volume-mix decomposition analysis.
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4. Iterative Development
|
||||||
|
|
||||||
|
```
|
||||||
|
Start with a basic version, then enhance it:
|
||||||
|
1. First version: Simple revenue trend
|
||||||
|
2. Add: Statistical significance
|
||||||
|
3. Add: Export functionality
|
||||||
|
4. Add: PDF report generation
|
||||||
|
```
|
||||||
|
|
||||||
|
## 🎨 Visualization Prompts
|
||||||
|
|
||||||
|
### Create Specific Chart Types
|
||||||
|
|
||||||
|
```
|
||||||
|
Create a heatmap showing [metric] across [dimension1] and [dimension2],
|
||||||
|
using seaborn and following template chart formatting.
|
||||||
|
```
|
||||||
|
|
||||||
|
```
|
||||||
|
Create an interactive Plotly chart for [analysis],
|
||||||
|
saving it as HTML using the template's interactive chart functions.
|
||||||
|
```
|
||||||
|
|
||||||
|
### Multi-Panel Visualizations
|
||||||
|
|
||||||
|
```
|
||||||
|
Create a 2x2 subplot showing:
|
||||||
|
- Top left: Revenue trend
|
||||||
|
- Top right: Customer count trend
|
||||||
|
- Bottom left: Average order value
|
||||||
|
- Bottom right: Growth rates
|
||||||
|
All using template chart formatting.
|
||||||
|
```
|
||||||
|
|
||||||
|
## 📊 Data Analysis Prompts
|
||||||
|
|
||||||
|
### Cohort Analysis
|
||||||
|
|
||||||
|
```
|
||||||
|
Create a cohort analysis that:
|
||||||
|
1. Groups customers by first purchase month
|
||||||
|
2. Tracks retention for 12 periods
|
||||||
|
3. Calculates revenue retention
|
||||||
|
4. Creates retention heatmap
|
||||||
|
5. Uses examples/cohort_analysis.py as reference
|
||||||
|
```
|
||||||
|
|
||||||
|
### Forecasting
|
||||||
|
|
||||||
|
```
|
||||||
|
Create a revenue forecasting analysis:
|
||||||
|
1. Prepare time series data
|
||||||
|
2. Fit trend model
|
||||||
|
3. Forecast next 12 months
|
||||||
|
4. Show confidence intervals
|
||||||
|
5. Use statistical_utils for calculations
|
||||||
|
```
|
||||||
|
|
||||||
|
### Segmentation
|
||||||
|
|
||||||
|
```
|
||||||
|
Create an advanced customer segmentation:
|
||||||
|
1. Calculate RFM scores
|
||||||
|
2. Apply clustering algorithm
|
||||||
|
3. Analyze segment characteristics
|
||||||
|
4. Create segment visualizations
|
||||||
|
5. Export segment data to Excel
|
||||||
|
```
|
||||||
|
|
||||||
|
## 🔍 Debugging with AI
|
||||||
|
|
||||||
|
### Fix Errors
|
||||||
|
|
||||||
|
```
|
||||||
|
I'm getting [error message] in [file].py.
|
||||||
|
Fix it using template best practices.
|
||||||
|
```
|
||||||
|
|
||||||
|
### Optimize Performance
|
||||||
|
|
||||||
|
```
|
||||||
|
Optimize [analysis].py for large datasets:
|
||||||
|
- Use efficient pandas operations
|
||||||
|
- Add progress indicators
|
||||||
|
- Consider data sampling if needed
|
||||||
|
```
|
||||||
|
|
||||||
|
### Improve Code Quality
|
||||||
|
|
||||||
|
```
|
||||||
|
Refactor [analysis].py to:
|
||||||
|
- Use more template utilities
|
||||||
|
- Follow template patterns better
|
||||||
|
- Add proper error handling
|
||||||
|
- Include logging
|
||||||
|
```
|
||||||
|
|
||||||
|
## 📝 Documentation Prompts
|
||||||
|
|
||||||
|
### Add Documentation
|
||||||
|
|
||||||
|
```
|
||||||
|
Add comprehensive docstrings to [analysis].py following
|
||||||
|
the template's documentation style.
|
||||||
|
```
|
||||||
|
|
||||||
|
### Create README
|
||||||
|
|
||||||
|
```
|
||||||
|
Create a README for [analysis].py explaining:
|
||||||
|
- What it does
|
||||||
|
- How to run it
|
||||||
|
- What outputs it generates
|
||||||
|
- Dependencies required
|
||||||
|
```
|
||||||
|
|
||||||
|
## 🎯 Best Practices for AI Interaction
|
||||||
|
|
||||||
|
1. **Be Specific:** Mention template files and utilities by name
|
||||||
|
2. **Reference Examples:** Point to existing examples when relevant
|
||||||
|
3. **Iterate:** Start simple, then add complexity
|
||||||
|
4. **Use Template Terms:** Mention "LTM", "config values", "template patterns"
|
||||||
|
5. **Ask for Validation:** Request data quality checks and validation
|
||||||
|
|
||||||
|
## Example Full Workflow
|
||||||
|
|
||||||
|
```
|
||||||
|
1. "Check my configuration using config_validator.py"
|
||||||
|
2. "Run data quality report on my data"
|
||||||
|
3. "Create a revenue trend analysis using template patterns"
|
||||||
|
4. "Add statistical significance testing to the analysis"
|
||||||
|
5. "Export results to Excel and generate PDF report"
|
||||||
|
6. "Create a cohort analysis similar to the example"
|
||||||
|
```
|
||||||
|
|
||||||
|
The AI will guide you through each step using template best practices.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
**Last Updated:** January 2026
|
||||||
|
**For:** Cursor AI users working with sales_analysis_template
|
||||||
161
.cursor/rules/analysis_patterns.md
Normal file
161
.cursor/rules/analysis_patterns.md
Normal file
@@ -0,0 +1,161 @@
|
|||||||
|
# Common Analysis Patterns
|
||||||
|
|
||||||
|
## ⭐ RECOMMENDED: Use Utilities
|
||||||
|
|
||||||
|
**Always prefer `analysis_utils.py` and `config.py` over manual implementations:**
|
||||||
|
- Consistent formatting
|
||||||
|
- Fewer errors
|
||||||
|
- Easier maintenance
|
||||||
|
- Standardized output
|
||||||
|
|
||||||
|
## Standard Script Structure (Using Utilities)
|
||||||
|
|
||||||
|
**RECOMMENDED:** Use `analysis_utils.py` and `config.py` for consistency:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# 1. IMPORTS
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
from data_loader import load_sales_data, validate_data_structure
|
||||||
|
from validate_revenue import validate_revenue
|
||||||
|
from analysis_utils import (
|
||||||
|
get_ltm_period_config, get_annual_data, calculate_annual_metrics,
|
||||||
|
get_millions_formatter, setup_revenue_chart, save_chart,
|
||||||
|
format_currency, print_annual_summary, sort_mixed_years,
|
||||||
|
apply_exclusion_filters
|
||||||
|
)
|
||||||
|
from config import (
|
||||||
|
DATA_FILE, OUTPUT_DIR, CHART_SIZES, ensure_directories,
|
||||||
|
get_data_path, REVENUE_COLUMN, COMPANY_NAME
|
||||||
|
)
|
||||||
|
|
||||||
|
# 2. LOAD DATA (ALWAYS use data_loader)
|
||||||
|
df = load_sales_data(get_data_path())
|
||||||
|
|
||||||
|
# 3. VALIDATE DATA STRUCTURE
|
||||||
|
is_valid, msg = validate_data_structure(df)
|
||||||
|
if not is_valid:
|
||||||
|
print(f"ERROR: {msg}")
|
||||||
|
return
|
||||||
|
|
||||||
|
# 4. APPLY EXCLUSION FILTERS (if configured)
|
||||||
|
df = apply_exclusion_filters(df)
|
||||||
|
|
||||||
|
# 5. SETUP LTM (if doing annual comparisons and LTM is enabled)
|
||||||
|
ltm_start, ltm_end = get_ltm_period_config()
|
||||||
|
|
||||||
|
# 6. DATA PREPARATION
|
||||||
|
# Convert columns, filter data, create derived columns
|
||||||
|
|
||||||
|
# 7. ANALYSIS LOGIC
|
||||||
|
# Use calculate_annual_metrics() for annual aggregations
|
||||||
|
|
||||||
|
# 8. VISUALIZATIONS
|
||||||
|
# Use setup_revenue_chart() and save_chart() from analysis_utils
|
||||||
|
|
||||||
|
# 9. VALIDATION
|
||||||
|
validate_revenue(df, "Analysis Name")
|
||||||
|
```
|
||||||
|
|
||||||
|
## Annual Aggregation Pattern
|
||||||
|
|
||||||
|
**RECOMMENDED:** Use `calculate_annual_metrics()` from `analysis_utils.py`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from analysis_utils import calculate_annual_metrics, get_ltm_period_config
|
||||||
|
from config import REVENUE_COLUMN
|
||||||
|
|
||||||
|
ltm_start, ltm_end = get_ltm_period_config()
|
||||||
|
|
||||||
|
def calculate_metrics(year_data):
|
||||||
|
"""Calculate metrics for a single year"""
|
||||||
|
return {
|
||||||
|
'Revenue': year_data[REVENUE_COLUMN].sum(),
|
||||||
|
# ... other metrics
|
||||||
|
}
|
||||||
|
|
||||||
|
annual_df = calculate_annual_metrics(df, calculate_metrics, ltm_start, ltm_end)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Chart Formatting Pattern
|
||||||
|
|
||||||
|
**ALWAYS use this pattern for revenue charts:**
|
||||||
|
|
||||||
|
```python
|
||||||
|
from analysis_utils import setup_revenue_chart, save_chart
|
||||||
|
from config import CHART_SIZES
|
||||||
|
|
||||||
|
fig, ax = plt.subplots(figsize=CHART_SIZES['medium'])
|
||||||
|
|
||||||
|
# Divide data by 1e6 BEFORE plotting
|
||||||
|
ax.plot(data / 1e6, ...)
|
||||||
|
# OR
|
||||||
|
ax.bar(x, values / 1e6, ...)
|
||||||
|
|
||||||
|
# Apply formatter automatically
|
||||||
|
setup_revenue_chart(ax)
|
||||||
|
|
||||||
|
# Save chart
|
||||||
|
save_chart(fig, 'chart_name.png')
|
||||||
|
plt.close()
|
||||||
|
```
|
||||||
|
|
||||||
|
## Mixed Type Handling
|
||||||
|
|
||||||
|
When dealing with year columns that may contain mixed int/str types (e.g., "2025 (LTM 9/2025)"):
|
||||||
|
|
||||||
|
```python
|
||||||
|
from analysis_utils import sort_mixed_years
|
||||||
|
|
||||||
|
# Sort DataFrame by year
|
||||||
|
df_sorted = sort_mixed_years(df, year_col='Year')
|
||||||
|
|
||||||
|
# For chart labels
|
||||||
|
years = df_sorted['Year'].tolist()
|
||||||
|
x_pos = range(len(years))
|
||||||
|
ax.set_xticks(x_pos)
|
||||||
|
ax.set_xticklabels(years, rotation=45, ha='right')
|
||||||
|
```
|
||||||
|
|
||||||
|
## Price Calculation Pattern
|
||||||
|
|
||||||
|
```python
|
||||||
|
from analysis_utils import calculate_price_per_unit
|
||||||
|
from config import QUANTITY_COLUMN, REVENUE_COLUMN
|
||||||
|
|
||||||
|
# Calculate average price per unit (excludes outliers automatically)
|
||||||
|
price_per_unit = calculate_price_per_unit(df, QUANTITY_COLUMN, REVENUE_COLUMN)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Exclusion Filters Pattern
|
||||||
|
|
||||||
|
If you need to exclude specific segments (e.g., test accounts, business units):
|
||||||
|
|
||||||
|
```python
|
||||||
|
from analysis_utils import apply_exclusion_filters
|
||||||
|
|
||||||
|
# Configure in config.py:
|
||||||
|
# EXCLUSION_FILTERS = {
|
||||||
|
# 'enabled': True,
|
||||||
|
# 'exclude_by_column': 'Country',
|
||||||
|
# 'exclude_values': ['KVT', 'Test']
|
||||||
|
# }
|
||||||
|
|
||||||
|
df = apply_exclusion_filters(df)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Using Configuration Values
|
||||||
|
|
||||||
|
**ALWAYS use config values instead of hardcoding:**
|
||||||
|
|
||||||
|
```python
|
||||||
|
from config import (
|
||||||
|
REVENUE_COLUMN, # Use this instead of 'USD' or 'Amount'
|
||||||
|
CUSTOMER_COLUMN, # Use this instead of 'Customer'
|
||||||
|
DATE_COLUMN, # Use this instead of 'InvoiceDate'
|
||||||
|
COMPANY_NAME, # Use this for titles
|
||||||
|
ANALYSIS_YEARS, # Use this for year filtering
|
||||||
|
CHART_SIZES, # Use this for figure sizes
|
||||||
|
)
|
||||||
|
```
|
||||||
111
.cursor/rules/chart_formatting.md
Normal file
111
.cursor/rules/chart_formatting.md
Normal file
@@ -0,0 +1,111 @@
|
|||||||
|
# Chart Formatting Rules
|
||||||
|
|
||||||
|
## ⭐ RECOMMENDED: Use analysis_utils.py
|
||||||
|
|
||||||
|
**Prefer utility functions:**
|
||||||
|
```python
|
||||||
|
from analysis_utils import setup_revenue_chart, save_chart, get_millions_formatter
|
||||||
|
from config import CHART_SIZES, OUTPUT_DIR
|
||||||
|
|
||||||
|
fig, ax = plt.subplots(figsize=CHART_SIZES['medium'])
|
||||||
|
ax.plot(data / 1e6, ...)
|
||||||
|
setup_revenue_chart(ax) # Applies formatter automatically
|
||||||
|
save_chart(fig, 'chart.png') # Saves to charts/ directory
|
||||||
|
```
|
||||||
|
|
||||||
|
## Revenue Charts: Millions Formatter
|
||||||
|
|
||||||
|
**ALWAYS use this pattern for revenue charts:**
|
||||||
|
|
||||||
|
```python
|
||||||
|
from analysis_utils import setup_revenue_chart
|
||||||
|
|
||||||
|
# Divide data by 1e6 BEFORE plotting
|
||||||
|
ax.plot(data / 1e6, ...)
|
||||||
|
# OR
|
||||||
|
ax.bar(x, values / 1e6, ...)
|
||||||
|
|
||||||
|
# Apply formatter automatically
|
||||||
|
setup_revenue_chart(ax)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Manual approach (if not using utilities):**
|
||||||
|
```python
|
||||||
|
from matplotlib.ticker import FuncFormatter
|
||||||
|
|
||||||
|
def millions_formatter(x, pos):
|
||||||
|
return f'${x:.1f}m'
|
||||||
|
|
||||||
|
ax.plot(data / 1e6, ...)
|
||||||
|
ax.yaxis.set_major_formatter(FuncFormatter(millions_formatter))
|
||||||
|
ax.set_ylabel('Revenue (Millions USD)')
|
||||||
|
```
|
||||||
|
|
||||||
|
## Thousands Formatter (for smaller values)
|
||||||
|
|
||||||
|
```python
|
||||||
|
from analysis_utils import get_thousands_formatter
|
||||||
|
|
||||||
|
ax.xaxis.set_major_formatter(get_thousands_formatter())
|
||||||
|
ax.barh(x, values / 1e3, ...)
|
||||||
|
ax.set_xlabel('Value (Thousands USD)')
|
||||||
|
```
|
||||||
|
|
||||||
|
## Chart Labeling with LTM
|
||||||
|
|
||||||
|
**If LTM is enabled, ALWAYS include LTM notation:**
|
||||||
|
|
||||||
|
```python
|
||||||
|
from config import get_ltm_label, COMPANY_NAME
|
||||||
|
|
||||||
|
title = f'Annual Revenue Trend - {COMPANY_NAME}'
|
||||||
|
ltm_label = get_ltm_label()
|
||||||
|
if ltm_label:
|
||||||
|
title += f'\n({ltm_label})'
|
||||||
|
ax.set_title(title)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Chart Sizes
|
||||||
|
|
||||||
|
**Use predefined sizes from config:**
|
||||||
|
```python
|
||||||
|
from config import CHART_SIZES
|
||||||
|
|
||||||
|
fig, ax = plt.subplots(figsize=CHART_SIZES['medium']) # (10, 6)
|
||||||
|
# Options: 'small' (6, 4), 'medium' (10, 6), 'large' (12, 8), 'wide' (14, 6)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Common Mistakes
|
||||||
|
|
||||||
|
❌ **WRONG:**
|
||||||
|
```python
|
||||||
|
ax.plot(revenue, ...) # Shows scientific notation (1e8)
|
||||||
|
```
|
||||||
|
|
||||||
|
✅ **CORRECT:**
|
||||||
|
```python
|
||||||
|
ax.plot(revenue / 1e6, ...) # Divide first
|
||||||
|
setup_revenue_chart(ax) # Then format
|
||||||
|
```
|
||||||
|
|
||||||
|
## Saving Charts
|
||||||
|
|
||||||
|
**ALWAYS use save_chart() utility:**
|
||||||
|
```python
|
||||||
|
from analysis_utils import save_chart
|
||||||
|
|
||||||
|
save_chart(fig, 'chart_name.png') # Saves to charts/ with proper settings
|
||||||
|
plt.close() # Don't forget to close!
|
||||||
|
```
|
||||||
|
|
||||||
|
## Chart Styling
|
||||||
|
|
||||||
|
**Configure style in config.py:**
|
||||||
|
```python
|
||||||
|
# In config.py:
|
||||||
|
CHART_STYLE = 'seaborn-v0_8' # Options: 'default', 'ggplot', 'seaborn-v0_8'
|
||||||
|
|
||||||
|
# In your script:
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
plt.style.use(CHART_STYLE) # Apply before creating figures
|
||||||
|
```
|
||||||
389
.cursor/rules/code_quality.md
Normal file
389
.cursor/rules/code_quality.md
Normal file
@@ -0,0 +1,389 @@
|
|||||||
|
# Code Quality & Best Practices
|
||||||
|
|
||||||
|
**Comprehensive guide for writing Cursor-optimized code in the sales analysis template.**
|
||||||
|
|
||||||
|
This document combines code quality standards and Cursor best practices to ensure AI assistants can effectively understand, modify, and extend the codebase.
|
||||||
|
|
||||||
|
## Type Hints
|
||||||
|
|
||||||
|
### When to Use Type Hints
|
||||||
|
|
||||||
|
Use type hints for:
|
||||||
|
- Function parameters
|
||||||
|
- Return values
|
||||||
|
- Class attributes
|
||||||
|
- Complex data structures
|
||||||
|
|
||||||
|
### Example Pattern
|
||||||
|
|
||||||
|
```python
|
||||||
|
from typing import Dict, List, Optional, Tuple
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
def calculate_annual_metrics(
|
||||||
|
df: pd.DataFrame,
|
||||||
|
metrics_func: callable,
|
||||||
|
ltm_start: Optional[pd.Period] = None,
|
||||||
|
ltm_end: Optional[pd.Period] = None
|
||||||
|
) -> pd.DataFrame:
|
||||||
|
"""
|
||||||
|
Calculate annual metrics for all years
|
||||||
|
|
||||||
|
Args:
|
||||||
|
df: DataFrame with 'Year' and 'YearMonth' columns
|
||||||
|
metrics_func: Function that takes a DataFrame and returns a dict of metrics
|
||||||
|
ltm_start: LTM start period (defaults to config if None)
|
||||||
|
ltm_end: LTM end period (defaults to config if None)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
DataFrame with 'Year' index and metric columns
|
||||||
|
"""
|
||||||
|
# Implementation
|
||||||
|
```
|
||||||
|
|
||||||
|
## Docstrings
|
||||||
|
|
||||||
|
### Docstring Format
|
||||||
|
|
||||||
|
All functions should use Google-style docstrings:
|
||||||
|
|
||||||
|
```python
|
||||||
|
def function_name(param1: type, param2: type) -> return_type:
|
||||||
|
"""
|
||||||
|
Brief description of what the function does.
|
||||||
|
|
||||||
|
More detailed explanation if needed. Can span multiple lines.
|
||||||
|
Explain any complex logic or important considerations.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
param1: Description of param1
|
||||||
|
param2: Description of param2
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Description of return value
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
ValueError: When and why this exception is raised
|
||||||
|
|
||||||
|
Example:
|
||||||
|
>>> result = function_name(value1, value2)
|
||||||
|
>>> print(result)
|
||||||
|
expected_output
|
||||||
|
"""
|
||||||
|
```
|
||||||
|
|
||||||
|
### Required Elements
|
||||||
|
|
||||||
|
- Brief one-line summary
|
||||||
|
- Detailed description (if needed)
|
||||||
|
- Args section (all parameters)
|
||||||
|
- Returns section (return value)
|
||||||
|
- Raises section (if exceptions raised)
|
||||||
|
- Example section (for complex functions)
|
||||||
|
|
||||||
|
## Variable Naming
|
||||||
|
|
||||||
|
### Conventions
|
||||||
|
|
||||||
|
- **Descriptive names:** `customer_revenue` not `cr`
|
||||||
|
- **Consistent prefixes:** `df_` for DataFrames, `annual_` for annual metrics
|
||||||
|
- **Clear abbreviations:** `ltm` for Last Twelve Months (well-known)
|
||||||
|
- **Avoid single letters:** Except for loop variables (`i`, `j`, `k`)
|
||||||
|
|
||||||
|
### Good Examples
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Good
|
||||||
|
customer_revenue_by_year = df.groupby(['Customer', 'Year'])[REVENUE_COLUMN].sum()
|
||||||
|
annual_metrics_df = calculate_annual_metrics(df, metrics_func)
|
||||||
|
ltm_start_period, ltm_end_period = get_ltm_period_config()
|
||||||
|
|
||||||
|
# Bad
|
||||||
|
cr = df.groupby(['C', 'Y'])['R'].sum()
|
||||||
|
am = calc(df, mf)
|
||||||
|
s, e = get_ltm()
|
||||||
|
```
|
||||||
|
|
||||||
|
## Error Messages
|
||||||
|
|
||||||
|
### Structure
|
||||||
|
|
||||||
|
Error messages should be:
|
||||||
|
1. **Specific:** What exactly went wrong
|
||||||
|
2. **Actionable:** How to fix it
|
||||||
|
3. **Contextual:** Where it occurred
|
||||||
|
4. **Helpful:** Reference to documentation
|
||||||
|
|
||||||
|
### Good Error Messages
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Good
|
||||||
|
raise ValueError(
|
||||||
|
f"Required column '{REVENUE_COLUMN}' not found in data.\n"
|
||||||
|
f"Available columns: {list(df.columns)}\n"
|
||||||
|
f"Please update config.py REVENUE_COLUMN to match your data.\n"
|
||||||
|
f"See .cursor/rules/data_loading.md for more help."
|
||||||
|
)
|
||||||
|
|
||||||
|
# Bad
|
||||||
|
raise ValueError("Column not found")
|
||||||
|
```
|
||||||
|
|
||||||
|
## Code Comments
|
||||||
|
|
||||||
|
### When to Comment
|
||||||
|
|
||||||
|
- Complex logic that isn't immediately obvious
|
||||||
|
- Business rules or domain-specific knowledge
|
||||||
|
- Workarounds or non-obvious solutions
|
||||||
|
- Performance considerations
|
||||||
|
- TODO items with context
|
||||||
|
|
||||||
|
### Comment Style
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Good: Explains WHY, not WHAT
|
||||||
|
# Use LTM for most recent year to enable apples-to-apples comparison
|
||||||
|
# with full calendar years (avoids partial year bias)
|
||||||
|
if year == LTM_END_YEAR and LTM_ENABLED:
|
||||||
|
year_data = get_ltm_data(df, ltm_start, ltm_end)
|
||||||
|
|
||||||
|
# Bad: States the obvious
|
||||||
|
# Check if year equals LTM_END_YEAR
|
||||||
|
if year == LTM_END_YEAR:
|
||||||
|
```
|
||||||
|
|
||||||
|
## Function Design
|
||||||
|
|
||||||
|
### Single Responsibility
|
||||||
|
|
||||||
|
Each function should do one thing well:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Good: Single responsibility
|
||||||
|
def calculate_revenue(df: pd.DataFrame) -> float:
|
||||||
|
"""Calculate total revenue from DataFrame"""
|
||||||
|
return df[REVENUE_COLUMN].sum()
|
||||||
|
|
||||||
|
def calculate_customer_count(df: pd.DataFrame) -> int:
|
||||||
|
"""Calculate unique customer count"""
|
||||||
|
return df[CUSTOMER_COLUMN].nunique()
|
||||||
|
|
||||||
|
# Bad: Multiple responsibilities
|
||||||
|
def calculate_metrics(df):
|
||||||
|
"""Calculate revenue and customer count"""
|
||||||
|
revenue = df[REVENUE_COLUMN].sum()
|
||||||
|
customers = df[CUSTOMER_COLUMN].nunique()
|
||||||
|
return revenue, customers
|
||||||
|
```
|
||||||
|
|
||||||
|
### Function Length
|
||||||
|
|
||||||
|
- Keep functions under 50 lines when possible
|
||||||
|
- Break complex functions into smaller helper functions
|
||||||
|
- Use descriptive function names that explain purpose
|
||||||
|
|
||||||
|
## Import Organization
|
||||||
|
|
||||||
|
### Standard Order
|
||||||
|
|
||||||
|
1. Standard library imports
|
||||||
|
2. Third-party imports (pandas, numpy, matplotlib)
|
||||||
|
3. Local/template imports (data_loader, analysis_utils, config)
|
||||||
|
|
||||||
|
### Example
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Standard library
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Dict, Optional
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
# Third-party
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
|
||||||
|
# Template imports
|
||||||
|
from data_loader import load_sales_data, validate_data_structure
|
||||||
|
from analysis_utils import calculate_annual_metrics, setup_revenue_chart
|
||||||
|
from config import REVENUE_COLUMN, CHART_SIZES, COMPANY_NAME
|
||||||
|
```
|
||||||
|
|
||||||
|
## Constants and Configuration
|
||||||
|
|
||||||
|
### Use Config Values
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Good: From config
|
||||||
|
from config import REVENUE_COLUMN, DATE_COLUMN
|
||||||
|
revenue = df[REVENUE_COLUMN].sum()
|
||||||
|
|
||||||
|
# Bad: Hardcoded
|
||||||
|
revenue = df['USD'].sum()
|
||||||
|
```
|
||||||
|
|
||||||
|
### Magic Numbers
|
||||||
|
|
||||||
|
Avoid magic numbers - use named constants or config:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Good: Named constant
|
||||||
|
MILLIONS_DIVISOR = 1e6
|
||||||
|
revenue_millions = revenue / MILLIONS_DIVISOR
|
||||||
|
|
||||||
|
# Or from config
|
||||||
|
CHART_DPI = 300 # In config.py
|
||||||
|
|
||||||
|
# Bad: Magic number
|
||||||
|
revenue_millions = revenue / 1000000
|
||||||
|
```
|
||||||
|
|
||||||
|
## Testing Considerations
|
||||||
|
|
||||||
|
### Testable Code
|
||||||
|
|
||||||
|
Write code that's easy to test:
|
||||||
|
- Pure functions when possible (no side effects)
|
||||||
|
- Dependency injection for external dependencies
|
||||||
|
- Clear inputs and outputs
|
||||||
|
|
||||||
|
### Example
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Good: Testable
|
||||||
|
def calculate_metrics(year_data: pd.DataFrame, revenue_col: str) -> Dict:
|
||||||
|
"""Calculate metrics - easy to test with sample data"""
|
||||||
|
return {
|
||||||
|
'Revenue': year_data[revenue_col].sum(),
|
||||||
|
'Count': len(year_data)
|
||||||
|
}
|
||||||
|
|
||||||
|
# Harder to test: Depends on global config
|
||||||
|
def calculate_metrics(year_data):
|
||||||
|
"""Uses global REVENUE_COLUMN - harder to test"""
|
||||||
|
return {'Revenue': year_data[REVENUE_COLUMN].sum()}
|
||||||
|
```
|
||||||
|
|
||||||
|
## AI-Friendly Patterns
|
||||||
|
|
||||||
|
### Clear Intent
|
||||||
|
|
||||||
|
Code should clearly express intent:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Good: Intent is clear
|
||||||
|
customers_with_revenue = df[df[REVENUE_COLUMN] > 0][CUSTOMER_COLUMN].unique()
|
||||||
|
|
||||||
|
# Less clear: Requires understanding of pandas
|
||||||
|
customers_with_revenue = df.loc[df[REVENUE_COLUMN] > 0, CUSTOMER_COLUMN].unique()
|
||||||
|
```
|
||||||
|
|
||||||
|
### Explicit Over Implicit
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Good: Explicit
|
||||||
|
if LTM_ENABLED and ltm_start is not None and ltm_end is not None:
|
||||||
|
use_ltm = True
|
||||||
|
else:
|
||||||
|
use_ltm = False
|
||||||
|
|
||||||
|
# Less clear: Implicit truthiness
|
||||||
|
use_ltm = LTM_ENABLED and ltm_start and ltm_end
|
||||||
|
```
|
||||||
|
|
||||||
|
## Documentation for AI
|
||||||
|
|
||||||
|
### Help AI Understand Context
|
||||||
|
|
||||||
|
Add comments that help AI understand business context:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# LTM (Last Twelve Months) is used for the most recent partial year
|
||||||
|
# to enable fair comparison with full calendar years.
|
||||||
|
# Example: If latest data is through Sep 2025, use Oct 2024 - Sep 2025
|
||||||
|
if year == LTM_END_YEAR and LTM_ENABLED:
|
||||||
|
# Use 12-month rolling period instead of partial calendar year
|
||||||
|
year_data = get_ltm_data(df, ltm_start, ltm_end)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Cursor-Specific Optimizations
|
||||||
|
|
||||||
|
### AI-Friendly Code Structure
|
||||||
|
|
||||||
|
Code should be structured so Cursor AI can:
|
||||||
|
1. **Understand intent** - Clear function names and comments
|
||||||
|
2. **Generate code** - Follow established patterns
|
||||||
|
3. **Fix errors** - Actionable error messages
|
||||||
|
4. **Extend functionality** - Modular, reusable functions
|
||||||
|
|
||||||
|
### Example: AI-Generated Code Pattern
|
||||||
|
|
||||||
|
When AI generates code, it should automatically:
|
||||||
|
```python
|
||||||
|
# AI recognizes this pattern and replicates it
|
||||||
|
def main():
|
||||||
|
# 1. Load data (AI knows to use data_loader)
|
||||||
|
df = load_sales_data(get_data_path())
|
||||||
|
|
||||||
|
# 2. Validate (AI knows to check structure)
|
||||||
|
is_valid, msg = validate_data_structure(df)
|
||||||
|
if not is_valid:
|
||||||
|
print(f"ERROR: {msg}")
|
||||||
|
return
|
||||||
|
|
||||||
|
# 3. Apply filters (AI knows exclusion filters)
|
||||||
|
df = apply_exclusion_filters(df)
|
||||||
|
|
||||||
|
# 4. Analysis logic (AI follows template patterns)
|
||||||
|
# ...
|
||||||
|
|
||||||
|
# 5. Create charts (AI knows formatting rules)
|
||||||
|
# ...
|
||||||
|
|
||||||
|
# 6. Validate revenue (AI knows to validate)
|
||||||
|
validate_revenue(df, ANALYSIS_NAME)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Help AI Generate Better Code
|
||||||
|
|
||||||
|
Add context comments that help AI:
|
||||||
|
```python
|
||||||
|
# LTM (Last Twelve Months) is used for the most recent partial year
|
||||||
|
# to enable fair comparison with full calendar years.
|
||||||
|
# Example: If latest data is through Sep 2025, use Oct 2024 - Sep 2025
|
||||||
|
# This avoids partial-year bias in year-over-year comparisons.
|
||||||
|
if year == LTM_END_YEAR and LTM_ENABLED:
|
||||||
|
# Use 12-month rolling period instead of partial calendar year
|
||||||
|
year_data = get_ltm_data(df, ltm_start, ltm_end)
|
||||||
|
year_label = get_ltm_label() # Returns "2025 (LTM 9/2025)"
|
||||||
|
```
|
||||||
|
|
||||||
|
## Summary Checklist
|
||||||
|
|
||||||
|
For Cursor-optimized code:
|
||||||
|
- ✅ Comprehensive docstrings with examples
|
||||||
|
- ✅ Type hints on functions
|
||||||
|
- ✅ Descriptive variable names
|
||||||
|
- ✅ Clear comments for business logic
|
||||||
|
- ✅ Structured error messages
|
||||||
|
- ✅ Consistent code patterns
|
||||||
|
- ✅ Use config values (never hardcode)
|
||||||
|
- ✅ Follow template utilities
|
||||||
|
- ✅ Include validation steps
|
||||||
|
- ✅ Reference documentation
|
||||||
|
|
||||||
|
## Summary
|
||||||
|
|
||||||
|
Follow these standards to ensure:
|
||||||
|
1. AI can understand code structure
|
||||||
|
2. AI can modify code safely
|
||||||
|
3. AI can generate new code following patterns
|
||||||
|
4. Code is maintainable and readable
|
||||||
|
5. Errors are clear and actionable
|
||||||
|
6. Cursor AI can assist effectively
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
**Last Updated:** January 2026
|
||||||
|
**For:** Cursor AI optimization and human developers
|
||||||
109
.cursor/rules/common_errors.md
Normal file
109
.cursor/rules/common_errors.md
Normal file
@@ -0,0 +1,109 @@
|
|||||||
|
# Common Errors and Troubleshooting
|
||||||
|
|
||||||
|
**Quick reference for fixing common issues. For error handling patterns when writing code, see `error_handling.md`.**
|
||||||
|
|
||||||
|
## Data Loading Errors
|
||||||
|
|
||||||
|
### Error: "Data file not found"
|
||||||
|
**Cause:** DATA_FILE path in config.py is incorrect
|
||||||
|
**Fix:**
|
||||||
|
1. Check that your CSV file exists
|
||||||
|
2. Update `DATA_FILE` in config.py with correct filename
|
||||||
|
3. If file is in a subdirectory, set `DATA_DIR` in config.py
|
||||||
|
|
||||||
|
### Error: "Required column 'USD' not found"
|
||||||
|
**Cause:** Column name in data doesn't match config
|
||||||
|
**Fix:**
|
||||||
|
1. Check your CSV column names
|
||||||
|
2. Update `REVENUE_COLUMN` in config.py to match your data
|
||||||
|
3. Update other column mappings (DATE_COLUMN, CUSTOMER_COLUMN, etc.)
|
||||||
|
|
||||||
|
### Error: "All InvoiceDate values are NaN"
|
||||||
|
**Cause:** Date column parsing failed
|
||||||
|
**Fix:**
|
||||||
|
1. Check date format in your CSV
|
||||||
|
2. Add fallback date columns to `DATE_FALLBACK_COLUMNS` in config.py
|
||||||
|
3. Ensure at least one date column exists (Month, Year, etc.)
|
||||||
|
|
||||||
|
## Analysis Errors
|
||||||
|
|
||||||
|
### Error: "DataFrame is empty" after filtering
|
||||||
|
**Cause:** Date range or year filters too restrictive
|
||||||
|
**Fix:**
|
||||||
|
1. Check `MIN_YEAR` and `MAX_DATE` in config.py
|
||||||
|
2. Check `ANALYSIS_YEARS` includes years in your data
|
||||||
|
3. Verify date parsing worked (check data_loader output)
|
||||||
|
|
||||||
|
### Error: Charts show scientific notation (1e8)
|
||||||
|
**Cause:** Forgot to divide by 1e6 before plotting
|
||||||
|
**Fix:**
|
||||||
|
```python
|
||||||
|
# WRONG:
|
||||||
|
ax.plot(revenue, ...)
|
||||||
|
|
||||||
|
# CORRECT:
|
||||||
|
ax.plot(revenue / 1e6, ...)
|
||||||
|
setup_revenue_chart(ax)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Error: "Year column has mixed types"
|
||||||
|
**Cause:** LTM year is string "2025 (LTM 9/2025)" while others are int
|
||||||
|
**Fix:**
|
||||||
|
```python
|
||||||
|
from analysis_utils import sort_mixed_years
|
||||||
|
df_sorted = sort_mixed_years(df, year_col='Year')
|
||||||
|
```
|
||||||
|
|
||||||
|
## Configuration Errors
|
||||||
|
|
||||||
|
### Error: LTM not working correctly
|
||||||
|
**Cause:** LTM configuration incorrect
|
||||||
|
**Fix:**
|
||||||
|
1. Check `LTM_ENABLED = True` in config.py
|
||||||
|
2. Verify `LTM_START_MONTH`, `LTM_START_YEAR`, `LTM_END_MONTH`, `LTM_END_YEAR`
|
||||||
|
3. Ensure dates are within your data range
|
||||||
|
|
||||||
|
### Error: Exclusion filters not working
|
||||||
|
**Cause:** Filter configuration incorrect
|
||||||
|
**Fix:**
|
||||||
|
1. Check `EXCLUSION_FILTERS['enabled'] = True`
|
||||||
|
2. Verify `exclude_by_column` matches a column in your data
|
||||||
|
3. Check `exclude_values` list is correct
|
||||||
|
|
||||||
|
## Import Errors
|
||||||
|
|
||||||
|
### Error: "No module named 'config'"
|
||||||
|
**Cause:** Running script from wrong directory
|
||||||
|
**Fix:**
|
||||||
|
1. Run scripts from template root directory
|
||||||
|
2. Or add template directory to Python path
|
||||||
|
|
||||||
|
### Error: "No module named 'data_loader'"
|
||||||
|
**Cause:** Missing import or wrong directory
|
||||||
|
**Fix:**
|
||||||
|
1. Ensure all template files are in the same directory
|
||||||
|
2. Check import statements match file names
|
||||||
|
|
||||||
|
## Best Practices to Avoid Errors
|
||||||
|
|
||||||
|
1. **Always use utilities:** Use `analysis_utils.py` functions instead of manual code
|
||||||
|
2. **Validate data:** Run `validate_data_structure()` after loading
|
||||||
|
3. **Check config:** Verify all column names match your data (use `config_validator.py`)
|
||||||
|
4. **Test incrementally:** Test data loading before running full analysis
|
||||||
|
5. **Read error messages:** They usually tell you exactly what's wrong
|
||||||
|
6. **Use Cursor AI:** Ask AI to fix errors - it knows template patterns
|
||||||
|
|
||||||
|
## Using Cursor AI to Fix Errors
|
||||||
|
|
||||||
|
When you encounter an error, ask Cursor AI:
|
||||||
|
```
|
||||||
|
"Fix this error: [paste error message]"
|
||||||
|
```
|
||||||
|
|
||||||
|
The AI will:
|
||||||
|
- ✅ Understand the error context
|
||||||
|
- ✅ Reference template patterns
|
||||||
|
- ✅ Suggest specific fixes
|
||||||
|
- ✅ Use template utilities correctly
|
||||||
|
|
||||||
|
**See also:** `.cursor/rules/error_handling.md` for how to write error messages that help AI fix issues.
|
||||||
69
.cursor/rules/data_loading.md
Normal file
69
.cursor/rules/data_loading.md
Normal file
@@ -0,0 +1,69 @@
|
|||||||
|
# Data Loading Rules
|
||||||
|
|
||||||
|
## CRITICAL: Always Use data_loader.py
|
||||||
|
|
||||||
|
**NEVER load data directly with `pd.read_csv()`. Always use:**
|
||||||
|
|
||||||
|
```python
|
||||||
|
from data_loader import load_sales_data
|
||||||
|
from config import get_data_path
|
||||||
|
df = load_sales_data(get_data_path())
|
||||||
|
```
|
||||||
|
|
||||||
|
## Why This Matters
|
||||||
|
|
||||||
|
The `data_loader.py` implements intelligent fallback logic to ensure 100% date coverage:
|
||||||
|
|
||||||
|
1. **Primary:** Parse primary date column (from config.DATE_COLUMN)
|
||||||
|
2. **Fallback 1:** Use fallback date columns if primary is missing (from config.DATE_FALLBACK_COLUMNS)
|
||||||
|
3. **Fallback 2:** Use Year column if both missing
|
||||||
|
4. **Result:** Maximum date coverage possible
|
||||||
|
|
||||||
|
## What data_loader.py Provides
|
||||||
|
|
||||||
|
- **Date Column:** Properly parsed datetime with fallback logic
|
||||||
|
- **Year:** Extracted year (100% coverage via fallback)
|
||||||
|
- **YearMonth:** Period format for monthly aggregations
|
||||||
|
- **Revenue Column:** Converted to numeric (from config.REVENUE_COLUMN)
|
||||||
|
|
||||||
|
## Column Configuration
|
||||||
|
|
||||||
|
Before using, configure column names in `config.py`:
|
||||||
|
- `REVENUE_COLUMN`: Your revenue/amount column name
|
||||||
|
- `DATE_COLUMN`: Primary date column name
|
||||||
|
- `DATE_FALLBACK_COLUMNS`: List of fallback date columns
|
||||||
|
- `CUSTOMER_COLUMN`: Customer/account column name
|
||||||
|
- Other columns as needed
|
||||||
|
|
||||||
|
## Common Mistakes
|
||||||
|
|
||||||
|
❌ **WRONG:**
|
||||||
|
```python
|
||||||
|
df = pd.read_csv('sales_data.csv')
|
||||||
|
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
|
||||||
|
df = df.dropna(subset=['Date']) # May drop significant data!
|
||||||
|
```
|
||||||
|
|
||||||
|
✅ **CORRECT:**
|
||||||
|
```python
|
||||||
|
from data_loader import load_sales_data
|
||||||
|
from config import get_data_path
|
||||||
|
df = load_sales_data(get_data_path()) # Uses fallback logic
|
||||||
|
```
|
||||||
|
|
||||||
|
## Data File Location
|
||||||
|
|
||||||
|
The data file path is configured in `config.py`:
|
||||||
|
- `DATA_FILE`: Filename (e.g., 'sales_data.csv')
|
||||||
|
- `DATA_DIR`: Optional subdirectory (defaults to current directory)
|
||||||
|
- Use `get_data_path()` to get the full path
|
||||||
|
|
||||||
|
## Validation
|
||||||
|
|
||||||
|
After loading, validate data structure:
|
||||||
|
```python
|
||||||
|
from data_loader import validate_data_structure
|
||||||
|
is_valid, msg = validate_data_structure(df)
|
||||||
|
if not is_valid:
|
||||||
|
print(f"ERROR: {msg}")
|
||||||
|
```
|
||||||
276
.cursor/rules/error_handling.md
Normal file
276
.cursor/rules/error_handling.md
Normal file
@@ -0,0 +1,276 @@
|
|||||||
|
# Error Handling Best Practices
|
||||||
|
|
||||||
|
This guide defines how to handle errors in a way that's helpful for both users and AI assistants.
|
||||||
|
|
||||||
|
## Error Message Structure
|
||||||
|
|
||||||
|
### Required Elements
|
||||||
|
|
||||||
|
Every error message should include:
|
||||||
|
1. **What went wrong** - Specific error description
|
||||||
|
2. **Where it occurred** - File/function context
|
||||||
|
3. **Why it happened** - Root cause explanation
|
||||||
|
4. **How to fix** - Actionable steps
|
||||||
|
5. **Reference** - Link to relevant documentation
|
||||||
|
|
||||||
|
### Template
|
||||||
|
|
||||||
|
```python
|
||||||
|
raise ErrorType(
|
||||||
|
f"[What] - [Specific description]\n"
|
||||||
|
f"\n"
|
||||||
|
f"Context: [Where/When this occurred]\n"
|
||||||
|
f"Reason: [Why this happened]\n"
|
||||||
|
f"\n"
|
||||||
|
f"Solution:\n"
|
||||||
|
f"1. [Step 1]\n"
|
||||||
|
f"2. [Step 2]\n"
|
||||||
|
f"\n"
|
||||||
|
f"For more help, see: [Documentation reference]"
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Common Error Patterns
|
||||||
|
|
||||||
|
### Data Loading Errors
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Good: Comprehensive error message
|
||||||
|
if REVENUE_COLUMN not in df.columns:
|
||||||
|
available_cols = list(df.columns)[:10] # Show first 10
|
||||||
|
raise ValueError(
|
||||||
|
f"Required column '{REVENUE_COLUMN}' not found in data.\n"
|
||||||
|
f"\n"
|
||||||
|
f"Context: Loading data from {filepath}\n"
|
||||||
|
f"Available columns: {available_cols}\n"
|
||||||
|
f"\n"
|
||||||
|
f"Solution:\n"
|
||||||
|
f"1. Check your CSV file column names\n"
|
||||||
|
f"2. Update REVENUE_COLUMN in config.py to match your data\n"
|
||||||
|
f"3. Run: python config_validator.py to validate configuration\n"
|
||||||
|
f"\n"
|
||||||
|
f"For more help, see: .cursor/rules/data_loading.md"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Bad: Vague error
|
||||||
|
if REVENUE_COLUMN not in df.columns:
|
||||||
|
raise ValueError("Column not found")
|
||||||
|
```
|
||||||
|
|
||||||
|
### Configuration Errors
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Good: Actionable error
|
||||||
|
if LTM_ENABLED and (LTM_START is None or LTM_END is None):
|
||||||
|
raise ValueError(
|
||||||
|
f"LTM configuration error: LTM_ENABLED is True but LTM period is not set.\n"
|
||||||
|
f"\n"
|
||||||
|
f"Context: Configuration in config.py\n"
|
||||||
|
f"Current values: LTM_ENABLED={LTM_ENABLED}, LTM_START={LTM_START}, LTM_END={LTM_END}\n"
|
||||||
|
f"\n"
|
||||||
|
f"Solution:\n"
|
||||||
|
f"1. Set LTM_START_MONTH, LTM_START_YEAR, LTM_END_MONTH, LTM_END_YEAR in config.py\n"
|
||||||
|
f"2. Or set LTM_ENABLED = False if you don't need LTM\n"
|
||||||
|
f"3. Run: python config_validator.py to check configuration\n"
|
||||||
|
f"\n"
|
||||||
|
f"For more help, see: .cursor/rules/ltm_methodology.md"
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Data Quality Errors
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Good: Helpful data quality error
|
||||||
|
if date_coverage < 0.5: # Less than 50% coverage
|
||||||
|
raise ValueError(
|
||||||
|
f"Data quality issue: Only {date_coverage:.1%} of rows have valid dates.\n"
|
||||||
|
f"\n"
|
||||||
|
f"Context: Date parsing in data_loader.py\n"
|
||||||
|
f"Rows with dates: {date_count:,} / {total_rows:,}\n"
|
||||||
|
f"\n"
|
||||||
|
f"Solution:\n"
|
||||||
|
f"1. Check date format in your CSV file\n"
|
||||||
|
f"2. Add fallback date columns to DATE_FALLBACK_COLUMNS in config.py\n"
|
||||||
|
f"3. Ensure at least one date column (Month, Year) exists\n"
|
||||||
|
f"4. Run: python data_quality.py to analyze data quality\n"
|
||||||
|
f"\n"
|
||||||
|
f"For more help, see: .cursor/rules/data_loading.md"
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Error Handling Patterns
|
||||||
|
|
||||||
|
### Try-Except with Context
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Good: Provides context and recovery options
|
||||||
|
try:
|
||||||
|
df = load_sales_data(get_data_path())
|
||||||
|
except FileNotFoundError as e:
|
||||||
|
error_msg = (
|
||||||
|
f"Data file not found: {e}\n"
|
||||||
|
f"\n"
|
||||||
|
f"Context: Attempting to load data for analysis\n"
|
||||||
|
f"Expected file: {get_data_path()}\n"
|
||||||
|
f"\n"
|
||||||
|
f"Solution:\n"
|
||||||
|
f"1. Check that your CSV file exists at the expected location\n"
|
||||||
|
f"2. Update DATA_FILE in config.py with correct filename\n"
|
||||||
|
f"3. Or update DATA_DIR if file is in a subdirectory\n"
|
||||||
|
f"4. Run: python setup_wizard.py to reconfigure\n"
|
||||||
|
f"\n"
|
||||||
|
f"For more help, see: .cursor/rules/common_errors.md"
|
||||||
|
)
|
||||||
|
raise FileNotFoundError(error_msg) from e
|
||||||
|
```
|
||||||
|
|
||||||
|
### Validation with Helpful Messages
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Good: Validates and provides specific guidance
|
||||||
|
def validate_data_structure(df: pd.DataFrame) -> Tuple[bool, str]:
|
||||||
|
"""
|
||||||
|
Validate DataFrame has required structure
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Tuple[bool, str]: (is_valid, error_message)
|
||||||
|
If is_valid is False, error_message contains actionable guidance
|
||||||
|
"""
|
||||||
|
errors = []
|
||||||
|
|
||||||
|
if REVENUE_COLUMN not in df.columns:
|
||||||
|
errors.append(
|
||||||
|
f"Missing required column '{REVENUE_COLUMN}'. "
|
||||||
|
f"Update REVENUE_COLUMN in config.py to match your data."
|
||||||
|
)
|
||||||
|
|
||||||
|
if DATE_COLUMN not in df.columns:
|
||||||
|
errors.append(
|
||||||
|
f"Missing required column '{DATE_COLUMN}'. "
|
||||||
|
f"Update DATE_COLUMN in config.py or add fallback columns."
|
||||||
|
)
|
||||||
|
|
||||||
|
if len(df) == 0:
|
||||||
|
errors.append(
|
||||||
|
f"DataFrame is empty. Check date filters (MIN_YEAR, MAX_DATE) in config.py."
|
||||||
|
)
|
||||||
|
|
||||||
|
if errors:
|
||||||
|
error_msg = "Data validation failed:\n" + "\n".join(f" - {e}" for e in errors)
|
||||||
|
error_msg += "\n\nRun: python config_validator.py for detailed validation"
|
||||||
|
return False, error_msg
|
||||||
|
|
||||||
|
return True, "OK"
|
||||||
|
```
|
||||||
|
|
||||||
|
## Warning Messages
|
||||||
|
|
||||||
|
### When to Use Warnings
|
||||||
|
|
||||||
|
Use warnings (not errors) for:
|
||||||
|
- Non-critical data quality issues
|
||||||
|
- Optional features that aren't configured
|
||||||
|
- Deprecated functionality
|
||||||
|
- Performance considerations
|
||||||
|
|
||||||
|
### Warning Format
|
||||||
|
|
||||||
|
```python
|
||||||
|
import warnings
|
||||||
|
|
||||||
|
# Good: Informative warning
|
||||||
|
if date_coverage < 0.9: # Less than 90% but not critical
|
||||||
|
warnings.warn(
|
||||||
|
f"Date coverage is {date_coverage:.1%} ({missing_count:,} rows missing dates).\n"
|
||||||
|
f"Consider adding fallback date columns to improve coverage.\n"
|
||||||
|
f"See .cursor/rules/data_loading.md for details.",
|
||||||
|
UserWarning
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Logging Errors
|
||||||
|
|
||||||
|
### Use Structured Logging
|
||||||
|
|
||||||
|
```python
|
||||||
|
from logger_config import get_logger
|
||||||
|
|
||||||
|
logger = get_logger('analysis_name')
|
||||||
|
|
||||||
|
try:
|
||||||
|
df = load_sales_data(get_data_path())
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(
|
||||||
|
f"Failed to load data: {e}",
|
||||||
|
exc_info=True, # Include stack trace
|
||||||
|
extra={
|
||||||
|
'file_path': str(get_data_path()),
|
||||||
|
'config_file': 'config.py',
|
||||||
|
'suggestion': 'Run config_validator.py to check configuration'
|
||||||
|
}
|
||||||
|
)
|
||||||
|
raise
|
||||||
|
```
|
||||||
|
|
||||||
|
## AI-Friendly Error Messages
|
||||||
|
|
||||||
|
### Help AI Understand and Fix
|
||||||
|
|
||||||
|
Error messages should help AI assistants:
|
||||||
|
1. Understand what went wrong
|
||||||
|
2. Know where to look for fixes
|
||||||
|
3. Suggest specific solutions
|
||||||
|
4. Reference relevant documentation
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Good: AI can parse and act on this
|
||||||
|
if column not in df.columns:
|
||||||
|
raise ValueError(
|
||||||
|
f"Column '{column}' not found.\n"
|
||||||
|
f"Available: {list(df.columns)}\n"
|
||||||
|
f"Fix: Update {column}_COLUMN in config.py\n"
|
||||||
|
f"See: .cursor/rules/data_loading.md"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Bad: AI has no context
|
||||||
|
if column not in df.columns:
|
||||||
|
raise ValueError("Not found")
|
||||||
|
```
|
||||||
|
|
||||||
|
## Error Recovery
|
||||||
|
|
||||||
|
### Provide Recovery Options
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Good: Offers recovery path
|
||||||
|
def load_sales_data(filepath=None):
|
||||||
|
try:
|
||||||
|
df = pd.read_csv(filepath)
|
||||||
|
except FileNotFoundError:
|
||||||
|
# Suggest alternatives
|
||||||
|
suggestions = [
|
||||||
|
f"1. Check file path: {filepath}",
|
||||||
|
f"2. Update DATA_FILE in config.py",
|
||||||
|
f"3. Run: python setup_wizard.py",
|
||||||
|
f"4. Generate sample data: python generate_sample_data.py"
|
||||||
|
]
|
||||||
|
raise FileNotFoundError(
|
||||||
|
f"Data file not found: {filepath}\n"
|
||||||
|
f"\n"
|
||||||
|
f"Options:\n" + "\n".join(suggestions)
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Summary
|
||||||
|
|
||||||
|
Good error handling:
|
||||||
|
- ✅ Specific and actionable
|
||||||
|
- ✅ Provides context
|
||||||
|
- ✅ Suggests solutions
|
||||||
|
- ✅ References documentation
|
||||||
|
- ✅ Helps both users and AI assistants
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
**Last Updated:** January 2026
|
||||||
|
**For:** Error handling in sales_analysis_template
|
||||||
89
.cursor/rules/ltm_methodology.md
Normal file
89
.cursor/rules/ltm_methodology.md
Normal file
@@ -0,0 +1,89 @@
|
|||||||
|
# LTM (Last Twelve Months) Methodology Rules
|
||||||
|
|
||||||
|
## ⭐ RECOMMENDED: Use analysis_utils.py
|
||||||
|
|
||||||
|
**Prefer utility functions:**
|
||||||
|
```python
|
||||||
|
from analysis_utils import get_ltm_period_config, get_annual_data, calculate_annual_metrics
|
||||||
|
from config import get_ltm_period, get_ltm_label
|
||||||
|
|
||||||
|
ltm_start, ltm_end = get_ltm_period_config()
|
||||||
|
year_data, year_label = get_annual_data(df, 2025, ltm_start, ltm_end)
|
||||||
|
```
|
||||||
|
|
||||||
|
## What is LTM?
|
||||||
|
|
||||||
|
**LTM (Last Twelve Months)** = Rolling 12-month period for the most recent partial year
|
||||||
|
- **Purpose:** Apples-to-apples comparison with full calendar years
|
||||||
|
- **Example:** If latest data is through September 2025, use Oct 2024 - Sep 2025 (12 months)
|
||||||
|
|
||||||
|
## When to Use LTM
|
||||||
|
|
||||||
|
- **Full calendar years (2021-2024):** Use complete year data
|
||||||
|
- **Most recent partial year (2025):** Use LTM if you only have partial year data
|
||||||
|
- **Complete years only:** Disable LTM in config if all years are complete
|
||||||
|
|
||||||
|
## Configuration
|
||||||
|
|
||||||
|
**Configure in config.py:**
|
||||||
|
```python
|
||||||
|
LTM_ENABLED = True # Set to False if all years are complete
|
||||||
|
LTM_START_MONTH = 10 # Month number (1-12)
|
||||||
|
LTM_START_YEAR = 2024
|
||||||
|
LTM_END_MONTH = 9
|
||||||
|
LTM_END_YEAR = 2025
|
||||||
|
```
|
||||||
|
|
||||||
|
## Implementation Pattern
|
||||||
|
|
||||||
|
```python
|
||||||
|
from analysis_utils import get_ltm_period_config, get_annual_data
|
||||||
|
|
||||||
|
ltm_start, ltm_end = get_ltm_period_config()
|
||||||
|
|
||||||
|
for year in sorted(df['Year'].unique()):
|
||||||
|
year_data, year_label = get_annual_data(df, year, ltm_start, ltm_end)
|
||||||
|
# year_label will be "2025 (LTM 9/2025)" for LTM year, or "2025" for regular year
|
||||||
|
```
|
||||||
|
|
||||||
|
## Labeling Requirements
|
||||||
|
|
||||||
|
**ALWAYS label LTM year with notation in:**
|
||||||
|
- Chart titles
|
||||||
|
- Chart x-axis labels
|
||||||
|
- Table headers
|
||||||
|
- Print statements
|
||||||
|
- Report text
|
||||||
|
|
||||||
|
**Example:**
|
||||||
|
```python
|
||||||
|
from config import get_ltm_label
|
||||||
|
|
||||||
|
ltm_label = get_ltm_label() # Returns "2025 (LTM 9/2025)" or None
|
||||||
|
if ltm_label:
|
||||||
|
title = f'Annual Revenue Trend\n({ltm_label})'
|
||||||
|
```
|
||||||
|
|
||||||
|
## Common Mistakes
|
||||||
|
|
||||||
|
❌ **WRONG:**
|
||||||
|
```python
|
||||||
|
year_2025_data = df[df['Year'] == 2025] # Uses partial year (not comparable)
|
||||||
|
```
|
||||||
|
|
||||||
|
✅ **CORRECT:**
|
||||||
|
```python
|
||||||
|
from analysis_utils import get_annual_data, get_ltm_period_config
|
||||||
|
ltm_start, ltm_end = get_ltm_period_config()
|
||||||
|
year_2025_data, year_label = get_annual_data(df, 2025, ltm_start, ltm_end)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Disabling LTM
|
||||||
|
|
||||||
|
If all years in your analysis are complete calendar years:
|
||||||
|
```python
|
||||||
|
# In config.py:
|
||||||
|
LTM_ENABLED = False
|
||||||
|
```
|
||||||
|
|
||||||
|
Then all years will be treated as full calendar years.
|
||||||
203
EXAMPLES.md
Normal file
203
EXAMPLES.md
Normal file
@@ -0,0 +1,203 @@
|
|||||||
|
# Example Analysis Scripts
|
||||||
|
|
||||||
|
The `examples/` directory contains working example analysis scripts that demonstrate how to use the sales analysis template framework.
|
||||||
|
|
||||||
|
## Available Examples
|
||||||
|
|
||||||
|
### 1. Annual Revenue Trend (`examples/annual_revenue_trend.py`)
|
||||||
|
|
||||||
|
**Purpose:** Simple annual revenue analysis with LTM support
|
||||||
|
|
||||||
|
**What it demonstrates:**
|
||||||
|
- Loading data using `data_loader`
|
||||||
|
- Calculating annual metrics with LTM
|
||||||
|
- Creating a revenue trend chart
|
||||||
|
- Following template best practices
|
||||||
|
|
||||||
|
**Usage:**
|
||||||
|
```bash
|
||||||
|
python examples/annual_revenue_trend.py
|
||||||
|
```
|
||||||
|
|
||||||
|
**Output:**
|
||||||
|
- Chart: `charts/annual_revenue_trend.png`
|
||||||
|
- Console output with annual revenue summary
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 2. Customer Segmentation (`examples/customer_segmentation.py`)
|
||||||
|
|
||||||
|
**Purpose:** Customer segmentation using RFM (Recency, Frequency, Monetary) methodology
|
||||||
|
|
||||||
|
**What it demonstrates:**
|
||||||
|
- Customer-level aggregation
|
||||||
|
- RFM scoring and segmentation
|
||||||
|
- Segment analysis and visualization
|
||||||
|
- Multiple chart generation
|
||||||
|
|
||||||
|
**Usage:**
|
||||||
|
```bash
|
||||||
|
python examples/customer_segmentation.py
|
||||||
|
```
|
||||||
|
|
||||||
|
**Output:**
|
||||||
|
- Chart: `charts/customer_segmentation.png`
|
||||||
|
- Console output with segment summary
|
||||||
|
|
||||||
|
**Segments:**
|
||||||
|
- **Champions:** High recency, frequency, and monetary value
|
||||||
|
- **Loyal Customers:** Regular customers with good value
|
||||||
|
- **At Risk:** Recent but declining frequency
|
||||||
|
- **Hibernating:** Low recency, may need reactivation
|
||||||
|
- **Potential Loyalists:** Good recency and frequency, lower value
|
||||||
|
- **Need Attention:** Mixed signals, need engagement
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 3. Product Performance (`examples/product_performance.py`)
|
||||||
|
|
||||||
|
**Purpose:** Product mix and performance analysis
|
||||||
|
|
||||||
|
**What it demonstrates:**
|
||||||
|
- Product-level aggregation
|
||||||
|
- Product performance metrics
|
||||||
|
- Top products identification
|
||||||
|
- Product mix visualization
|
||||||
|
|
||||||
|
**Usage:**
|
||||||
|
```bash
|
||||||
|
python examples/product_performance.py
|
||||||
|
```
|
||||||
|
|
||||||
|
**Output:**
|
||||||
|
- Chart: `charts/product_performance.png`
|
||||||
|
- Console output with top products summary
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## How to Use Examples
|
||||||
|
|
||||||
|
### Step 1: Configure Template
|
||||||
|
|
||||||
|
Before running examples, ensure your template is configured:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python setup_wizard.py
|
||||||
|
```
|
||||||
|
|
||||||
|
Or manually update `config.py` with your data file and column mappings.
|
||||||
|
|
||||||
|
### Step 2: Prepare Data
|
||||||
|
|
||||||
|
Place your sales data CSV file in the template directory, or update `DATA_DIR` in `config.py`.
|
||||||
|
|
||||||
|
Alternatively, generate sample data for testing:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python generate_sample_data.py
|
||||||
|
```
|
||||||
|
|
||||||
|
### Step 3: Run Example
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python examples/annual_revenue_trend.py
|
||||||
|
```
|
||||||
|
|
||||||
|
### Step 4: Customize
|
||||||
|
|
||||||
|
Copy an example script and modify it for your needs:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cp examples/annual_revenue_trend.py my_analysis.py
|
||||||
|
# Edit my_analysis.py
|
||||||
|
python my_analysis.py
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Example Patterns
|
||||||
|
|
||||||
|
### Pattern 1: Simple Annual Analysis
|
||||||
|
|
||||||
|
```python
|
||||||
|
from data_loader import load_sales_data
|
||||||
|
from analysis_utils import calculate_annual_metrics, get_ltm_period_config
|
||||||
|
from config import get_data_path, REVENUE_COLUMN
|
||||||
|
|
||||||
|
df = load_sales_data(get_data_path())
|
||||||
|
ltm_start, ltm_end = get_ltm_period_config()
|
||||||
|
|
||||||
|
def calculate_metrics(year_data):
|
||||||
|
return {'Revenue': year_data[REVENUE_COLUMN].sum()}
|
||||||
|
|
||||||
|
annual_df = calculate_annual_metrics(df, calculate_metrics, ltm_start, ltm_end)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Pattern 2: Customer-Level Analysis
|
||||||
|
|
||||||
|
```python
|
||||||
|
from config import CUSTOMER_COLUMN, REVENUE_COLUMN, DATE_COLUMN
|
||||||
|
|
||||||
|
customer_metrics = df.groupby(CUSTOMER_COLUMN).agg({
|
||||||
|
REVENUE_COLUMN: 'sum',
|
||||||
|
DATE_COLUMN: 'count'
|
||||||
|
}).reset_index()
|
||||||
|
```
|
||||||
|
|
||||||
|
### Pattern 3: Product-Level Analysis
|
||||||
|
|
||||||
|
```python
|
||||||
|
from config import ITEM_COLUMN, REVENUE_COLUMN
|
||||||
|
|
||||||
|
product_metrics = df.groupby(ITEM_COLUMN)[REVENUE_COLUMN].sum().sort_values(ascending=False)
|
||||||
|
top_10 = product_metrics.head(10)
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Learning Path
|
||||||
|
|
||||||
|
1. **Start with:** `annual_revenue_trend.py` - Simplest example
|
||||||
|
2. **Then try:** `product_performance.py` - More complex aggregation
|
||||||
|
3. **Advanced:** `customer_segmentation.py` - Multi-step analysis with custom logic
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
**"Module not found" errors:**
|
||||||
|
- Ensure you're running from the template root directory
|
||||||
|
- Check that all template files are present
|
||||||
|
|
||||||
|
**"Data file not found" errors:**
|
||||||
|
- Run `setup_wizard.py` to configure data file path
|
||||||
|
- Or update `DATA_FILE` in `config.py`
|
||||||
|
|
||||||
|
**"Column not found" errors:**
|
||||||
|
- Update column mappings in `config.py`
|
||||||
|
- Run `python config_validator.py` to check configuration
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Advanced Examples
|
||||||
|
|
||||||
|
For more sophisticated analyses, see:
|
||||||
|
- `.cursor/rules/advanced_analysis_patterns.md` - Advanced analysis patterns
|
||||||
|
- `.cursor/rules/ai_assistant_guide.md` - How to use Cursor AI effectively
|
||||||
|
|
||||||
|
## Next Steps
|
||||||
|
|
||||||
|
After running examples:
|
||||||
|
|
||||||
|
1. Review the generated charts
|
||||||
|
2. Examine the code to understand patterns
|
||||||
|
3. Copy an example and customize for your analysis
|
||||||
|
4. Check `.cursor/rules/analysis_patterns.md` for more patterns
|
||||||
|
5. Read `.cursor/rules/advanced_analysis_patterns.md` for advanced techniques
|
||||||
|
6. Use Cursor AI with prompts from `ai_assistant_guide.md`
|
||||||
|
7. Read `README.md` for comprehensive documentation
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
**Last Updated:** January 2026
|
||||||
|
**Template Version:** 1.0
|
||||||
175
QUICK_START.md
Normal file
175
QUICK_START.md
Normal file
@@ -0,0 +1,175 @@
|
|||||||
|
# Quick Start Guide
|
||||||
|
|
||||||
|
**For Cursor Users:** This template is optimized for Cursor AI. Just ask: *"Create a revenue analysis using the template"* and the AI will handle everything.
|
||||||
|
|
||||||
|
## 🚀 Get Started in 5 Minutes
|
||||||
|
|
||||||
|
### Step 1: Install Dependencies
|
||||||
|
```bash
|
||||||
|
pip install -r requirements.txt
|
||||||
|
```
|
||||||
|
|
||||||
|
### Step 2: Run Setup Wizard
|
||||||
|
```bash
|
||||||
|
python setup_wizard.py
|
||||||
|
```
|
||||||
|
|
||||||
|
The wizard will ask you:
|
||||||
|
- Company name
|
||||||
|
- Data file location
|
||||||
|
- Column names in your CSV
|
||||||
|
- Date range
|
||||||
|
- LTM configuration (if needed)
|
||||||
|
|
||||||
|
### Step 3: Test Data Loading
|
||||||
|
```bash
|
||||||
|
python -c "from data_loader import load_sales_data; from config import get_data_path; df = load_sales_data(get_data_path()); print(f'✓ Loaded {len(df):,} rows')"
|
||||||
|
```
|
||||||
|
|
||||||
|
### Step 4: Run Example Analysis (Recommended)
|
||||||
|
```bash
|
||||||
|
# Try an example first to see how it works
|
||||||
|
python examples/annual_revenue_trend.py
|
||||||
|
```
|
||||||
|
|
||||||
|
### Step 5: Create Your First Analysis
|
||||||
|
```bash
|
||||||
|
cp analysis_template.py my_analysis.py
|
||||||
|
# Or copy an example
|
||||||
|
cp examples/annual_revenue_trend.py my_analysis.py
|
||||||
|
# Edit my_analysis.py
|
||||||
|
python my_analysis.py
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 📋 Essential Configuration Checklist
|
||||||
|
|
||||||
|
Before running analyses, verify in `config.py`:
|
||||||
|
|
||||||
|
- [ ] `COMPANY_NAME` - Your company name
|
||||||
|
- [ ] `DATA_FILE` - Your CSV filename
|
||||||
|
- [ ] `REVENUE_COLUMN` - Your revenue column name
|
||||||
|
- [ ] `DATE_COLUMN` - Your date column name
|
||||||
|
- [ ] `CUSTOMER_COLUMN` - Your customer column name
|
||||||
|
- [ ] `ANALYSIS_YEARS` - Years to include
|
||||||
|
- [ ] `MIN_YEAR` and `MAX_DATE` - Date range
|
||||||
|
- [ ] `LTM_ENABLED` - Set to False if all years complete
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 💡 Common Patterns
|
||||||
|
|
||||||
|
### Load Data
|
||||||
|
```python
|
||||||
|
from data_loader import load_sales_data
|
||||||
|
from config import get_data_path
|
||||||
|
|
||||||
|
df = load_sales_data(get_data_path())
|
||||||
|
```
|
||||||
|
|
||||||
|
### Calculate Annual Metrics
|
||||||
|
```python
|
||||||
|
from analysis_utils import calculate_annual_metrics, get_ltm_period_config
|
||||||
|
from config import REVENUE_COLUMN
|
||||||
|
|
||||||
|
ltm_start, ltm_end = get_ltm_period_config()
|
||||||
|
|
||||||
|
def calculate_metrics(year_data):
|
||||||
|
return {'Revenue': year_data[REVENUE_COLUMN].sum()}
|
||||||
|
|
||||||
|
annual_df = calculate_annual_metrics(df, calculate_metrics, ltm_start, ltm_end)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Create Chart
|
||||||
|
```python
|
||||||
|
from analysis_utils import setup_revenue_chart, save_chart
|
||||||
|
from config import CHART_SIZES
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
|
||||||
|
fig, ax = plt.subplots(figsize=CHART_SIZES['medium'])
|
||||||
|
ax.plot(data / 1e6, ...) # Divide by 1e6!
|
||||||
|
setup_revenue_chart(ax)
|
||||||
|
save_chart(fig, 'chart.png')
|
||||||
|
plt.close()
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## ⚠️ Critical Rules
|
||||||
|
|
||||||
|
1. **ALWAYS use `data_loader.py`** - Never `pd.read_csv()` directly
|
||||||
|
2. **ALWAYS divide by 1e6** before plotting revenue
|
||||||
|
3. **ALWAYS use `setup_revenue_chart()`** for revenue charts
|
||||||
|
4. **ALWAYS use config values** - Never hardcode column names
|
||||||
|
5. **ALWAYS validate data** after loading
|
||||||
|
|
||||||
|
## 💡 New Utilities
|
||||||
|
|
||||||
|
### Data Quality Check
|
||||||
|
```bash
|
||||||
|
python -c "from data_quality import generate_data_quality_report, print_data_quality_report; from data_loader import load_sales_data; from config import get_data_path; df = load_sales_data(get_data_path()); report = generate_data_quality_report(df); print_data_quality_report(report)"
|
||||||
|
```
|
||||||
|
|
||||||
|
### Configuration Validation
|
||||||
|
```bash
|
||||||
|
python config_validator.py
|
||||||
|
```
|
||||||
|
|
||||||
|
### Export Results
|
||||||
|
```python
|
||||||
|
from export_utils import export_to_excel
|
||||||
|
export_to_excel(df, 'results.xlsx')
|
||||||
|
```
|
||||||
|
|
||||||
|
### Generate Sample Data
|
||||||
|
```bash
|
||||||
|
python generate_sample_data.py
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 🐛 Quick Troubleshooting
|
||||||
|
|
||||||
|
**"Data file not found"**
|
||||||
|
→ Check `DATA_FILE` in config.py
|
||||||
|
|
||||||
|
**"Column not found"**
|
||||||
|
→ Update column mappings in config.py
|
||||||
|
|
||||||
|
**Charts show 1e8 (scientific notation)**
|
||||||
|
→ Divide by 1e6 before plotting: `ax.plot(data / 1e6, ...)`
|
||||||
|
|
||||||
|
**"DataFrame is empty"**
|
||||||
|
→ Check `MIN_YEAR`, `MAX_DATE`, and `ANALYSIS_YEARS` in config.py
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 🎯 Using Cursor AI (Recommended)
|
||||||
|
|
||||||
|
This template is optimized for Cursor. Instead of manual setup, just ask:
|
||||||
|
|
||||||
|
```
|
||||||
|
"Create a revenue trend analysis using template patterns"
|
||||||
|
```
|
||||||
|
|
||||||
|
The AI will:
|
||||||
|
- ✅ Use all template utilities automatically
|
||||||
|
- ✅ Follow best practices
|
||||||
|
- ✅ Include proper validation
|
||||||
|
- ✅ Generate production-ready code
|
||||||
|
|
||||||
|
**See:** `.cursor/rules/ai_assistant_guide.md` for complete prompt library
|
||||||
|
|
||||||
|
## 📚 Next Steps
|
||||||
|
|
||||||
|
- **Run examples:** Try `examples/annual_revenue_trend.py` to see it in action
|
||||||
|
- **Check data quality:** Run `python data_quality.py` to analyze your data
|
||||||
|
- **Validate config:** Run `python config_validator.py` to check configuration
|
||||||
|
- **Read documentation:** See `README.md` for comprehensive guide
|
||||||
|
- **Review patterns:** Check `.cursor/rules/` for detailed patterns
|
||||||
|
- **See examples:** Check `EXAMPLES.md` for example script guide
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
**Need help?** Check `.cursor/rules/common_errors.md` for detailed troubleshooting.
|
||||||
589
README.md
Normal file
589
README.md
Normal file
@@ -0,0 +1,589 @@
|
|||||||
|
# Sales Analysis Template
|
||||||
|
|
||||||
|
**A best-in-class, reusable template for sales invoice detail analysis**
|
||||||
|
|
||||||
|
**Optimized for Cursor AI** - Just ask the AI to create analyses and it handles everything automatically.
|
||||||
|
|
||||||
|
This template provides a complete framework for analyzing sales data from any company. It's designed to be:
|
||||||
|
- **Flexible:** Works with different column names, date formats, and data structures
|
||||||
|
- **Automated:** Interactive setup wizard configures everything for your company
|
||||||
|
- **AI-Optimized:** Fully optimized for Cursor - AI knows all patterns and generates code automatically
|
||||||
|
- **Production-Ready:** Includes error handling, validation, and best practices
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 🚀 Quick Start
|
||||||
|
|
||||||
|
### 1. Setup (Automated)
|
||||||
|
|
||||||
|
Run the interactive setup wizard:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python setup_wizard.py
|
||||||
|
```
|
||||||
|
|
||||||
|
The wizard will ask you about:
|
||||||
|
- Company name and analysis date
|
||||||
|
- Data file location
|
||||||
|
- Column names in your CSV
|
||||||
|
- Date range and LTM configuration
|
||||||
|
- Exclusion filters (if needed)
|
||||||
|
|
||||||
|
### 2. Manual Setup (Alternative)
|
||||||
|
|
||||||
|
If you prefer to configure manually:
|
||||||
|
|
||||||
|
1. **Update `config.py`** with your company-specific settings:
|
||||||
|
- `COMPANY_NAME`: Your company name
|
||||||
|
- `DATA_FILE`: Your CSV filename
|
||||||
|
- `REVENUE_COLUMN`: Your revenue/amount column name
|
||||||
|
- `DATE_COLUMN`: Your primary date column
|
||||||
|
- Column mappings for Customer, Item, etc.
|
||||||
|
- Date range and LTM settings
|
||||||
|
|
||||||
|
2. **Place your data file** in the template directory (or update `DATA_DIR` in config.py)
|
||||||
|
|
||||||
|
### 3. Test Data Loading
|
||||||
|
|
||||||
|
Verify your configuration works:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python -c "from data_loader import load_sales_data; from config import get_data_path; df = load_sales_data(get_data_path()); print(f'Loaded {len(df):,} rows')"
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4. Create Your First Analysis
|
||||||
|
|
||||||
|
Copy the template and customize:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cp analysis_template.py my_first_analysis.py
|
||||||
|
# Edit my_first_analysis.py with your analysis logic
|
||||||
|
python my_first_analysis.py
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 📁 Project Structure
|
||||||
|
|
||||||
|
```
|
||||||
|
sales_analysis_template/
|
||||||
|
├── README.md # This file
|
||||||
|
├── QUICK_START.md # Quick start guide
|
||||||
|
├── TEMPLATE_OVERVIEW.md # High-level overview
|
||||||
|
├── TEMPLATE_SUMMARY.md # Comprehensive template summary
|
||||||
|
├── EXAMPLES.md # Example scripts guide
|
||||||
|
├── SETUP_CHECKLIST.md # Setup verification checklist
|
||||||
|
├── requirements.txt # Python dependencies
|
||||||
|
├── setup_wizard.py # Interactive setup wizard
|
||||||
|
│
|
||||||
|
├── config.py # ⭐ Configuration (customize for your company)
|
||||||
|
├── config_validator.py # Configuration validation utility
|
||||||
|
│
|
||||||
|
├── data_loader.py # ⭐ Data loading with fallback logic
|
||||||
|
├── data_quality.py # Data quality reporting
|
||||||
|
├── data_processing.py # Data transformation utilities
|
||||||
|
│
|
||||||
|
├── analysis_utils.py # ⭐ Common utilities (formatters, LTM, helpers)
|
||||||
|
├── statistical_utils.py # Statistical analysis utilities
|
||||||
|
├── validate_revenue.py # Revenue validation utility
|
||||||
|
│
|
||||||
|
├── export_utils.py # Export to CSV/Excel
|
||||||
|
├── report_generator.py # PDF report generation
|
||||||
|
├── logger_config.py # Logging configuration
|
||||||
|
│
|
||||||
|
├── analysis_template.py # Template for creating new analyses
|
||||||
|
├── run_all_analyses.py # Batch runner for all scripts
|
||||||
|
├── generate_sample_data.py # Generate sample data for testing
|
||||||
|
│
|
||||||
|
├── examples/ # Example analysis scripts
|
||||||
|
│ ├── annual_revenue_trend.py # Simple annual revenue analysis
|
||||||
|
│ ├── customer_segmentation.py # RFM customer segmentation
|
||||||
|
│ ├── cohort_analysis.py # Customer cohort analysis
|
||||||
|
│ └── product_performance.py # Product performance analysis
|
||||||
|
│
|
||||||
|
├── tests/ # Unit tests
|
||||||
|
│ ├── test_data_loader.py # Data loader tests
|
||||||
|
│ ├── test_analysis_utils.py # Analysis utils tests
|
||||||
|
│ └── test_config_validator.py # Config validator tests
|
||||||
|
│
|
||||||
|
└── .cursor/
|
||||||
|
└── rules/ # Cursor IDE rules (auto-loaded)
|
||||||
|
├── ai_assistant_guide.md # Complete AI assistant guide
|
||||||
|
├── advanced_analysis_patterns.md # Advanced techniques
|
||||||
|
├── analysis_patterns.md # Common analysis patterns
|
||||||
|
├── chart_formatting.md # Chart formatting rules
|
||||||
|
├── code_quality.md # Code quality standards
|
||||||
|
├── common_errors.md # Error troubleshooting
|
||||||
|
├── data_loading.md # Data loading patterns
|
||||||
|
├── error_handling.md # Error handling patterns
|
||||||
|
└── ltm_methodology.md # LTM methodology
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 🔧 Configuration Guide
|
||||||
|
|
||||||
|
### Required Configuration
|
||||||
|
|
||||||
|
**In `config.py`, you MUST configure:**
|
||||||
|
|
||||||
|
1. **Company Information:**
|
||||||
|
```python
|
||||||
|
COMPANY_NAME = "Your Company Name"
|
||||||
|
```
|
||||||
|
|
||||||
|
2. **Data File:**
|
||||||
|
```python
|
||||||
|
DATA_FILE = 'your_sales_data.csv'
|
||||||
|
```
|
||||||
|
|
||||||
|
3. **Column Mappings:**
|
||||||
|
```python
|
||||||
|
REVENUE_COLUMN = 'USD' # Your revenue column name
|
||||||
|
DATE_COLUMN = 'InvoiceDate' # Your date column name
|
||||||
|
CUSTOMER_COLUMN = 'Customer' # Your customer column name
|
||||||
|
```
|
||||||
|
|
||||||
|
4. **Date Range:**
|
||||||
|
```python
|
||||||
|
MIN_YEAR = 2021
|
||||||
|
MAX_DATE = pd.Timestamp('2025-09-30')
|
||||||
|
ANALYSIS_YEARS = [2021, 2022, 2023, 2024, 2025]
|
||||||
|
```
|
||||||
|
|
||||||
|
### Optional Configuration
|
||||||
|
|
||||||
|
**LTM (Last Twelve Months):**
|
||||||
|
```python
|
||||||
|
LTM_ENABLED = True # Set to False if all years are complete
|
||||||
|
LTM_START_MONTH = 10
|
||||||
|
LTM_START_YEAR = 2024
|
||||||
|
LTM_END_MONTH = 9
|
||||||
|
LTM_END_YEAR = 2025
|
||||||
|
```
|
||||||
|
|
||||||
|
**Exclusion Filters:**
|
||||||
|
```python
|
||||||
|
EXCLUSION_FILTERS = {
|
||||||
|
'enabled': True,
|
||||||
|
'exclude_by_column': 'Country',
|
||||||
|
'exclude_values': ['Test', 'KVT']
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**See `config.py` for all available options and detailed comments.**
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 📊 Data Requirements
|
||||||
|
|
||||||
|
### Required Columns
|
||||||
|
|
||||||
|
Your CSV file must have:
|
||||||
|
- **Revenue column:** A numeric column with sales amounts (configured as `REVENUE_COLUMN`)
|
||||||
|
- **Date column:** At least one date column (configured as `DATE_COLUMN`)
|
||||||
|
|
||||||
|
### Recommended Columns
|
||||||
|
|
||||||
|
For full analysis capabilities, include:
|
||||||
|
- **Customer/Account:** For customer segmentation and analysis
|
||||||
|
- **Item/Product:** For product analysis
|
||||||
|
- **Quantity:** For price calculations
|
||||||
|
- **Geographic:** Region, Country for geographic analysis
|
||||||
|
- **Segments:** Technology, EndMarket, ProductGroup for segmentation
|
||||||
|
|
||||||
|
### Date Column Fallback
|
||||||
|
|
||||||
|
The data loader supports fallback logic:
|
||||||
|
1. **Primary:** Uses `DATE_COLUMN` (e.g., InvoiceDate)
|
||||||
|
2. **Fallback 1:** Uses columns in `DATE_FALLBACK_COLUMNS` (e.g., Month, Year)
|
||||||
|
3. **Fallback 2:** Constructs from Year column if available
|
||||||
|
|
||||||
|
This ensures maximum date coverage even if some rows have missing dates.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 💻 Creating Analysis Scripts
|
||||||
|
|
||||||
|
### Using the Template
|
||||||
|
|
||||||
|
1. **Copy the template:**
|
||||||
|
```bash
|
||||||
|
cp analysis_template.py my_analysis.py
|
||||||
|
```
|
||||||
|
|
||||||
|
2. **Update configuration:**
|
||||||
|
```python
|
||||||
|
ANALYSIS_NAME = "My Analysis"
|
||||||
|
DESCRIPTION = "Description of what this analysis does"
|
||||||
|
```
|
||||||
|
|
||||||
|
3. **Implement your logic:**
|
||||||
|
- Use `calculate_annual_metrics()` for annual aggregations
|
||||||
|
- Use `setup_revenue_chart()` and `save_chart()` for visualizations
|
||||||
|
- Follow patterns from `.cursor/rules/analysis_patterns.md`
|
||||||
|
|
||||||
|
4. **Run your analysis:**
|
||||||
|
```bash
|
||||||
|
python my_analysis.py
|
||||||
|
```
|
||||||
|
|
||||||
|
### Standard Pattern
|
||||||
|
|
||||||
|
```python
|
||||||
|
from data_loader import load_sales_data, validate_data_structure
|
||||||
|
from analysis_utils import (
|
||||||
|
get_ltm_period_config, calculate_annual_metrics,
|
||||||
|
setup_revenue_chart, save_chart, apply_exclusion_filters
|
||||||
|
)
|
||||||
|
from config import get_data_path, REVENUE_COLUMN, CHART_SIZES
|
||||||
|
|
||||||
|
# Load and validate
|
||||||
|
df = load_sales_data(get_data_path())
|
||||||
|
is_valid, msg = validate_data_structure(df)
|
||||||
|
if not is_valid:
|
||||||
|
print(f"ERROR: {msg}")
|
||||||
|
    raise SystemExit(1)  # bare 'return' is invalid at module level; exit instead
|
||||||
|
|
||||||
|
# Apply filters
|
||||||
|
df = apply_exclusion_filters(df)
|
||||||
|
|
||||||
|
# Calculate metrics
|
||||||
|
ltm_start, ltm_end = get_ltm_period_config()
|
||||||
|
annual_df = calculate_annual_metrics(df, calculate_metrics, ltm_start, ltm_end)
|
||||||
|
|
||||||
|
# Create charts
|
||||||
|
fig, ax = plt.subplots(figsize=CHART_SIZES['medium'])
|
||||||
|
ax.plot(data / 1e6, ...)
|
||||||
|
setup_revenue_chart(ax)
|
||||||
|
save_chart(fig, 'chart.png')
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 🎯 Key Features
|
||||||
|
|
||||||
|
### 1. Flexible Data Loading
|
||||||
|
|
||||||
|
- Handles different column names via configuration
|
||||||
|
- Fallback logic for date parsing (100% coverage)
|
||||||
|
- Automatic validation and error reporting
|
||||||
|
|
||||||
|
### 2. LTM (Last Twelve Months) Support
|
||||||
|
|
||||||
|
- Automatic LTM calculation for partial years
|
||||||
|
- Apples-to-apples comparison with full calendar years
|
||||||
|
- Configurable LTM periods
|
||||||
|
|
||||||
|
### 3. Standardized Chart Formatting
|
||||||
|
|
||||||
|
- Automatic millions formatter for revenue charts
|
||||||
|
- Consistent styling and sizing
|
||||||
|
- Professional output ready for reports
|
||||||
|
- Optional interactive charts with Plotly
|
||||||
|
|
||||||
|
### 4. Exclusion Filters
|
||||||
|
|
||||||
|
- Easy configuration for excluding segments
|
||||||
|
- Useful for excluding test accounts, business units, etc.
|
||||||
|
|
||||||
|
### 5. Revenue Validation
|
||||||
|
|
||||||
|
- Automatic validation after each analysis
|
||||||
|
- Ensures data loading is working correctly
|
||||||
|
- Optional validation against expected values
|
||||||
|
|
||||||
|
### 6. Example Scripts
|
||||||
|
|
||||||
|
- Working examples for common analyses
|
||||||
|
- Demonstrates best practices
|
||||||
|
- Easy to customize and extend
|
||||||
|
|
||||||
|
### 7. Data Export
|
||||||
|
|
||||||
|
- Export results to CSV and Excel
|
||||||
|
- Formatted summary tables
|
||||||
|
- Multiple sheet support
|
||||||
|
|
||||||
|
### 8. Data Quality Reporting
|
||||||
|
|
||||||
|
- Comprehensive data quality checks
|
||||||
|
- Missing value analysis
|
||||||
|
- Outlier detection
|
||||||
|
- Data profiling
|
||||||
|
|
||||||
|
### 9. Configuration Validation
|
||||||
|
|
||||||
|
- Early error detection
|
||||||
|
- Validates column mappings
|
||||||
|
- Checks date ranges and LTM configuration
|
||||||
|
|
||||||
|
### 10. Statistical Utilities
|
||||||
|
|
||||||
|
- Year-over-year growth calculations
|
||||||
|
- CAGR (Compound Annual Growth Rate)
|
||||||
|
- Correlation analysis
|
||||||
|
- Statistical significance testing
|
||||||
|
|
||||||
|
### 11. Report Generation
|
||||||
|
|
||||||
|
- Combine multiple charts into PDF reports
|
||||||
|
- Professional formatting
|
||||||
|
- Summary tables and metadata
|
||||||
|
|
||||||
|
### 12. Logging Infrastructure
|
||||||
|
|
||||||
|
- Structured logging with file and console output
|
||||||
|
- Analysis execution tracking
|
||||||
|
- Configurable log levels
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 📚 Documentation
|
||||||
|
|
||||||
|
### For AI Agents (Cursor IDE)
|
||||||
|
|
||||||
|
The `.cursor/rules/` directory contains comprehensive rules that are automatically loaded by Cursor:
|
||||||
|
|
||||||
|
- **`ai_assistant_guide.md`:** Complete guide with ready-to-use prompts
|
||||||
|
- **`advanced_analysis_patterns.md`:** Advanced techniques (cohort, PVM, forecasting, etc.)
|
||||||
|
- **`analysis_patterns.md`:** Standard patterns for creating analyses
|
||||||
|
- **`data_loading.md`:** Always use `data_loader.py`, never `pd.read_csv()` directly
|
||||||
|
- **`chart_formatting.md`:** How to format charts correctly
|
||||||
|
- **`ltm_methodology.md`:** LTM implementation and usage
|
||||||
|
- **`common_errors.md`:** Troubleshooting guide
|
||||||
|
- **`code_quality.md`:** Code quality standards and Cursor best practices
|
||||||
|
- **`error_handling.md`:** How to write AI-friendly error messages
|
||||||
|
|
||||||
|
### For Developers
|
||||||
|
|
||||||
|
- **`config.py`:** Heavily commented with all configuration options
|
||||||
|
- **`analysis_template.py`:** Template with examples and comments
|
||||||
|
- **`analysis_utils.py`:** Well-documented utility functions
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 🔍 Common Analysis Types
|
||||||
|
|
||||||
|
This template supports all standard sales analyses:
|
||||||
|
|
||||||
|
### Revenue Analyses
|
||||||
|
- Annual revenue trends
|
||||||
|
- Monthly revenue analysis
|
||||||
|
- Revenue by segment/product/geography
|
||||||
|
|
||||||
|
### Customer Analyses
|
||||||
|
- Customer segmentation (RFM)
|
||||||
|
- Customer concentration
|
||||||
|
- Churn analysis
|
||||||
|
- Cohort analysis
|
||||||
|
- Customer lifetime value (CLV)
|
||||||
|
|
||||||
|
### Product Analyses
|
||||||
|
- Product performance
|
||||||
|
- Product lifecycle
|
||||||
|
- BCG matrix
|
||||||
|
- Market basket analysis
|
||||||
|
|
||||||
|
### Financial Analyses
|
||||||
|
- Price elasticity
|
||||||
|
- Contribution margin
|
||||||
|
- Price vs volume analysis
|
||||||
|
|
||||||
|
### Advanced Analyses
|
||||||
|
- Seasonality analysis
|
||||||
|
- Time series forecasting
|
||||||
|
- Customer churn prediction
|
||||||
|
|
||||||
|
**See `examples/` directory for working example scripts, or the original Dukane project for 24+ production analysis scripts.**
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 🛠️ Dependencies
|
||||||
|
|
||||||
|
Install required packages:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install -r requirements.txt
|
||||||
|
```
|
||||||
|
|
||||||
|
**Core dependencies:**
|
||||||
|
- `pandas` - Data manipulation
|
||||||
|
- `numpy` - Numerical operations
|
||||||
|
- `matplotlib` - Charting
|
||||||
|
- `seaborn` - Enhanced visualizations
|
||||||
|
|
||||||
|
**Optional dependencies** (uncomment in requirements.txt if needed):
|
||||||
|
- `openpyxl` - Excel export (export_utils.py)
|
||||||
|
- `plotly` - Interactive charts (analysis_utils.py)
|
||||||
|
- `reportlab` - PDF reports (report_generator.py)
|
||||||
|
- `scipy` - Statistical analysis (statistical_utils.py)
|
||||||
|
- `pytest` - Unit testing
|
||||||
|
- `pmdarima` - Time series forecasting
|
||||||
|
- `mlxtend` - Market basket analysis
|
||||||
|
- `scikit-learn` - Machine learning
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## ⚠️ Important Notes
|
||||||
|
|
||||||
|
### Always Use Utilities
|
||||||
|
|
||||||
|
**✅ DO:**
|
||||||
|
```python
|
||||||
|
from data_loader import load_sales_data
|
||||||
|
from analysis_utils import setup_revenue_chart, save_chart
|
||||||
|
from config import REVENUE_COLUMN, CHART_SIZES
|
||||||
|
```
|
||||||
|
|
||||||
|
**❌ DON'T:**
|
||||||
|
```python
|
||||||
|
df = pd.read_csv('data.csv') # Use data_loader instead
|
||||||
|
ax.plot(revenue, ...) # Divide by 1e6 first, use setup_revenue_chart()
|
||||||
|
```
|
||||||
|
|
||||||
|
### Chart Formatting
|
||||||
|
|
||||||
|
**ALWAYS divide revenue by 1e6 before plotting:**
|
||||||
|
```python
|
||||||
|
ax.plot(revenue / 1e6, ...) # Convert to millions
|
||||||
|
setup_revenue_chart(ax) # Apply formatter
|
||||||
|
```
|
||||||
|
|
||||||
|
### LTM Labeling
|
||||||
|
|
||||||
|
**ALWAYS label LTM years correctly:**
|
||||||
|
```python
|
||||||
|
from config import get_ltm_label
|
||||||
|
ltm_label = get_ltm_label() # Returns "2025 (LTM 9/2025)" or None
|
||||||
|
if ltm_label:
|
||||||
|
title += f'\n({ltm_label})'
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 🐛 Troubleshooting
|
||||||
|
|
||||||
|
### Data Loading Issues
|
||||||
|
|
||||||
|
**Problem:** "Data file not found"
|
||||||
|
- **Solution:** Check `DATA_FILE` path in config.py
|
||||||
|
- **Solution:** Ensure file is in template directory or update `DATA_DIR`
|
||||||
|
|
||||||
|
**Problem:** "Required column 'USD' not found"
|
||||||
|
- **Solution:** Update `REVENUE_COLUMN` in config.py to match your CSV
|
||||||
|
- **Solution:** Check all column mappings in config.py
|
||||||
|
|
||||||
|
**Problem:** "All dates are NaN"
|
||||||
|
- **Solution:** Add fallback date columns to `DATE_FALLBACK_COLUMNS`
|
||||||
|
- **Solution:** Check date format in your CSV
|
||||||
|
|
||||||
|
### Analysis Issues
|
||||||
|
|
||||||
|
**Problem:** Charts show scientific notation (1e8)
|
||||||
|
- **Solution:** Divide by 1e6 before plotting: `ax.plot(data / 1e6, ...)`
|
||||||
|
- **Solution:** Use `setup_revenue_chart(ax)` to apply formatter
|
||||||
|
|
||||||
|
**Problem:** "DataFrame is empty" after filtering
|
||||||
|
- **Solution:** Check `MIN_YEAR` and `MAX_DATE` in config.py
|
||||||
|
- **Solution:** Verify `ANALYSIS_YEARS` includes years in your data
|
||||||
|
|
||||||
|
**See `.cursor/rules/common_errors.md` for more troubleshooting help.**
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 📝 Example Workflow
|
||||||
|
|
||||||
|
### Complete Analysis Workflow
|
||||||
|
|
||||||
|
1. **Setup:**
|
||||||
|
```bash
|
||||||
|
python setup_wizard.py
|
||||||
|
```
|
||||||
|
|
||||||
|
2. **Test data loading:**
|
||||||
|
```bash
|
||||||
|
python -c "from data_loader import load_sales_data; from config import get_data_path; df = load_sales_data(get_data_path()); print(f'✓ Loaded {len(df):,} rows')"
|
||||||
|
```
|
||||||
|
|
||||||
|
3. **Create analysis:**
|
||||||
|
```bash
|
||||||
|
cp analysis_template.py revenue_analysis.py
|
||||||
|
# Edit revenue_analysis.py
|
||||||
|
```
|
||||||
|
|
||||||
|
4. **Run analysis:**
|
||||||
|
```bash
|
||||||
|
python revenue_analysis.py
|
||||||
|
```
|
||||||
|
|
||||||
|
5. **Add to batch runner:**
|
||||||
|
```python
|
||||||
|
# In run_all_analyses.py:
|
||||||
|
ANALYSIS_SCRIPTS = [
|
||||||
|
'revenue_analysis.py',
|
||||||
|
# ... other analyses
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
|
6. **Run all analyses:**
|
||||||
|
```bash
|
||||||
|
python run_all_analyses.py
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 🤝 Best Practices
|
||||||
|
|
||||||
|
1. **Always validate data** after loading:
|
||||||
|
```python
|
||||||
|
is_valid, msg = validate_data_structure(df)
|
||||||
|
```
|
||||||
|
|
||||||
|
2. **Use configuration values** instead of hardcoding:
|
||||||
|
```python
|
||||||
|
from config import REVENUE_COLUMN # ✅
|
||||||
|
revenue = df['USD'].sum() # ❌ Hardcoded
|
||||||
|
```
|
||||||
|
|
||||||
|
3. **Apply exclusion filters** if configured:
|
||||||
|
```python
|
||||||
|
df = apply_exclusion_filters(df)
|
||||||
|
```
|
||||||
|
|
||||||
|
4. **Validate revenue** at end of each analysis:
|
||||||
|
```python
|
||||||
|
validate_revenue(df, "Analysis Name")
|
||||||
|
```
|
||||||
|
|
||||||
|
5. **Use utility functions** for consistency:
|
||||||
|
```python
|
||||||
|
from analysis_utils import calculate_annual_metrics, setup_revenue_chart
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 📄 License
|
||||||
|
|
||||||
|
This template is provided as-is for use in sales analysis projects.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 🙏 Acknowledgments
|
||||||
|
|
||||||
|
This template is based on best practices developed during the Dukane Corporation sales analysis project, which included 24+ production-ready analysis scripts and comprehensive documentation.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 📞 Support
|
||||||
|
|
||||||
|
For questions or issues:
|
||||||
|
1. Check `.cursor/rules/` for detailed patterns and troubleshooting
|
||||||
|
2. Review `config.py` comments for configuration options
|
||||||
|
3. See example analyses in the original Dukane project
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
**Last Updated:** January 2026
|
||||||
|
**Template Version:** 1.0
|
||||||
|
**Status:** Production Ready
|
||||||
118
SETUP_CHECKLIST.md
Normal file
118
SETUP_CHECKLIST.md
Normal file
@@ -0,0 +1,118 @@
|
|||||||
|
# Setup Checklist
|
||||||
|
|
||||||
|
Use this checklist to ensure your template is properly configured before running analyses.
|
||||||
|
|
||||||
|
## ✅ Initial Setup
|
||||||
|
|
||||||
|
- [ ] **Install dependencies**
|
||||||
|
```bash
|
||||||
|
pip install -r requirements.txt
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Run setup wizard**
|
||||||
|
```bash
|
||||||
|
python setup_wizard.py
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Place data file** in template directory (or update `DATA_DIR` in config.py)
|
||||||
|
|
||||||
|
## ✅ Configuration Verification
|
||||||
|
|
||||||
|
Open `config.py` and verify:
|
||||||
|
|
||||||
|
- [ ] **Company Information**
|
||||||
|
- [ ] `COMPANY_NAME` is set
|
||||||
|
- [ ] `ANALYSIS_DATE` is current
|
||||||
|
|
||||||
|
- [ ] **Data File**
|
||||||
|
- [ ] `DATA_FILE` matches your CSV filename
|
||||||
|
- [ ] File exists in expected location
|
||||||
|
|
||||||
|
- [ ] **Column Mappings**
|
||||||
|
- [ ] `REVENUE_COLUMN` matches your CSV
|
||||||
|
- [ ] `DATE_COLUMN` matches your CSV
|
||||||
|
- [ ] `CUSTOMER_COLUMN` matches your CSV (if applicable)
|
||||||
|
- [ ] `ITEM_COLUMN` matches your CSV (if applicable)
|
||||||
|
- [ ] `QUANTITY_COLUMN` matches your CSV (if applicable)
|
||||||
|
|
||||||
|
- [ ] **Date Configuration**
|
||||||
|
- [ ] `MIN_YEAR` is correct
|
||||||
|
- [ ] `MAX_DATE` is correct
|
||||||
|
- [ ] `ANALYSIS_YEARS` includes all years you want to analyze
|
||||||
|
|
||||||
|
- [ ] **LTM Configuration** (if needed)
|
||||||
|
- [ ] `LTM_ENABLED` is set correctly
|
||||||
|
- [ ] `LTM_START_MONTH`, `LTM_START_YEAR` are correct
|
||||||
|
- [ ] `LTM_END_MONTH`, `LTM_END_YEAR` are correct
|
||||||
|
|
||||||
|
- [ ] **Exclusion Filters** (if needed)
|
||||||
|
- [ ] `EXCLUSION_FILTERS['enabled']` is set correctly
|
||||||
|
- [ ] `exclude_by_column` matches a column in your data
|
||||||
|
- [ ] `exclude_values` list is correct
|
||||||
|
|
||||||
|
## ✅ Data Loading Test
|
||||||
|
|
||||||
|
- [ ] **Test data loading**
|
||||||
|
```bash
|
||||||
|
python -c "from data_loader import load_sales_data; from config import get_data_path; df = load_sales_data(get_data_path()); print(f'✓ Loaded {len(df):,} rows')"
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Verify date coverage**
|
||||||
|
- Check output shows good date coverage (>95% recommended)
|
||||||
|
- Verify date range matches expectations
|
||||||
|
|
||||||
|
- [ ] **Verify revenue column**
|
||||||
|
- Check that revenue values are numeric
|
||||||
|
- Verify no unexpected NaN values
|
||||||
|
|
||||||
|
## ✅ First Analysis Test
|
||||||
|
|
||||||
|
- [ ] **Copy template**
|
||||||
|
```bash
|
||||||
|
cp analysis_template.py test_analysis.py
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Run test analysis**
|
||||||
|
```bash
|
||||||
|
python test_analysis.py
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Verify outputs**
|
||||||
|
- [ ] Chart generated successfully
|
||||||
|
- [ ] Chart saved to `charts/` directory
|
||||||
|
- [ ] Revenue validation passed
|
||||||
|
- [ ] No errors in console output
|
||||||
|
|
||||||
|
## ✅ Common Issues Check
|
||||||
|
|
||||||
|
Before running full analyses, verify:
|
||||||
|
|
||||||
|
- [ ] **Column names match** - All column mappings in config.py match your CSV
|
||||||
|
- [ ] **Date format works** - Dates are parsing correctly (check data_loader output)
|
||||||
|
- [ ] **Date range is correct** - MIN_YEAR and MAX_DATE include your data
|
||||||
|
- [ ] **LTM is configured** - If using LTM, dates are within your data range
|
||||||
|
- [ ] **Exclusions work** - If using exclusions, column and values are correct
|
||||||
|
|
||||||
|
## ✅ Ready for Production
|
||||||
|
|
||||||
|
Once all checks pass:
|
||||||
|
|
||||||
|
- [ ] **Create your analyses** using `analysis_template.py`
|
||||||
|
- [ ] **Add to batch runner** in `run_all_analyses.py`
|
||||||
|
- [ ] **Run all analyses** to generate complete analysis suite
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 🐛 Troubleshooting
|
||||||
|
|
||||||
|
If any check fails:
|
||||||
|
|
||||||
|
1. **Data loading issues:** See `.cursor/rules/data_loading.md`
|
||||||
|
2. **Configuration issues:** Review `config.py` comments
|
||||||
|
3. **Common errors:** See `.cursor/rules/common_errors.md`
|
||||||
|
4. **Pattern questions:** See `.cursor/rules/analysis_patterns.md`
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
**Checklist Version:** 1.0
|
||||||
|
**Last Updated:** January 2026
|
||||||
150
TEMPLATE_OVERVIEW.md
Normal file
150
TEMPLATE_OVERVIEW.md
Normal file
@@ -0,0 +1,150 @@
|
|||||||
|
# Sales Analysis Template - Overview
|
||||||
|
|
||||||
|
**Start here for a high-level understanding of the template.**
|
||||||
|
|
||||||
|
For detailed setup, see `QUICK_START.md`. For complete documentation, see `README.md`.
|
||||||
|
|
||||||
|
## 🎯 Purpose
|
||||||
|
|
||||||
|
This template provides a **production-ready, reusable framework** for analyzing sales invoice detail data from any company. It's designed to be:
|
||||||
|
|
||||||
|
- **Flexible:** Works with different column names, date formats, and data structures
|
||||||
|
- **Automated:** Interactive setup wizard configures everything
|
||||||
|
- **AI-Optimized:** Fully optimized for Cursor AI - just ask and the AI generates complete analyses
|
||||||
|
- **Best-in-Class:** Based on proven patterns from 24+ production analyses
|
||||||
|
|
||||||
|
## 📦 What's Included
|
||||||
|
|
||||||
|
### Core Framework
|
||||||
|
- **`config.py`** - Centralized configuration (customize for your company)
|
||||||
|
- **`data_loader.py`** - Intelligent data loading with fallback logic
|
||||||
|
- **`analysis_utils.py`** - Common utilities (formatters, LTM, helpers)
|
||||||
|
- **`validate_revenue.py`** - Revenue validation utility
|
||||||
|
|
||||||
|
### Templates & Tools
|
||||||
|
- **`analysis_template.py`** - Template for creating new analyses
|
||||||
|
- **`run_all_analyses.py`** - Batch runner for all scripts
|
||||||
|
- **`setup_wizard.py`** - Interactive setup wizard
|
||||||
|
|
||||||
|
### Documentation
|
||||||
|
- **`README.md`** - Comprehensive documentation
|
||||||
|
- **`QUICK_START.md`** - Quick reference guide
|
||||||
|
- **`.cursor/rules/`** - Cursor IDE rules for automation
|
||||||
|
|
||||||
|
### Configuration
|
||||||
|
- **`requirements.txt`** - Python dependencies
|
||||||
|
- **`.gitignore`** - Git ignore patterns
|
||||||
|
|
||||||
|
## 🚀 Quick Start
|
||||||
|
|
||||||
|
1. **Run setup wizard:**
|
||||||
|
```bash
|
||||||
|
python setup_wizard.py
|
||||||
|
```
|
||||||
|
|
||||||
|
2. **Test data loading:**
|
||||||
|
```bash
|
||||||
|
python -c "from data_loader import load_sales_data; from config import get_data_path; df = load_sales_data(get_data_path()); print(f'✓ Loaded {len(df):,} rows')"
|
||||||
|
```
|
||||||
|
|
||||||
|
3. **Create your first analysis:**
|
||||||
|
```bash
|
||||||
|
cp analysis_template.py my_analysis.py
|
||||||
|
# Edit my_analysis.py
|
||||||
|
python my_analysis.py
|
||||||
|
```
|
||||||
|
|
||||||
|
## 🎨 Key Features
|
||||||
|
|
||||||
|
### 1. Flexible Data Loading
|
||||||
|
- Handles different column names via configuration
|
||||||
|
- Fallback logic for date parsing (100% coverage)
|
||||||
|
- Automatic validation
|
||||||
|
|
||||||
|
### 2. LTM Support
|
||||||
|
- Automatic Last Twelve Months calculation
|
||||||
|
- Apples-to-apples comparison with full years
|
||||||
|
- Configurable periods
|
||||||
|
|
||||||
|
### 3. Standardized Formatting
|
||||||
|
- Automatic millions formatter for revenue
|
||||||
|
- Consistent chart styling
|
||||||
|
- Professional output
|
||||||
|
|
||||||
|
### 4. Exclusion Filters
|
||||||
|
- Easy configuration for excluding segments
|
||||||
|
- Useful for test accounts, business units, etc.
|
||||||
|
|
||||||
|
### 5. AI Automation
|
||||||
|
- Comprehensive Cursor rules
|
||||||
|
- Automated agent assistance
|
||||||
|
- Best practices enforcement
|
||||||
|
|
||||||
|
## 📊 Analysis Types Supported
|
||||||
|
|
||||||
|
This template supports all standard sales analyses:
|
||||||
|
|
||||||
|
- **Revenue:** Annual trends, monthly analysis, by segment
|
||||||
|
- **Customer:** Segmentation, concentration, churn, CLV
|
||||||
|
- **Product:** Performance, lifecycle, BCG matrix
|
||||||
|
- **Financial:** Price elasticity, margins
|
||||||
|
- **Advanced:** Seasonality, forecasting, predictions
|
||||||
|
|
||||||
|
## 🔧 Customization Points
|
||||||
|
|
||||||
|
All customization happens in `config.py`:
|
||||||
|
|
||||||
|
1. **Company Info:** Name, analysis date
|
||||||
|
2. **Data File:** Location, filename
|
||||||
|
3. **Column Mappings:** Revenue, date, customer, product, etc.
|
||||||
|
4. **Date Range:** Years, LTM configuration
|
||||||
|
5. **Filters:** Exclusion rules
|
||||||
|
6. **Chart Settings:** Sizes, styles, DPI
|
||||||
|
|
||||||
|
## 📚 Documentation Structure
|
||||||
|
|
||||||
|
- **`README.md`** - Complete guide (start here)
|
||||||
|
- **`QUICK_START.md`** - Quick start (includes Cursor tips)
|
||||||
|
- **`EXAMPLES.md`** - Example scripts guide
|
||||||
|
- **`TEMPLATE_SUMMARY.md`** - Comprehensive template overview
|
||||||
|
- **`.cursor/rules/`** - Detailed patterns for AI agents (auto-loaded by Cursor)
|
||||||
|
- **`config.py`** - Heavily commented configuration
|
||||||
|
|
||||||
|
## 🎓 Learning Path
|
||||||
|
|
||||||
|
1. **Read:** `QUICK_START.md` (5 minutes)
|
||||||
|
2. **Run:** `setup_wizard.py` (2 minutes)
|
||||||
|
3. **Test:** Data loading (1 minute)
|
||||||
|
4. **Create:** First analysis using `analysis_template.py` (15 minutes)
|
||||||
|
5. **Explore:** `.cursor/rules/` for patterns (as needed)
|
||||||
|
|
||||||
|
## 💡 Best Practices
|
||||||
|
|
||||||
|
1. **Always use utilities** - Don't reinvent the wheel
|
||||||
|
2. **Use config values** - Never hardcode column names
|
||||||
|
3. **Validate data** - After loading and after analysis
|
||||||
|
4. **Follow patterns** - See `.cursor/rules/analysis_patterns.md`
|
||||||
|
5. **Test incrementally** - Test data loading before full analysis
|
||||||
|
|
||||||
|
## 🔍 What Makes This "Best-in-Class"
|
||||||
|
|
||||||
|
1. **Proven Patterns:** Based on 24+ production analyses
|
||||||
|
2. **Flexibility:** Works with any data structure
|
||||||
|
3. **Automation:** Setup wizard + AI-friendly rules
|
||||||
|
4. **Documentation:** Comprehensive guides and examples
|
||||||
|
5. **Error Handling:** Validation and troubleshooting built-in
|
||||||
|
6. **Consistency:** Standardized formatting and patterns
|
||||||
|
|
||||||
|
## 📈 Next Steps
|
||||||
|
|
||||||
|
1. Run `setup_wizard.py` to configure for your company
|
||||||
|
2. Review `config.py` to understand all options
|
||||||
|
3. Create your first analysis using `analysis_template.py`
|
||||||
|
4. Explore `.cursor/rules/` for detailed patterns
|
||||||
|
5. Build your analysis suite
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
**Template Version:** 1.0
|
||||||
|
**Last Updated:** January 2026
|
||||||
|
**Status:** Production Ready
|
||||||
254
TEMPLATE_SUMMARY.md
Normal file
254
TEMPLATE_SUMMARY.md
Normal file
@@ -0,0 +1,254 @@
|
|||||||
|
# Sales Analysis Template - Summary
|
||||||
|
|
||||||
|
**This document provides a comprehensive overview of the template structure and capabilities.**
|
||||||
|
|
||||||
|
For quick start, see `QUICK_START.md`. For detailed documentation, see `README.md`.
|
||||||
|
|
||||||
|
## 📋 What This Template Provides
|
||||||
|
|
||||||
|
This template was created based on the comprehensive Dukane Corporation sales analysis project, which included 24+ production-ready analysis scripts. All best practices, patterns, and lessons learned have been distilled into this reusable template.
|
||||||
|
|
||||||
|
## 📁 Complete File Structure
|
||||||
|
|
||||||
|
```
|
||||||
|
sales_analysis_template/
|
||||||
|
├── README.md # Comprehensive documentation
|
||||||
|
├── QUICK_START.md # Quick reference guide
|
||||||
|
├── TEMPLATE_OVERVIEW.md # Template overview and features
|
||||||
|
├── TEMPLATE_SUMMARY.md # This file
|
||||||
|
├── EXAMPLES.md # Example scripts guide
|
||||||
|
├── SETUP_CHECKLIST.md # Setup verification checklist
|
||||||
|
├── requirements.txt # Python dependencies
|
||||||
|
├── .gitignore # Git ignore patterns
|
||||||
|
│
|
||||||
|
├── Core Framework Files:
|
||||||
|
│ ├── config.py # ⭐ Centralized configuration
|
||||||
|
│ ├── config_validator.py # Configuration validation utility
|
||||||
|
│ ├── data_loader.py # ⭐ Intelligent data loading
|
||||||
|
│ ├── data_quality.py # Data quality reporting
|
||||||
|
│ ├── data_processing.py # Data transformation utilities
|
||||||
|
│ ├── analysis_utils.py # ⭐ Common utilities
|
||||||
|
│ ├── statistical_utils.py # Statistical analysis utilities
|
||||||
|
│ └── validate_revenue.py # Revenue validation
|
||||||
|
│
|
||||||
|
├── Utility Files:
|
||||||
|
│ ├── export_utils.py # Export to CSV/Excel
|
||||||
|
│ ├── report_generator.py # PDF report generation
|
||||||
|
│ ├── logger_config.py # Logging configuration
|
||||||
|
│ └── generate_sample_data.py # Generate sample data for testing
|
||||||
|
│
|
||||||
|
├── Templates & Tools:
|
||||||
|
│ ├── analysis_template.py # Template for new analyses
|
||||||
|
│ ├── run_all_analyses.py # Batch runner
|
||||||
|
│ └── setup_wizard.py # Interactive setup wizard
|
||||||
|
│
|
||||||
|
├── examples/ # Example analysis scripts
|
||||||
|
│ ├── annual_revenue_trend.py # Simple annual revenue analysis
|
||||||
|
│ ├── customer_segmentation.py # RFM customer segmentation
|
||||||
|
│ ├── cohort_analysis.py # Customer cohort analysis
|
||||||
|
│ └── product_performance.py # Product performance analysis
|
||||||
|
│
|
||||||
|
├── tests/ # Unit tests
|
||||||
|
│ ├── test_data_loader.py # Data loader tests
|
||||||
|
│ ├── test_analysis_utils.py # Analysis utils tests
|
||||||
|
│ └── test_config_validator.py # Config validator tests
|
||||||
|
│
|
||||||
|
└── .cursor/
|
||||||
|
└── rules/ # Cursor IDE rules (auto-loaded)
|
||||||
|
├── ai_assistant_guide.md # Complete AI assistant guide
|
||||||
|
├── advanced_analysis_patterns.md # Advanced techniques
|
||||||
|
├── analysis_patterns.md # Analysis patterns
|
||||||
|
├── chart_formatting.md # Chart formatting rules
|
||||||
|
├── code_quality.md # Code quality standards
|
||||||
|
├── common_errors.md # Error troubleshooting
|
||||||
|
├── data_loading.md # Data loading patterns
|
||||||
|
├── error_handling.md # Error handling patterns
|
||||||
|
└── ltm_methodology.md # LTM methodology
|
||||||
|
```
|
||||||
|
|
||||||
|
## 🎯 Key Features Implemented
|
||||||
|
|
||||||
|
### 1. Flexible Configuration System
|
||||||
|
- **`config.py`**: Centralized configuration with extensive comments
|
||||||
|
- All column names, date ranges, and settings configurable
|
||||||
|
- No hardcoded values - everything comes from config
|
||||||
|
|
||||||
|
### 2. Intelligent Data Loading
|
||||||
|
- **`data_loader.py`**: Fallback logic for date parsing
|
||||||
|
- Handles missing dates gracefully
|
||||||
|
- 100% date coverage via fallback columns
|
||||||
|
- Automatic validation and error reporting
|
||||||
|
|
||||||
|
### 3. Comprehensive Utilities
|
||||||
|
- **`analysis_utils.py`**: All common functions in one place
|
||||||
|
- Chart formatters (millions, thousands)
|
||||||
|
- LTM calculation helpers
|
||||||
|
- Mixed type handling for years
|
||||||
|
- Price calculation utilities
|
||||||
|
- Exclusion filter helpers
|
||||||
|
|
||||||
|
### 4. Interactive Setup
|
||||||
|
- **`setup_wizard.py`**: Asks clarifying questions
|
||||||
|
- Automatically configures `config.py`
|
||||||
|
- Validates inputs
|
||||||
|
- Provides next steps
|
||||||
|
|
||||||
|
### 5. AI-Friendly Rules
|
||||||
|
- **`.cursor/rules/`**: Comprehensive Cursor IDE rules
|
||||||
|
- Auto-loaded by Cursor
|
||||||
|
- Enforces best practices
|
||||||
|
- Provides patterns and troubleshooting
|
||||||
|
|
||||||
|
### 6. Production-Ready Templates
|
||||||
|
- **`analysis_template.py`**: Complete template with examples
|
||||||
|
- **`run_all_analyses.py`**: Batch runner with error handling
|
||||||
|
- Follows all best practices
|
||||||
|
|
||||||
|
## 🔑 Design Principles
|
||||||
|
|
||||||
|
### Flexibility
|
||||||
|
- Works with any column names (configured in config.py)
|
||||||
|
- Handles different date formats
|
||||||
|
- Supports various data structures
|
||||||
|
- Optional features (LTM, exclusions) can be disabled
|
||||||
|
|
||||||
|
### Automation
|
||||||
|
- Setup wizard asks all necessary questions
|
||||||
|
- Cursor rules guide AI agents automatically
|
||||||
|
- Batch runner handles multiple analyses
|
||||||
|
- Validation catches errors early
|
||||||
|
|
||||||
|
### Best Practices
|
||||||
|
- Always use utilities (never reinvent the wheel)
|
||||||
|
- Consistent formatting across all analyses
|
||||||
|
- Proper error handling and validation
|
||||||
|
- Comprehensive documentation
|
||||||
|
|
||||||
|
### Reusability
|
||||||
|
- Generic enough for any company
|
||||||
|
- Specific enough to be immediately useful
|
||||||
|
- Well-documented for future agents
|
||||||
|
- Easy to extend with new analyses
|
||||||
|
|
||||||
|
## 📊 Analysis Types Supported
|
||||||
|
|
||||||
|
The template supports all standard sales analyses:
|
||||||
|
|
||||||
|
### Revenue Analyses
|
||||||
|
- Annual revenue trends
|
||||||
|
- Monthly revenue analysis
|
||||||
|
- Revenue by segment/product/geography
|
||||||
|
|
||||||
|
### Customer Analyses
|
||||||
|
- Customer segmentation (RFM)
|
||||||
|
- Customer concentration
|
||||||
|
- Churn analysis
|
||||||
|
- Cohort analysis
|
||||||
|
- Customer lifetime value (CLV)
|
||||||
|
|
||||||
|
### Product Analyses
|
||||||
|
- Product performance
|
||||||
|
- Product lifecycle
|
||||||
|
- BCG matrix
|
||||||
|
- Market basket analysis
|
||||||
|
|
||||||
|
### Financial Analyses
|
||||||
|
- Price elasticity
|
||||||
|
- Contribution margin
|
||||||
|
- Price vs volume analysis
|
||||||
|
|
||||||
|
### Advanced Analyses
|
||||||
|
- Seasonality analysis
|
||||||
|
- Time series forecasting
|
||||||
|
- Customer churn prediction
|
||||||
|
|
||||||
|
## 🚀 Usage Workflow
|
||||||
|
|
||||||
|
1. **Setup** (5 minutes)
|
||||||
|
- Run `setup_wizard.py`
|
||||||
|
- Answer questions about your data
|
||||||
|
- Configuration automatically updated
|
||||||
|
|
||||||
|
2. **Test** (2 minutes)
|
||||||
|
- Test data loading
|
||||||
|
- Verify configuration works
|
||||||
|
|
||||||
|
3. **Create** (15 minutes)
|
||||||
|
- Copy `analysis_template.py`
|
||||||
|
- Customize for your analysis
|
||||||
|
- Run and verify
|
||||||
|
|
||||||
|
4. **Scale** (ongoing)
|
||||||
|
- Create multiple analyses
|
||||||
|
- Add to batch runner
|
||||||
|
- Generate complete analysis suite
|
||||||
|
|
||||||
|
## 💡 What Makes This "Best-in-Class"
|
||||||
|
|
||||||
|
1. **Proven Patterns**: Based on 24+ production analyses
|
||||||
|
2. **Comprehensive**: Covers all common analysis types
|
||||||
|
3. **Flexible**: Works with any data structure
|
||||||
|
4. **Automated**: Setup wizard + AI-friendly rules
|
||||||
|
5. **Documented**: Extensive documentation at every level
|
||||||
|
6. **Production-Ready**: Error handling, validation, best practices
|
||||||
|
|
||||||
|
## 📚 Documentation Hierarchy
|
||||||
|
|
||||||
|
1. **`QUICK_START.md`** - Start here (5-minute overview, includes Cursor tips)
|
||||||
|
2. **`README.md`** - Complete guide (comprehensive)
|
||||||
|
3. **`EXAMPLES.md`** - Example scripts guide
|
||||||
|
4. **`TEMPLATE_OVERVIEW.md`** - High-level overview
|
||||||
|
5. **`SETUP_CHECKLIST.md`** - Verification checklist
|
||||||
|
6. **`.cursor/rules/`** - Detailed patterns for AI agents (auto-loaded by Cursor)
|
||||||
|
7. **`config.py`** - Inline comments for all options
|
||||||
|
|
||||||
|
## 🎓 Learning Resources
|
||||||
|
|
||||||
|
- **Quick Start**: `QUICK_START.md` - Get running in 5 minutes
|
||||||
|
- **Full Guide**: `README.md` - Complete documentation
|
||||||
|
- **Patterns**: `.cursor/rules/analysis_patterns.md` - Code patterns
|
||||||
|
- **Troubleshooting**: `.cursor/rules/common_errors.md` - Fix issues
|
||||||
|
- **Examples**: `analysis_template.py` - Working example
|
||||||
|
|
||||||
|
## ✅ Quality Assurance
|
||||||
|
|
||||||
|
All components include:
|
||||||
|
- ✅ Error handling
|
||||||
|
- ✅ Input validation
|
||||||
|
- ✅ Comprehensive comments
|
||||||
|
- ✅ Type hints where helpful
|
||||||
|
- ✅ Documentation strings
|
||||||
|
- ✅ Best practices enforcement
|
||||||
|
|
||||||
|
## 🔄 Future Enhancements
|
||||||
|
|
||||||
|
Potential additions (not included in v1.0):
- CI/CD configuration
- Docker containerization
- Additional visualization libraries

Note: example analysis scripts (`examples/`) and unit tests (`tests/`) are
already included in the template — see the file structure listing above.
|
||||||
|
|
||||||
|
## 📝 Notes for Users
|
||||||
|
|
||||||
|
1. **First Time**: Start with `QUICK_START.md` and `setup_wizard.py`
|
||||||
|
2. **Configuration**: All customization in `config.py`
|
||||||
|
3. **Creating Analyses**: Use `analysis_template.py` as starting point
|
||||||
|
4. **AI Assistance**: Cursor rules are auto-loaded, just ask for help
|
||||||
|
5. **Troubleshooting**: Check `.cursor/rules/common_errors.md` first
|
||||||
|
|
||||||
|
## 🎉 Success Criteria
|
||||||
|
|
||||||
|
The template is ready when:
|
||||||
|
- ✅ Setup wizard runs successfully
|
||||||
|
- ✅ Data loads without errors
|
||||||
|
- ✅ First analysis generates charts
|
||||||
|
- ✅ All validations pass
|
||||||
|
- ✅ Documentation is clear
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
**Template Version:** 1.0
|
||||||
|
**Created:** January 2026
|
||||||
|
**Based On:** Dukane Corporation Sales Analysis Project
|
||||||
|
**Status:** Production Ready ✅
|
||||||
147
analysis_template.py
Normal file
147
analysis_template.py
Normal file
@@ -0,0 +1,147 @@
|
|||||||
|
"""
|
||||||
|
Template for creating new analysis scripts
|
||||||
|
Copy this file and modify for your specific analysis
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
1. Copy this file: cp analysis_template.py my_new_analysis.py
|
||||||
|
2. Update the ANALYSIS_NAME and DESCRIPTION
|
||||||
|
3. Implement your analysis logic in the main() function
|
||||||
|
4. Update the chart generation section
|
||||||
|
5. Run: python my_new_analysis.py
|
||||||
|
"""
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
import seaborn as sns
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Import utilities
|
||||||
|
from data_loader import load_sales_data, validate_data_structure
|
||||||
|
from validate_revenue import validate_revenue
|
||||||
|
from analysis_utils import (
|
||||||
|
get_ltm_period_config, get_annual_data, calculate_annual_metrics,
|
||||||
|
get_millions_formatter, setup_revenue_chart, save_chart,
|
||||||
|
format_currency, print_annual_summary, sort_mixed_years,
|
||||||
|
apply_exclusion_filters
|
||||||
|
)
|
||||||
|
from config import (
|
||||||
|
DATA_FILE, OUTPUT_DIR, ANALYSIS_YEARS, MAX_DATE,
|
||||||
|
CHART_SIZES, ensure_directories, get_data_path, COMPANY_NAME
|
||||||
|
)
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# CONFIGURATION
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
ANALYSIS_NAME = "Template Analysis"
|
||||||
|
DESCRIPTION = "Template for new analyses - customize this for your specific analysis"
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# MAIN ANALYSIS FUNCTION
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
def main():
|
||||||
|
"""Main analysis function"""
|
||||||
|
|
||||||
|
print(f"\n{'='*60}")
|
||||||
|
print(f"{ANALYSIS_NAME}")
|
||||||
|
print(f"{'='*60}\n")
|
||||||
|
|
||||||
|
# 1. Load data
|
||||||
|
print("Loading data...")
|
||||||
|
try:
|
||||||
|
df = load_sales_data(get_data_path())
|
||||||
|
print(f"Loaded {len(df):,} transactions")
|
||||||
|
except Exception as e:
|
||||||
|
print(f"ERROR loading data: {e}")
|
||||||
|
return
|
||||||
|
|
||||||
|
# 2. Validate data structure
|
||||||
|
is_valid, msg = validate_data_structure(df)
|
||||||
|
if not is_valid:
|
||||||
|
print(f"ERROR: {msg}")
|
||||||
|
return
|
||||||
|
print("Data validation passed")
|
||||||
|
|
||||||
|
# 3. Apply exclusion filters (if configured)
|
||||||
|
df = apply_exclusion_filters(df)
|
||||||
|
|
||||||
|
# 4. Filter by date range
|
||||||
|
from config import MIN_YEAR, DATE_COLUMN
|
||||||
|
df = df[df['Year'] >= MIN_YEAR]
|
||||||
|
if DATE_COLUMN in df.columns:
|
||||||
|
df = df[df[DATE_COLUMN] <= MAX_DATE]
|
||||||
|
|
||||||
|
# 5. Setup LTM period (if enabled)
|
||||||
|
ltm_start, ltm_end = get_ltm_period_config()
|
||||||
|
if ltm_start and ltm_end:
|
||||||
|
print(f"LTM period: {ltm_start} to {ltm_end}")
|
||||||
|
|
||||||
|
# 6. Prepare data
|
||||||
|
print("\nPreparing data...")
|
||||||
|
# Add your data preparation logic here
|
||||||
|
# Example: df['CustomColumn'] = df[REVENUE_COLUMN] * df[QUANTITY_COLUMN]
|
||||||
|
|
||||||
|
# 7. Calculate annual metrics
|
||||||
|
print("\nCalculating annual metrics...")
|
||||||
|
|
||||||
|
def calculate_metrics(year_data):
|
||||||
|
"""Calculate metrics for a single year"""
|
||||||
|
from config import REVENUE_COLUMN
|
||||||
|
return {
|
||||||
|
'Revenue': year_data[REVENUE_COLUMN].sum(),
|
||||||
|
# Add your custom metrics here
|
||||||
|
# 'CustomMetric': year_data['CustomColumn'].mean(),
|
||||||
|
}
|
||||||
|
|
||||||
|
annual_df = calculate_annual_metrics(df, calculate_metrics, ltm_start, ltm_end)
|
||||||
|
|
||||||
|
# 8. Print summary
|
||||||
|
print_annual_summary(annual_df, 'Revenue', 'Revenue')
|
||||||
|
|
||||||
|
# 9. Create visualizations
|
||||||
|
print("Generating charts...")
|
||||||
|
ensure_directories()
|
||||||
|
|
||||||
|
# Example chart: Annual revenue trend
|
||||||
|
fig, ax = plt.subplots(figsize=CHART_SIZES['medium'])
|
||||||
|
|
||||||
|
# Prepare data for plotting (handle mixed types)
|
||||||
|
annual_df_sorted = sort_mixed_years(annual_df.reset_index(), 'Year')
|
||||||
|
years = annual_df_sorted['Year'].tolist()
|
||||||
|
revenue = annual_df_sorted['Revenue'].values / 1e6 # Convert to millions
|
||||||
|
|
||||||
|
# Create chart
|
||||||
|
ax.plot(range(len(years)), revenue, marker='o', linewidth=2, markersize=8)
|
||||||
|
ax.set_xticks(range(len(years)))
|
||||||
|
ax.set_xticklabels(years, rotation=45, ha='right')
|
||||||
|
setup_revenue_chart(ax)
|
||||||
|
|
||||||
|
# Add LTM notation to title if applicable
|
||||||
|
title = f'Annual Revenue Trend - {COMPANY_NAME}'
|
||||||
|
if ltm_start and ltm_end:
|
||||||
|
from config import get_ltm_label
|
||||||
|
ltm_label = get_ltm_label()
|
||||||
|
if ltm_label:
|
||||||
|
title += f'\n({ltm_label})'
|
||||||
|
ax.set_title(title)
|
||||||
|
|
||||||
|
plt.tight_layout()
|
||||||
|
save_chart(fig, f'{ANALYSIS_NAME.lower().replace(" ", "_")}_trend.png')
|
||||||
|
plt.close()
|
||||||
|
|
||||||
|
# Add more charts as needed...
|
||||||
|
|
||||||
|
# 10. Validate revenue
|
||||||
|
print("\nValidating revenue...")
|
||||||
|
validate_revenue(df, ANALYSIS_NAME)
|
||||||
|
|
||||||
|
print(f"\n{ANALYSIS_NAME} complete!")
|
||||||
|
print(f"Charts saved to: {OUTPUT_DIR}")
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# RUN ANALYSIS
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
510
analysis_utils.py
Normal file
510
analysis_utils.py
Normal file
@@ -0,0 +1,510 @@
|
|||||||
|
"""
|
||||||
|
Common utilities for analysis scripts
|
||||||
|
Provides formatters, LTM setup, and helper functions
|
||||||
|
|
||||||
|
This module is designed to work with any sales data structure
|
||||||
|
by using configuration from config.py
|
||||||
|
"""
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
from matplotlib.ticker import FuncFormatter
|
||||||
|
from pathlib import Path
|
||||||
|
from config import (
|
||||||
|
REVENUE_COLUMN, LTM_ENABLED, get_ltm_period, get_ltm_label,
|
||||||
|
OUTPUT_DIR, CHART_DPI, CHART_BBOX
|
||||||
|
)
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# CHART FORMATTERS
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
def millions_formatter(x: float, pos: int) -> str:
    """Format a value already expressed in millions as '$X.Xm'.

    Intended for use with matplotlib's FuncFormatter on axes that show
    revenue in millions (data must be divided by 1e6 before plotting).

    Args:
        x: Value in millions (e.g. 99.9 represents $99.9m).
        pos: Tick position (required by FuncFormatter; unused).

    Returns:
        str: Display string such as "$99.9m".

    Example:
        >>> millions_formatter(99.9, 0)
        '$99.9m'
    """
    return '${:.1f}m'.format(x)
|
||||||
|
|
||||||
|
def thousands_formatter(x: float, pos: int) -> str:
    """Format a value already expressed in thousands as '$X.Xk'.

    Args:
        x: Value in thousands.
        pos: Tick position (required by FuncFormatter; unused).

    Returns:
        str: Display string such as "$99.9k".
    """
    return '${:.1f}k'.format(x)
|
||||||
|
|
||||||
|
def get_millions_formatter() -> FuncFormatter:
    """Build a matplotlib FuncFormatter that renders axis ticks in millions.

    Returns:
        FuncFormatter: Formatter wrapping millions_formatter.
    """
    formatter = FuncFormatter(millions_formatter)
    return formatter
|
||||||
|
|
||||||
|
def get_thousands_formatter() -> FuncFormatter:
    """Build a matplotlib FuncFormatter that renders axis ticks in thousands.

    Returns:
        FuncFormatter: Formatter wrapping thousands_formatter.
    """
    formatter = FuncFormatter(thousands_formatter)
    return formatter
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# LTM (Last Twelve Months) SETUP
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
def get_ltm_period_config():
    """Return LTM (Last Twelve Months) boundaries from configuration.

    Returns:
        tuple: (ltm_start, ltm_end) as pd.Period objects when LTM is
        enabled in config, otherwise (None, None).
    """
    if not LTM_ENABLED:
        return None, None
    return get_ltm_period()
|
||||||
|
|
||||||
|
def get_annual_data(df, year, ltm_start=None, ltm_end=None):
    """
    Get data for a specific year, using LTM for the most recent partial year

    For the configured LTM end year, rows are selected by the LTM month
    window (which may span two calendar years) instead of by calendar year,
    so the latest partial year compares apples-to-apples with full years.

    Args:
        df: DataFrame with 'Year' and 'YearMonth' columns
        year: Year to extract (int)
        ltm_start: LTM start period (defaults to config if None)
        ltm_end: LTM end period (defaults to config if None)

    Returns:
        tuple: (year_data DataFrame, year_label string)
    """
    # LTM_END_YEAR identifies which calendar year receives LTM treatment
    from config import LTM_END_YEAR

    # Get LTM period from config if not provided
    if ltm_start is None or ltm_end is None:
        ltm_start, ltm_end = get_ltm_period_config()

    # Use LTM for the most recent year if enabled
    if LTM_ENABLED and ltm_start and ltm_end and year == LTM_END_YEAR:
        if 'YearMonth' in df.columns:
            # Inclusive month-window filter; can straddle two calendar years
            year_data = df[(df['YearMonth'] >= ltm_start) & (df['YearMonth'] <= ltm_end)]
            # Prefer the descriptive LTM label (e.g. "2025 (LTM 9/2025)")
            year_label = get_ltm_label() or str(year)
        else:
            # Fallback if YearMonth not available
            year_data = df[df['Year'] == year]
            year_label = str(year)
    else:
        # Use full calendar year
        year_data = df[df['Year'] == year]
        year_label = str(year)

    return year_data, year_label
|
||||||
|
|
||||||
|
def calculate_annual_metrics(df, metrics_func, ltm_start=None, ltm_end=None):
    """
    Calculate annual metrics for all years, using LTM for the most recent year.

    Iterates the configured ANALYSIS_YEARS in order, extracts each year's
    rows via get_annual_data() (LTM window for the final partial year),
    and applies the caller-supplied metrics function.

    Args:
        df: DataFrame with 'Year' and 'YearMonth' columns
        metrics_func: Function that takes a DataFrame and returns a dict of metrics
        ltm_start: LTM start period (defaults to config if None)
        ltm_end: LTM end period (defaults to config if None)

    Returns:
        DataFrame with 'Year' index and metric columns (empty DataFrame
        when no configured year has any data)
    """
    from config import ANALYSIS_YEARS

    if ltm_start is None or ltm_end is None:
        ltm_start, ltm_end = get_ltm_period_config()

    # Fix: the original recomputed df['Year'].unique() on every loop
    # iteration; build the membership set once instead.
    available_years = set(df['Year'].unique())

    annual_data = []
    for year in sorted(ANALYSIS_YEARS):
        if year not in available_years:
            continue

        year_data, year_label = get_annual_data(df, year, ltm_start, ltm_end)
        if len(year_data) > 0:
            metrics = metrics_func(year_data)
            metrics['Year'] = year_label
            annual_data.append(metrics)

    if not annual_data:
        return pd.DataFrame()

    return pd.DataFrame(annual_data).set_index('Year')
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# MIXED TYPE HANDLING
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
def create_year_sort_column(df, year_col='Year'):
    """Build a numeric sort-key Series for a year column with mixed types.

    Calendar years (ints/floats) sort by value; the LTM label string
    (any string containing LTM_END_YEAR) sorts just after that year;
    anything unrecognized sorts last.

    Args:
        df: DataFrame containing the year column.
        year_col: Name of the year column.

    Returns:
        Series of numeric sort values aligned with df.
    """
    from config import LTM_END_YEAR

    def _sort_key(value):
        if isinstance(value, str) and str(LTM_END_YEAR) in value:
            # LTM label (e.g. "2025 (LTM 9/2025)") lands right after its year
            return float(LTM_END_YEAR) + 0.5
        if isinstance(value, (int, float)):
            return float(value)
        # Unrecognized labels fall to the end of the ordering
        return 9999

    return df[year_col].apply(_sort_key)
|
||||||
|
|
||||||
|
def sort_mixed_years(df, year_col='Year'):
    """Sort a DataFrame by a year column that mixes int and str values.

    Delegates key construction to create_year_sort_column() so calendar
    years order numerically and the LTM label lands immediately after
    its calendar year. The input DataFrame is not modified.

    Args:
        df: DataFrame to sort.
        year_col: Name of the year column.

    Returns:
        A new, sorted DataFrame.
    """
    working = df.copy()
    working['_Year_Sort'] = create_year_sort_column(working, year_col)
    working = working.sort_values('_Year_Sort').drop(columns=['_Year_Sort'])
    return working
|
||||||
|
|
||||||
|
def safe_year_labels(years):
    """Convert year values (int or str) into string tick labels.

    Args:
        years: Iterable of year values.

    Returns:
        List of string labels, one per input value.
    """
    return list(map(str, years))
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# CHART HELPERS
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
def setup_revenue_chart(ax, ylabel: str = 'Revenue (Millions USD)') -> None:
    """Apply the standard revenue styling (millions formatter, label, grid).

    CRITICAL: use this for every revenue chart so axes render as "$X.Xm".
    Plot data must already be divided by 1e6:

        ax.plot(revenue / 1e6, ...)   # correct
        ax.plot(revenue, ...)         # wrong - axis values will be mislabeled

    Args:
        ax: Matplotlib axis object to configure (modified in place).
        ylabel: Y-axis label (default: 'Revenue (Millions USD)').

    Returns:
        None

    See Also:
        - .cursor/rules/chart_formatting.md for detailed patterns
        - save_chart() for saving charts
    """
    ax.grid(True, alpha=0.3)
    ax.set_ylabel(ylabel)
    ax.yaxis.set_major_formatter(get_millions_formatter())
|
||||||
|
|
||||||
|
def save_chart(fig, filename, output_dir=None):
    """
    Save a matplotlib figure as a PNG under the output directory.

    Creates the output directory (including missing parent directories)
    if needed, then prints the saved path for traceability.

    Args:
        fig: Matplotlib figure object
        filename: Output filename (e.g., 'revenue_trend.png')
        output_dir: Output directory (defaults to config.OUTPUT_DIR)
    """
    if output_dir is None:
        output_dir = OUTPUT_DIR
    else:
        output_dir = Path(output_dir)

    # Fix: parents=True so nested output paths (e.g. charts/2025/) don't
    # raise FileNotFoundError on first use.
    output_dir.mkdir(parents=True, exist_ok=True)

    filepath = output_dir / filename
    fig.savefig(filepath, dpi=CHART_DPI, bbox_inches=CHART_BBOX, format='png')
    print(f"Chart saved: {filepath}")
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# DATA VALIDATION
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
def validate_dataframe(df, required_columns=None):
    """
    Validate that a DataFrame has required columns and usable data.

    By default requires the configured revenue column and 'Year';
    'YearMonth' is optional. (The old default path appended 'YearMonth'
    to the required list only when it was already present, a check that
    could never fail - that dead logic is removed here.)

    Args:
        df: DataFrame to validate
        required_columns: List of required column names
            (defaults to [REVENUE_COLUMN, 'Year'])

    Returns:
        tuple: (is_valid bool, error_message str) - message is "OK" on success
    """
    if required_columns is None:
        required_columns = [REVENUE_COLUMN, 'Year']

    missing_cols = [col for col in required_columns if col not in df.columns]
    if missing_cols:
        return False, f"Missing required columns: {missing_cols}"

    if len(df) == 0:
        return False, "DataFrame is empty"

    # Guard against a load that produced the revenue column with no values
    if REVENUE_COLUMN in df.columns:
        if df[REVENUE_COLUMN].isna().all():
            return False, f"All {REVENUE_COLUMN} values are NaN"

    return True, "OK"
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# PRICE CALCULATION
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
def calculate_price_per_unit(df, quantity_col=None, revenue_col=None):
    """Compute the weighted-average price per unit over valid transactions.

    Rows whose quantity is not in (MIN_QUANTITY, MAX_QUANTITY] are
    excluded before dividing total revenue by total quantity.

    Args:
        df: DataFrame with quantity and revenue columns.
        quantity_col: Quantity column name (defaults to config.QUANTITY_COLUMN).
        revenue_col: Revenue column name (defaults to config.REVENUE_COLUMN).

    Returns:
        float: Average price per unit, or NaN when it cannot be computed
        (missing quantity column, no valid rows, or zero total quantity).
    """
    from config import QUANTITY_COLUMN, REVENUE_COLUMN, MIN_QUANTITY, MAX_QUANTITY

    qty_col = quantity_col if quantity_col is not None else QUANTITY_COLUMN
    rev_col = revenue_col if revenue_col is not None else REVENUE_COLUMN

    if qty_col not in df.columns:
        return np.nan

    # Keep only rows with plausible quantities (outlier guard from config)
    valid_mask = (df[qty_col] > MIN_QUANTITY) & (df[qty_col] <= MAX_QUANTITY)
    valid = df[valid_mask].copy()

    if len(valid) == 0:
        return np.nan

    total_quantity = valid[qty_col].sum()
    if total_quantity == 0:
        return np.nan

    return valid[rev_col].sum() / total_quantity
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# OUTPUT FORMATTING
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
def format_currency(value: float, millions: bool = True) -> str:
    """Format a raw currency amount for console output.

    Args:
        value: Amount in plain units (not pre-scaled).
        millions: When True render as millions ('$X.XXm'),
            otherwise as thousands ('$X.XXk').

    Returns:
        str: e.g. '$1.00m', '$1.00k', or 'N/A' when the value is NaN.

    Example:
        >>> format_currency(1000000)
        '$1.00m'
        >>> format_currency(1000, millions=False)
        '$1.00k'
    """
    if pd.isna(value):
        return "N/A"

    divisor, suffix = (1e6, 'm') if millions else (1e3, 'k')
    return f"${value / divisor:.2f}{suffix}"
|
||||||
|
|
||||||
|
def print_annual_summary(annual_df, metric_col='Revenue', label='Revenue'):
    """Print a formatted per-year summary of one metric to the console.

    Args:
        annual_df: DataFrame of annual metrics indexed by Year.
        metric_col: Column name to print.
        label: Heading label for the metric.
    """
    print(f"\n{label} by Year:")
    print('-' * 40)
    for yr in annual_df.index:
        amount = annual_df.loc[yr, metric_col]
        print(f"  {yr}: {format_currency(amount)}")
    print()
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# DATA FILTERING HELPERS
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
def apply_exclusion_filters(df):
    """Drop rows matching the rules in config.EXCLUSION_FILTERS.

    Args:
        df: DataFrame to filter.

    Returns:
        Filtered DataFrame; the input unchanged when filtering is
        disabled or the rule is incomplete/inapplicable.
    """
    from config import EXCLUSION_FILTERS

    if not EXCLUSION_FILTERS.get('enabled', False):
        return df

    column = EXCLUSION_FILTERS.get('exclude_by_column')
    values = EXCLUSION_FILTERS.get('exclude_values', [])

    # Only filter when the rule is fully specified and the column exists
    if not (column and values and column in df.columns):
        return df

    before = len(df)
    kept = df[~df[column].isin(values)]
    dropped = before - len(kept)
    if dropped > 0:
        print(f"Excluded {dropped:,} rows based on {column} filter")
    return kept
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# INTERACTIVE VISUALIZATIONS (OPTIONAL - PLOTLY)
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
def create_interactive_chart(data, chart_type='line', title=None, xlabel=None, ylabel=None):
    """
    Create an interactive chart using Plotly (optional dependency)

    Args:
        data: dict with 'x' and 'y' sequences of chart data
        chart_type: Type of chart ('line', 'bar', 'scatter')
        title: Chart title
        xlabel: X-axis label
        ylabel: Y-axis label

    Returns:
        plotly.graph_objects.Figure: Plotly figure object

    Raises:
        ImportError: If plotly is not installed

    Example:
        fig = create_interactive_chart(
            {'x': [1, 2, 3], 'y': [10, 20, 30]},
            chart_type='line',
            title='Revenue Trend'
        )
        fig.show()
    """
    try:
        # Fix: removed unused `from plotly.subplots import make_subplots`
        import plotly.graph_objects as go
    except ImportError:
        raise ImportError(
            "plotly is required for interactive charts. Install with: pip install plotly"
        )

    fig = go.Figure()

    # All supported chart types expect {'x': [...], 'y': [...]} data
    if isinstance(data, dict) and 'x' in data and 'y' in data:
        if chart_type == 'line':
            fig.add_trace(go.Scatter(
                x=data['x'],
                y=data['y'],
                mode='lines+markers',
                name='Data'
            ))
        elif chart_type == 'scatter':
            # Fix: 'scatter' was documented but previously unimplemented
            fig.add_trace(go.Scatter(
                x=data['x'],
                y=data['y'],
                mode='markers',
                name='Data'
            ))
        elif chart_type == 'bar':
            fig.add_trace(go.Bar(
                x=data['x'],
                y=data['y'],
                name='Data'
            ))

    if title:
        fig.update_layout(title=title)
    if xlabel:
        fig.update_xaxes(title_text=xlabel)
    if ylabel:
        fig.update_yaxes(title_text=ylabel)

    fig.update_layout(
        template='plotly_white',
        hovermode='x unified'
    )

    return fig
|
||||||
|
|
||||||
|
def save_interactive_chart(fig, filename, output_dir=None):
    """
    Save an interactive Plotly figure as a standalone HTML file.

    Args:
        fig: Plotly figure object
        filename: Output filename (e.g., 'chart.html')
        output_dir: Output directory (defaults to config.OUTPUT_DIR)

    Returns:
        Path: Full path of the written HTML file
    """
    if output_dir is None:
        output_dir = OUTPUT_DIR
    else:
        output_dir = Path(output_dir)

    # Fix: parents=True so nested output paths don't raise on first use
    output_dir.mkdir(parents=True, exist_ok=True)
    filepath = output_dir / filename

    fig.write_html(str(filepath))
    print(f"Interactive chart saved: {filepath}")

    return filepath
|
||||||
277
config.py
Normal file
277
config.py
Normal file
@@ -0,0 +1,277 @@
|
|||||||
|
"""
|
||||||
|
Configuration file for sales analysis scripts
|
||||||
|
CONFIGURE THIS FILE FOR YOUR COMPANY'S SPECIFIC DATA STRUCTURE
|
||||||
|
|
||||||
|
This file should be customized based on:
|
||||||
|
- Your data file name and location
|
||||||
|
- Column names in your sales data
|
||||||
|
- Date range and LTM period
|
||||||
|
- Company-specific settings
|
||||||
|
|
||||||
|
CRITICAL: All column names, file paths, and settings are defined here.
|
||||||
|
Never hardcode these values in analysis scripts - always import from config.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
from config import REVENUE_COLUMN, DATE_COLUMN, get_data_path
|
||||||
|
revenue = df[REVENUE_COLUMN].sum() # ✅ Correct
|
||||||
|
revenue = df['USD'].sum() # ❌ Wrong - hardcoded
|
||||||
|
|
||||||
|
Quick Setup:
|
||||||
|
1. Run: python setup_wizard.py (interactive configuration)
|
||||||
|
2. Or manually edit this file following the TODO comments
|
||||||
|
3. Validate: python config_validator.py
|
||||||
|
|
||||||
|
See Also:
|
||||||
|
- .cursor/rules/analysis_patterns.md - How to use config values
|
||||||
|
- setup_wizard.py - Interactive configuration tool
|
||||||
|
- config_validator.py - Configuration validation
|
||||||
|
"""
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional, Tuple
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# COMPANY INFORMATION
|
||||||
|
# ============================================================================
|
||||||
|
# TODO: Update these values for your company
|
||||||
|
COMPANY_NAME = "Your Company Name" # Update this
|
||||||
|
ANALYSIS_DATE = "2026-01-12" # Update this to current date
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# DATA FILES
|
||||||
|
# ============================================================================
|
||||||
|
# TODO: Update with your actual data file name
|
||||||
|
DATA_FILE = 'sales_data.csv' # Update this to your CSV file name
|
||||||
|
OUTPUT_DIR = Path('charts')
|
||||||
|
REPORTS_DIR = Path('reports')
|
||||||
|
DATA_DIR = Path('data') # Optional: if data is in a subdirectory
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# DATA COLUMN MAPPINGS
|
||||||
|
# ============================================================================
|
||||||
|
# TODO: Map these to your actual column names
|
||||||
|
# These are the expected column names - update if your CSV uses different names
|
||||||
|
|
||||||
|
# Revenue column (REQUIRED)
|
||||||
|
REVENUE_COLUMN = 'USD' # Common alternatives: 'Amount', 'Revenue', 'Total', 'Sales'
|
||||||
|
|
||||||
|
# Date columns (at least one required)
|
||||||
|
DATE_COLUMN = 'InvoiceDate' # Primary date column
|
||||||
|
DATE_FALLBACK_COLUMNS = ['Month', 'Year'] # Fallback columns if primary is missing
|
||||||
|
|
||||||
|
# Customer/Account columns
|
||||||
|
CUSTOMER_COLUMN = 'Customer' # Common alternatives: 'Account', 'CustomerName', 'Client'
|
||||||
|
|
||||||
|
# Product/Item columns
|
||||||
|
ITEM_COLUMN = 'Item' # Common alternatives: 'Product', 'SKU', 'ItemCode'
|
||||||
|
PRODUCT_GROUP_COLUMN = 'ProductGroup' # Optional: for product categorization
|
||||||
|
QUANTITY_COLUMN = 'Quantity' # Optional: for price calculations
|
||||||
|
|
||||||
|
# Geographic columns (optional)
|
||||||
|
REGION_COLUMN = 'Region' # Optional: for geographic analysis
|
||||||
|
COUNTRY_COLUMN = 'Country' # Optional: for country-level analysis
|
||||||
|
|
||||||
|
# Segment/Category columns (optional - customize based on your data)
|
||||||
|
SEGMENT_COLUMNS = {
|
||||||
|
'Technology': 'Technology', # Optional: technology/product type
|
||||||
|
'EndMarket': 'EndMarket', # Optional: end market/industry
|
||||||
|
'ProductGroup': 'ProductGroup', # Optional: product category
|
||||||
|
}
|
||||||
|
|
||||||
|
# Invoice/Transaction columns
|
||||||
|
INVOICE_NUMBER_COLUMN = 'Invoice #' # Optional: for transaction-level analysis
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# DATE RANGE CONFIGURATION
|
||||||
|
# ============================================================================
|
||||||
|
# TODO: Update these based on your data and analysis needs
|
||||||
|
|
||||||
|
# Analysis years (years to include in analysis)
|
||||||
|
ANALYSIS_YEARS = [2021, 2022, 2023, 2024, 2025] # Update based on your data
|
||||||
|
|
||||||
|
# LTM (Last Twelve Months) Configuration
|
||||||
|
# For the most recent partial year, use LTM for apples-to-apples comparison
|
||||||
|
# Example: If latest data is through September 2025, use Oct 2024 - Sep 2025
|
||||||
|
LTM_ENABLED = True # Set to False if you have complete calendar years only
|
||||||
|
LTM_START_MONTH = 10 # Month number (1-12) for LTM start
|
||||||
|
LTM_START_YEAR = 2024 # Year for LTM start
|
||||||
|
LTM_END_MONTH = 9 # Month number (1-12) for LTM end
|
||||||
|
LTM_END_YEAR = 2025 # Year for LTM end
|
||||||
|
|
||||||
|
# Generate LTM period objects
|
||||||
|
# Derive the LTM boundaries and display label once at import time so other
# modules (e.g. config_validator) can import LTM_START/LTM_END/LTM_LABEL.
if LTM_ENABLED:
    LTM_START = pd.Period(f'{LTM_START_YEAR}-{LTM_START_MONTH:02d}', freq='M')
    LTM_END = pd.Period(f'{LTM_END_YEAR}-{LTM_END_MONTH:02d}', freq='M')
    # e.g. "2025 (LTM 9/2025)" - surfaced in chart titles via get_ltm_label()
    LTM_LABEL = f'{LTM_END_YEAR} (LTM {LTM_END_MONTH}/{LTM_END_YEAR})'
else:
    # LTM disabled: expose explicit None sentinels for all three names
    LTM_START = None
    LTM_END = None
    LTM_LABEL = None
|
||||||
|
|
||||||
|
# Data date range (filter data to this range)
|
||||||
|
MIN_YEAR = 2021 # Minimum year to include
|
||||||
|
MAX_DATE = pd.Timestamp('2025-09-30') # Maximum date to include (update based on your data)
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# CHART SETTINGS
|
||||||
|
# ============================================================================
|
||||||
|
CHART_DPI = 300
|
||||||
|
CHART_FORMAT = 'png'
|
||||||
|
CHART_BBOX = 'tight'
|
||||||
|
CHART_STYLE = 'seaborn-v0_8' # Options: 'default', 'ggplot', 'seaborn-v0_8', etc.
|
||||||
|
|
||||||
|
# Chart size presets
|
||||||
|
CHART_SIZES = {
|
||||||
|
'small': (6, 4),
|
||||||
|
'medium': (10, 6),
|
||||||
|
'large': (12, 8),
|
||||||
|
'wide': (14, 6)
|
||||||
|
}
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# DATA FILTERING
|
||||||
|
# ============================================================================
|
||||||
|
# Quantity filtering for price calculations (exclude outliers)
|
||||||
|
MIN_QUANTITY = 0 # Minimum valid quantity
|
||||||
|
MAX_QUANTITY = 1000 # Maximum valid quantity (adjust based on your data)
|
||||||
|
|
||||||
|
# Revenue filtering (optional - exclude negative values, returns, etc.)
|
||||||
|
EXCLUDE_NEGATIVE_REVENUE = False # Set to True to exclude negative revenue (returns/credits)
|
||||||
|
MIN_REVENUE = None # Optional: minimum revenue threshold
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# EXCLUSION FILTERS (Optional)
|
||||||
|
# ============================================================================
|
||||||
|
# Use this section to exclude specific segments, customers, or products
|
||||||
|
# Example: Exclude a business unit, test accounts, etc.
|
||||||
|
|
||||||
|
EXCLUSION_FILTERS = {
|
||||||
|
'enabled': False, # Set to True to enable exclusions
|
||||||
|
'exclude_by_column': None, # Column name to filter on (e.g., 'Country', 'Segment')
|
||||||
|
'exclude_values': [], # List of values to exclude (e.g., ['KVT', 'Test'])
|
||||||
|
}
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# VALIDATION THRESHOLDS (Optional)
|
||||||
|
# ============================================================================
|
||||||
|
# Expected revenue ranges for validation (update based on your company)
|
||||||
|
# These are used to validate that data loading is working correctly
|
||||||
|
VALIDATION_ENABLED = False # Set to True to enable validation
|
||||||
|
EXPECTED_REVENUE = {} # Example: {2021: 99_880_000, 2024: 89_990_000}
|
||||||
|
REVENUE_TOLERANCE_PCT = 0.01 # 1% tolerance for validation
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# HELPER FUNCTIONS
|
||||||
|
# ============================================================================
|
||||||
|
def ensure_directories() -> None:
    """
    Create output directories (charts/, reports/) if they don't exist

    Called automatically by get_chart_path() and get_report_path().

    Note: DATA_DIR is deliberately NOT created here. get_data_path()
    uses DATA_DIR.exists() to decide whether data lives in a
    subdirectory, so creating it would change data-lookup behavior.
    (The old `if DATA_DIR.exists(): DATA_DIR.mkdir(...)` branch was a
    no-op - it only ran when the directory already existed - and has
    been removed.)

    Returns:
        None: Creates directories in place
    """
    OUTPUT_DIR.mkdir(exist_ok=True)
    REPORTS_DIR.mkdir(exist_ok=True)
|
||||||
|
|
||||||
|
def get_chart_path(filename: str) -> Path:
    """Return the full path for a chart file inside OUTPUT_DIR.

    Ensures the output directories exist before returning.

    Args:
        filename: Chart filename (e.g. 'revenue_trend.png').

    Returns:
        Path: OUTPUT_DIR joined with filename.
    """
    ensure_directories()
    return OUTPUT_DIR.joinpath(filename)
|
||||||
|
|
||||||
|
def get_report_path(filename: str) -> Path:
    """Return the full path for a report file inside REPORTS_DIR.

    Ensures the output directories exist before returning.

    Args:
        filename: Report filename (e.g. 'analysis_report.pdf').

    Returns:
        Path: REPORTS_DIR joined with filename.
    """
    ensure_directories()
    return REPORTS_DIR.joinpath(filename)
|
||||||
|
|
||||||
|
def get_data_path(filename: Optional[str] = None) -> Path:
    """Return the full path to a data file.

    Resolution order: DATA_DIR when that directory exists, otherwise the
    current working directory. Falls back to config.DATA_FILE when no
    filename is supplied.

    Args:
        filename: Optional filename override (defaults to DATA_FILE).

    Returns:
        Path: Full path to the data file.

    Example:
        >>> data_path = get_data_path()
        >>> print(f"Loading from: {data_path}")
    """
    name = filename if filename is not None else DATA_FILE
    if DATA_DIR.exists():
        return DATA_DIR / name
    return Path(name)
|
||||||
|
|
||||||
|
def get_ltm_period() -> Tuple[Optional[pd.Period], Optional[pd.Period]]:
    """Return the configured LTM (Last Twelve Months) period boundaries.

    Returns:
        (LTM_START, LTM_END) when LTM is enabled and both boundaries
        are configured; otherwise (None, None).

    Example:
        >>> ltm_start, ltm_end = get_ltm_period()
        >>> if ltm_start and ltm_end:
        ...     print(f"LTM: {ltm_start} to {ltm_end}")

    See Also:
        - get_ltm_label() - formatted LTM label string
        - .cursor/rules/ltm_methodology.md - LTM explanation
    """
    if not (LTM_ENABLED and LTM_START and LTM_END):
        return None, None
    return LTM_START, LTM_END
|
||||||
|
|
||||||
|
def get_ltm_label() -> Optional[str]:
    """
    Return the LTM label string for display, or None when LTM is disabled.

    The label is the configured LTM_LABEL (e.g. "2025 (LTM 9/2025)") and is
    intended for chart titles and axis labels.

    Returns:
        Optional[str]: LTM label string, or None if LTM disabled

    Example:
        >>> from config import get_ltm_label
        >>> ltm_label = get_ltm_label()
        >>> if ltm_label:
        ...     title = f'Revenue Trend\n({ltm_label})'

    See Also:
        - get_ltm_period() - Get LTM period objects
        - .cursor/rules/ltm_methodology.md - LTM usage guide
    """
    if not LTM_ENABLED:
        return None
    return LTM_LABEL
|
||||||
214
config_validator.py
Normal file
214
config_validator.py
Normal file
@@ -0,0 +1,214 @@
|
|||||||
|
"""
|
||||||
|
Configuration validation utility
|
||||||
|
Validates configuration settings against data to catch errors early
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
from config_validator import validate_config
|
||||||
|
|
||||||
|
# Validate configuration
|
||||||
|
errors, warnings = validate_config(df)
|
||||||
|
if errors:
|
||||||
|
print("Configuration errors found:", errors)
|
||||||
|
"""
|
||||||
|
import pandas as pd
|
||||||
|
from pathlib import Path
|
||||||
|
from config import (
|
||||||
|
DATA_FILE, REVENUE_COLUMN, DATE_COLUMN, DATE_FALLBACK_COLUMNS,
|
||||||
|
CUSTOMER_COLUMN, ITEM_COLUMN, QUANTITY_COLUMN,
|
||||||
|
MIN_YEAR, MAX_DATE, ANALYSIS_YEARS,
|
||||||
|
LTM_ENABLED, LTM_START, LTM_END, LTM_START_YEAR, LTM_END_YEAR,
|
||||||
|
EXCLUSION_FILTERS, get_data_path
|
||||||
|
)
|
||||||
|
|
||||||
|
def validate_config(df=None):
    """
    Validate configuration against data.

    Runs a series of checks (required columns, date coverage, numeric
    revenue, configured date ranges, LTM settings, exclusion filters,
    optional columns, data file existence) and collects problems as
    errors (must fix) or warnings (should review).

    Args:
        df: Optional DataFrame to validate against. If None, attempts to load data.

    Returns:
        tuple: (errors list, warnings list)

    Example:
        errors, warnings = validate_config(df)
        if errors:
            for error in errors:
                print(f"ERROR: {error}")
        if warnings:
            for warning in warnings:
                print(f"WARNING: {warning}")
    """
    errors = []
    warnings = []

    # Load data if not provided; abort early if that is impossible since
    # every later check needs a DataFrame.
    if df is None:
        try:
            from data_loader import load_sales_data
            data_path = get_data_path()
            if not data_path.exists():
                errors.append(f"Data file not found: {data_path}")
                return errors, warnings
            df = load_sales_data(data_path)
        except Exception as e:
            errors.append(f"Could not load data for validation: {e}")
            return errors, warnings

    # 1. Validate required columns exist
    required_columns = [REVENUE_COLUMN, DATE_COLUMN]
    for col in required_columns:
        if col not in df.columns:
            errors.append(f"Required column '{col}' not found in data. Available columns: {list(df.columns)[:10]}")

    # 2. Validate date column has valid dates
    if DATE_COLUMN in df.columns:
        date_coverage = df[DATE_COLUMN].notna().sum() / len(df) * 100
        if date_coverage < 50:
            errors.append(f"Date coverage is very low ({date_coverage:.1f}%). Check date column configuration.")
        elif date_coverage < 90:
            warnings.append(f"Date coverage is {date_coverage:.1f}%. Consider adding fallback date columns.")

    # 3. Validate fallback date columns
    if DATE_FALLBACK_COLUMNS:
        missing_fallbacks = [col for col in DATE_FALLBACK_COLUMNS if col not in df.columns]
        if missing_fallbacks:
            warnings.append(f"Fallback date columns not found: {missing_fallbacks}")

    # 4. Validate revenue column is numeric
    if REVENUE_COLUMN in df.columns:
        try:
            # FIX: count validity on the coerced series. Previously the
            # result of to_numeric() was discarded and notna() was taken on
            # the original column, so non-numeric strings counted as valid.
            numeric_revenue = pd.to_numeric(df[REVENUE_COLUMN], errors='coerce')
            valid_revenue = numeric_revenue.notna().sum()
            if valid_revenue == 0:
                errors.append(f"Revenue column '{REVENUE_COLUMN}' has no valid numeric values")
            elif valid_revenue < len(df) * 0.9:
                warnings.append(f"Revenue column has {len(df) - valid_revenue} invalid values")
        except Exception:
            errors.append(f"Revenue column '{REVENUE_COLUMN}' cannot be converted to numeric")

    # 5. Validate date range
    if DATE_COLUMN in df.columns and df[DATE_COLUMN].notna().any():
        min_date_in_data = df[DATE_COLUMN].min()
        max_date_in_data = df[DATE_COLUMN].max()

        if MIN_YEAR and min_date_in_data.year > MIN_YEAR:
            warnings.append(f"MIN_YEAR ({MIN_YEAR}) is earlier than earliest data ({min_date_in_data.year})")

        if MAX_DATE and max_date_in_data > MAX_DATE:
            warnings.append(f"MAX_DATE ({MAX_DATE.date()}) is earlier than latest data ({max_date_in_data.date()})")

    # 6. Validate analysis years
    if 'Year' in df.columns:
        available_years = sorted(df['Year'].unique())
        missing_years = [year for year in ANALYSIS_YEARS if year not in available_years]
        if missing_years:
            warnings.append(f"ANALYSIS_YEARS includes years not in data: {missing_years}")

    # 7. Validate LTM configuration
    if LTM_ENABLED:
        if LTM_START is None or LTM_END is None:
            errors.append("LTM_ENABLED is True but LTM_START or LTM_END is None")
        else:
            if LTM_START > LTM_END:
                errors.append(f"LTM_START ({LTM_START}) is after LTM_END ({LTM_END})")

            if 'YearMonth' in df.columns:
                available_periods = df['YearMonth'].unique()
                if LTM_START not in available_periods:
                    warnings.append(f"LTM_START ({LTM_START}) not found in data")
                if LTM_END not in available_periods:
                    warnings.append(f"LTM_END ({LTM_END}) not found in data")

    # 8. Validate exclusion filters
    if EXCLUSION_FILTERS.get('enabled', False):
        exclude_col = EXCLUSION_FILTERS.get('exclude_by_column')
        if exclude_col:
            if exclude_col not in df.columns:
                errors.append(f"Exclusion filter column '{exclude_col}' not found in data")
            else:
                exclude_values = EXCLUSION_FILTERS.get('exclude_values', [])
                if exclude_values:
                    available_values = df[exclude_col].unique()
                    invalid_values = [v for v in exclude_values if v not in available_values]
                    if invalid_values:
                        warnings.append(f"Exclusion filter values not found in data: {invalid_values}")

    # 9. Validate optional columns (warnings only)
    optional_columns = {
        'Customer': CUSTOMER_COLUMN,
        'Item': ITEM_COLUMN,
        'Quantity': QUANTITY_COLUMN
    }

    for col_type, col_name in optional_columns.items():
        if col_name and col_name not in df.columns:
            warnings.append(f"Optional {col_type} column '{col_name}' not found. Some analyses may not work.")

    # 10. Validate data file exists (also when df was passed in directly)
    data_path = get_data_path()
    if not data_path.exists():
        errors.append(f"Data file not found: {data_path}")

    return errors, warnings
|
||||||
|
|
||||||
|
def print_validation_report(errors, warnings):
    """
    Print a formatted validation report.

    Args:
        errors: List of error messages
        warnings: List of warning messages

    Returns:
        bool: True when there are no errors, False otherwise
    """
    divider = "=" * 60
    print("\n" + divider)
    print("Configuration Validation Report")
    print(divider)

    if errors:
        print(f"\n❌ ERRORS ({len(errors)}):")
        for i, error in enumerate(errors, 1):
            print(f"  {i}. {error}")
    else:
        print("\n✅ No configuration errors found")

    if warnings:
        print(f"\n⚠️ WARNINGS ({len(warnings)}):")
        for i, warning in enumerate(warnings, 1):
            print(f"  {i}. {warning}")
    else:
        print("\n✅ No warnings")

    print("\n" + divider)

    # Validation passes only when the error list is empty
    return not errors
|
||||||
|
|
||||||
|
def validate_and_report(df=None):
    """
    Validate configuration and print the resulting report.

    Args:
        df: Optional DataFrame to validate against

    Returns:
        bool: True if no errors, False otherwise
    """
    found_errors, found_warnings = validate_config(df)
    return print_validation_report(found_errors, found_warnings)
|
||||||
|
|
||||||
|
# ============================================================================
# STANDALONE VALIDATION SCRIPT
# ============================================================================

if __name__ == "__main__":
    """Run configuration validation"""
    print("Validating configuration...")
    if validate_and_report():
        print("\n✅ Configuration is valid!")
        exit(0)
    print("\n❌ Configuration has errors. Please fix them before running analyses.")
    exit(1)
|
||||||
224
data_loader.py
Normal file
224
data_loader.py
Normal file
@@ -0,0 +1,224 @@
|
|||||||
|
"""
|
||||||
|
Generic data loading utility with flexible date handling
|
||||||
|
Handles various date column formats and fallback logic
|
||||||
|
|
||||||
|
This loader is designed to work with different CSV structures by:
|
||||||
|
1. Trying primary date column first
|
||||||
|
2. Falling back to alternative date columns if needed
|
||||||
|
3. Ensuring 100% date coverage
|
||||||
|
"""
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
from pathlib import Path
|
||||||
|
from config import (
|
||||||
|
REVENUE_COLUMN, DATE_COLUMN, DATE_FALLBACK_COLUMNS,
|
||||||
|
get_data_path
|
||||||
|
)
|
||||||
|
|
||||||
|
def load_sales_data(filepath=None):
    """
    Load sales data with flexible date handling.

    This function provides intelligent data loading with fallback logic:
    1. Loads the CSV file
    2. Converts the revenue column to numeric
    3. Attempts to parse dates using the primary date column
    4. Falls back to alternative date columns if needed (100% coverage)
    5. Creates Year and YearMonth columns for analysis

    CRITICAL: Always use this function instead of pd.read_csv() directly.
    This ensures proper date parsing with fallback logic.

    Args:
        filepath: Path to the CSV file (defaults to config.DATA_FILE).
            Can be str, Path, or None (uses config.get_data_path())

    Returns:
        pd.DataFrame: DataFrame with properly parsed dates and revenue.
            Includes 'Year' and 'YearMonth' columns.

    Raises:
        FileNotFoundError: If data file doesn't exist.
            Error message includes file path and suggests checking config.py
        ValueError: If required columns (REVENUE_COLUMN) are missing.
            Error message lists available columns and suggests updating config.py

    Example:
        >>> from data_loader import load_sales_data
        >>> from config import get_data_path
        >>> df = load_sales_data(get_data_path())

    See Also:
        - .cursor/rules/data_loading.md for detailed patterns
        - config.py for column name configuration
    """
    # Resolve the data file path
    if filepath is None:
        filepath = get_data_path()
    else:
        filepath = Path(filepath)

    if not filepath.exists():
        raise FileNotFoundError(
            f"Data file not found: {filepath}\n"
            f"Please update config.py with the correct DATA_FILE path."
        )

    # Load CSV
    print(f"Loading data from: {filepath}")
    df = pd.read_csv(filepath, low_memory=False)
    print(f"Loaded {len(df):,} rows")

    # Validate required columns
    if REVENUE_COLUMN not in df.columns:
        raise ValueError(
            f"Required column '{REVENUE_COLUMN}' not found in data.\n"
            f"Available columns: {list(df.columns)}\n"
            f"Please update config.py REVENUE_COLUMN to match your data."
        )

    # Convert revenue column to numeric (invalid values become NaN)
    df[REVENUE_COLUMN] = pd.to_numeric(df[REVENUE_COLUMN], errors='coerce')

    missing_revenue = df[REVENUE_COLUMN].isna().sum()
    if missing_revenue > 0:
        print(f"Warning: {missing_revenue:,} rows have missing/invalid revenue values")

    # FIX: preserve the original CSV 'Year' column BEFORE it is overwritten
    # with the year extracted from parsed dates. Previously the Year
    # backfill read the already-overwritten column, making it a no-op.
    original_year = None
    if 'Year' in df.columns:
        original_year = pd.to_numeric(df['Year'], errors='coerce')

    # Working date column accumulates the best available date per row
    df['WorkingDate'] = pd.NaT

    # 1) Try primary date column first
    if DATE_COLUMN in df.columns:
        print(f"Attempting to parse {DATE_COLUMN}...")
        df['Date_Parsed'] = pd.to_datetime(df[DATE_COLUMN], errors='coerce', format='mixed')
        parsed_count = df['Date_Parsed'].notna().sum()
        df.loc[df['Date_Parsed'].notna(), 'WorkingDate'] = df.loc[df['Date_Parsed'].notna(), 'Date_Parsed']
        print(f"  Parsed {parsed_count:,} dates from {DATE_COLUMN}")
    else:
        print(f"Warning: Primary date column '{DATE_COLUMN}' not found")

    # 2) Use fallback date columns, in configured order, for rows still missing
    if DATE_FALLBACK_COLUMNS:
        for fallback_col in DATE_FALLBACK_COLUMNS:
            if fallback_col in df.columns:
                missing_dates = df['WorkingDate'].isna()
                if missing_dates.sum() > 0:
                    print(f"Using fallback column: {fallback_col}...")
                    fallback_parsed = pd.to_datetime(
                        df.loc[missing_dates, fallback_col],
                        errors='coerce',
                        format='mixed'
                    )
                    newly_parsed = missing_dates & fallback_parsed.notna()
                    if newly_parsed.sum() > 0:
                        df.loc[newly_parsed, 'WorkingDate'] = fallback_parsed[newly_parsed]
                        print(f"  Parsed {newly_parsed.sum():,} additional dates from {fallback_col}")

    # 3) Final fallback: construct Jan-1 dates from the Year column
    if 'Year' in df.columns and df['WorkingDate'].isna().sum() > 0:
        missing_dates = df['WorkingDate'].isna()
        year_values = pd.to_numeric(df.loc[missing_dates, 'Year'], errors='coerce')
        valid_years = missing_dates & year_values.notna()
        if valid_years.sum() > 0:
            print(f"Using Year column for remaining {valid_years.sum():,} rows...")
            df.loc[valid_years, 'WorkingDate'] = pd.to_datetime(
                df.loc[valid_years, 'Year'].astype(int).astype(str) + '-01-01',
                errors='coerce'
            )

    # Promote WorkingDate to the primary date column, then drop temporaries
    df[DATE_COLUMN] = df['WorkingDate']
    df = df.drop(columns=['Date_Parsed', 'WorkingDate'], errors='ignore')

    # Extract Year from the parsed dates
    df['Year'] = df[DATE_COLUMN].dt.year

    # Backfill Year from the preserved original column where dates are missing
    if original_year is not None:
        fillable = df['Year'].isna() & original_year.notna()
        if fillable.sum() > 0:
            df.loc[fillable, 'Year'] = original_year[fillable]

    # Create YearMonth for monthly analysis
    if DATE_COLUMN in df.columns:
        df['YearMonth'] = df[DATE_COLUMN].dt.to_period('M')

    # Report date coverage
    total_rows = len(df)
    date_coverage = df[DATE_COLUMN].notna().sum()
    coverage_pct = (date_coverage / total_rows * 100) if total_rows > 0 else 0
    print(f"Date coverage: {date_coverage:,} / {total_rows:,} rows ({coverage_pct:.1f}%)")

    if coverage_pct < 100:
        print(f"Warning: {total_rows - date_coverage:,} rows have missing dates")

    # Report date range
    if df[DATE_COLUMN].notna().any():
        min_date = df[DATE_COLUMN].min()
        max_date = df[DATE_COLUMN].max()
        print(f"Date range: {min_date.strftime('%Y-%m-%d')} to {max_date.strftime('%Y-%m-%d')}")

    return df
|
||||||
|
|
||||||
|
def validate_data_structure(df: pd.DataFrame) -> tuple[bool, str]:
    """
    Validate that loaded data has the expected structure.

    Checks required columns and basic data quality, returning an
    actionable message when validation fails.

    Args:
        df: DataFrame to validate (should be result of load_sales_data())

    Returns:
        tuple[bool, str]: (is_valid, error_message)
        - is_valid: True if data structure is valid, False otherwise
        - error_message: "OK" if valid, otherwise descriptive error message

    Example:
        >>> df = load_sales_data(get_data_path())
        >>> is_valid, msg = validate_data_structure(df)
        >>> if not is_valid:
        ...     print(f"ERROR: {msg}")

    See Also:
        - load_sales_data() - Load data before validating
        - config_validator.py - Comprehensive configuration validation
    """
    from config import REVENUE_COLUMN, DATE_COLUMN

    problems = []

    # Required columns must be present
    for required in (REVENUE_COLUMN, DATE_COLUMN):
        if required not in df.columns:
            problems.append(f"Missing required column: {required}")

    # Basic data-quality checks
    if len(df) == 0:
        problems.append("DataFrame is empty")

    if REVENUE_COLUMN in df.columns:
        if df[REVENUE_COLUMN].isna().all():
            problems.append(f"All {REVENUE_COLUMN} values are NaN")
        if df[REVENUE_COLUMN].notna().sum() == 0:
            problems.append(f"No valid {REVENUE_COLUMN} values")

    if DATE_COLUMN in df.columns:
        if df[DATE_COLUMN].isna().all():
            problems.append(f"All {DATE_COLUMN} values are NaN")

    if problems:
        return False, "; ".join(problems)
    return True, "OK"
|
||||||
285
data_processing.py
Normal file
285
data_processing.py
Normal file
@@ -0,0 +1,285 @@
|
|||||||
|
"""
|
||||||
|
Data processing utilities
|
||||||
|
Common data cleaning and transformation helpers
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
from data_processing import clean_data, create_pivot_table, prepare_time_series
|
||||||
|
|
||||||
|
# Clean data
|
||||||
|
df_clean = clean_data(df)
|
||||||
|
|
||||||
|
# Create pivot table
|
||||||
|
pivot = create_pivot_table(df, index='Year', columns='Product', values='Revenue')
|
||||||
|
"""
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
from config import REVENUE_COLUMN, DATE_COLUMN, MIN_QUANTITY, MAX_QUANTITY
|
||||||
|
|
||||||
|
def clean_data(df, remove_duplicates=True, handle_missing_dates=True):
    """
    Apply common cleaning operations to a sales DataFrame.

    Args:
        df: DataFrame to clean
        remove_duplicates: Whether to remove duplicate rows
        handle_missing_dates: Whether to report rows with missing dates

    Returns:
        DataFrame: Cleaned copy of the input
    """
    cleaned = df.copy()

    # Drop exact duplicate rows and report how many were removed
    if remove_duplicates:
        before = len(cleaned)
        cleaned = cleaned.drop_duplicates()
        dropped = before - len(cleaned)
        if dropped > 0:
            print(f"Removed {dropped:,} duplicate rows")

    # Report (but do not drop) rows whose date is missing
    if handle_missing_dates and DATE_COLUMN in cleaned.columns:
        missing_dates = cleaned[DATE_COLUMN].isna().sum()
        if missing_dates > 0:
            print(f"Warning: {missing_dates:,} rows have missing dates")

    # Report negative revenue; removal is deliberately left to the caller
    if REVENUE_COLUMN in cleaned.columns:
        negative_revenue = (cleaned[REVENUE_COLUMN] < 0).sum()
        if negative_revenue > 0:
            print(f"Found {negative_revenue:,} rows with negative revenue")
            # Optionally remove: cleaned = cleaned[cleaned[REVENUE_COLUMN] >= 0]

    return cleaned
|
||||||
|
|
||||||
|
def create_pivot_table(df, index, columns=None, values=None, aggfunc='sum', fill_value=0):
    """
    Build a pivot table with common defaults for sales analysis.

    Args:
        df: DataFrame
        index: Column(s) to use as index
        columns: Column(s) to use as columns
        values: Column(s) to aggregate (defaults to the configured revenue
            column when present)
        aggfunc: Aggregation function (default: 'sum')
        fill_value: Value to fill missing cells (default: 0)

    Returns:
        DataFrame: Pivot table
    """
    target_values = values
    if target_values is None and REVENUE_COLUMN in df.columns:
        target_values = REVENUE_COLUMN

    return pd.pivot_table(
        df,
        index=index,
        columns=columns,
        values=target_values,
        aggfunc=aggfunc,
        fill_value=fill_value,
    )
|
||||||
|
|
||||||
|
def prepare_time_series(df, date_column=None, value_column=None, freq='M'):
    """
    Aggregate a DataFrame into a resampled time series.

    Args:
        df: DataFrame
        date_column: Date column name (defaults to config.DATE_COLUMN)
        value_column: Value column to aggregate (defaults to config.REVENUE_COLUMN)
        freq: Frequency for resampling ('D', 'W', 'M', 'Q', 'Y')

    Returns:
        Series: Time series of summed values at the requested frequency

    Raises:
        ValueError: If the date or value column is not present.
    """
    date_col = DATE_COLUMN if date_column is None else date_column
    value_col = REVENUE_COLUMN if value_column is None else value_column

    # Guard clauses: both columns must exist before any work is done
    if date_col not in df.columns:
        raise ValueError(f"Date column '{date_col}' not found")
    if value_col not in df.columns:
        raise ValueError(f"Value column '{value_col}' not found")

    # Work on a copy so the caller's frame is untouched
    work = df.copy()
    work[date_col] = pd.to_datetime(work[date_col], errors='coerce')

    return work.set_index(date_col)[value_col].resample(freq).sum()
|
||||||
|
|
||||||
|
def aggregate_by_period(df, period='year', date_column=None, value_column=None):
    """
    Aggregate values by calendar period.

    Args:
        df: DataFrame
        period: Period type ('year', 'month', 'quarter')
        date_column: Date column name (defaults to config.DATE_COLUMN)
        value_column: Value column to aggregate (defaults to config.REVENUE_COLUMN)

    Returns:
        DataFrame: Columns Period, Total, Count, Average

    Raises:
        ValueError: If an unknown period type is requested.
    """
    date_col = DATE_COLUMN if date_column is None else date_column
    value_col = REVENUE_COLUMN if value_column is None else value_column

    work = df.copy()
    work[date_col] = pd.to_datetime(work[date_col], errors='coerce')

    # Dispatch table instead of an if/elif chain
    extractors = {
        'year': lambda s: s.dt.year,
        'month': lambda s: s.dt.to_period('M'),
        'quarter': lambda s: s.dt.to_period('Q'),
    }
    if period not in extractors:
        raise ValueError(f"Unknown period: {period}")
    work['Period'] = extractors[period](work[date_col])

    summary = work.groupby('Period')[value_col].agg(['sum', 'count', 'mean']).reset_index()
    summary.columns = ['Period', 'Total', 'Count', 'Average']
    return summary
|
||||||
|
|
||||||
|
def filter_outliers(df, column, method='iqr', lower_bound=None, upper_bound=None):
    """
    Remove outlier rows from a DataFrame based on one column.

    Args:
        df: DataFrame
        column: Column name to filter on
        method: 'iqr' (interquartile range) or 'zscore' (mean ± 3σ)
        lower_bound: Manual lower bound (overrides the computed one)
        upper_bound: Manual upper bound (overrides the computed one)

    Returns:
        DataFrame: Rows whose value lies within [lower, upper]

    Raises:
        ValueError: If an unknown method is requested.
    """
    # Compute default bounds for the chosen method
    if method == 'iqr':
        first_q = df[column].quantile(0.25)
        third_q = df[column].quantile(0.75)
        spread = third_q - first_q
        default_lower = first_q - 1.5 * spread
        default_upper = third_q + 1.5 * spread
    elif method == 'zscore':
        center = df[column].mean()
        sigma = df[column].std()
        default_lower = center - 3 * sigma
        default_upper = center + 3 * sigma
    else:
        raise ValueError(f"Unknown method: {method}")

    # Manual bounds take precedence over computed ones
    lower = default_lower if lower_bound is None else lower_bound
    upper = default_upper if upper_bound is None else upper_bound

    initial_count = len(df)
    kept = df[(df[column] >= lower) & (df[column] <= upper)].copy()
    removed = initial_count - len(kept)

    if removed > 0:
        print(f"Removed {removed:,} outliers from {column} ({removed/initial_count*100:.1f}%)")

    return kept
|
||||||
|
|
||||||
|
def normalize_column(df, column, method='min_max'):
    """
    Normalize a column to a standard scale.

    Args:
        df: DataFrame
        column: Column name to normalize
        method: Normalization method ('min_max' scales to [0, 1];
            'zscore' standardizes to zero mean, unit std)

    Returns:
        Series: Normalized values (all zeros when the column is constant)

    Raises:
        ValueError: If an unknown method is requested.
    """
    values = df[column]

    if method == 'min_max':
        low = values.min()
        high = values.max()
        if high - low == 0:
            # Constant column: avoid division by zero
            return pd.Series([0] * len(df), index=df.index)
        return (values - low) / (high - low)

    if method == 'zscore':
        center = values.mean()
        sigma = values.std()
        if sigma == 0:
            # Constant column: avoid division by zero
            return pd.Series([0] * len(df), index=df.index)
        return (values - center) / sigma

    raise ValueError(f"Unknown method: {method}")
|
||||||
|
|
||||||
|
def create_derived_columns(df):
    """
    Add commonly used derived columns (calendar parts, unit price).

    Args:
        df: DataFrame

    Returns:
        DataFrame: Copy of the input with derived columns added
    """
    from config import QUANTITY_COLUMN

    enriched = df.copy()

    # Calendar parts derived from the date column, added only when absent
    if DATE_COLUMN in enriched.columns:
        enriched[DATE_COLUMN] = pd.to_datetime(enriched[DATE_COLUMN], errors='coerce')
        dates = enriched[DATE_COLUMN]

        if 'Year' not in enriched.columns:
            enriched['Year'] = dates.dt.year
        if 'Month' not in enriched.columns:
            enriched['Month'] = dates.dt.month
        if 'Quarter' not in enriched.columns:
            enriched['Quarter'] = dates.dt.quarter
        if 'YearMonth' not in enriched.columns:
            enriched['YearMonth'] = dates.dt.to_period('M')

    # Unit price when both quantity and revenue are available;
    # zero quantities become NaN prices rather than dividing by zero
    if QUANTITY_COLUMN in enriched.columns and REVENUE_COLUMN in enriched.columns:
        enriched['Price_Per_Unit'] = enriched[REVENUE_COLUMN] / enriched[QUANTITY_COLUMN].replace(0, np.nan)

    return enriched
|
||||||
|
|
||||||
|
# ============================================================================
# EXAMPLE USAGE
# ============================================================================

if __name__ == "__main__":
    """Example usage"""
    # Build a small synthetic dataset for demonstration
    sample = pd.DataFrame({
        'InvoiceDate': pd.date_range('2023-01-01', periods=100, freq='D'),
        'USD': np.random.normal(1000, 200, 100),
        'Quantity': np.random.randint(1, 100, 100),
    })

    cleaned = clean_data(sample)
    print(f"Cleaned data: {len(cleaned)} rows")

    cleaned['Year'] = cleaned['InvoiceDate'].dt.year
    pivot = create_pivot_table(cleaned, index='Year', values='USD')
    print("\nPivot table:")
    print(pivot)

    ts = prepare_time_series(cleaned, freq='M')
    print(f"\nTime series: {len(ts)} periods")
|
||||||
344
data_quality.py
Normal file
344
data_quality.py
Normal file
@@ -0,0 +1,344 @@
|
|||||||
|
"""
|
||||||
|
Data quality reporting utility
|
||||||
|
Generates comprehensive data quality reports
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
from data_quality import generate_data_quality_report, print_data_quality_report
|
||||||
|
|
||||||
|
# Generate and print report
|
||||||
|
report = generate_data_quality_report(df)
|
||||||
|
print_data_quality_report(report)
|
||||||
|
"""
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
from config import (
|
||||||
|
REVENUE_COLUMN, DATE_COLUMN, CUSTOMER_COLUMN, ITEM_COLUMN,
|
||||||
|
QUANTITY_COLUMN, MIN_QUANTITY, MAX_QUANTITY
|
||||||
|
)
|
||||||
|
|
||||||
|
def generate_data_quality_report(df):
    """
    Generate comprehensive data quality report

    Sections cover dataset size, missing values, duplicate rows, outliers
    (revenue/quantity), dtype breakdown, date coverage, revenue summary
    statistics, and a prioritized list of detected issues.

    Args:
        df: DataFrame to analyze

    Returns:
        dict: Dictionary containing data quality metrics under the keys
            'overview', 'missing_values', 'duplicates', 'outliers',
            'data_types', 'date_coverage', 'revenue_summary', 'issues'
    """
    report = {
        'overview': _dq_overview(df),
        'missing_values': _dq_missing_values(df),
        'duplicates': _dq_duplicates(df),
        'outliers': _dq_outliers(df),
        'data_types': _dq_data_types(df),
        'date_coverage': _dq_date_coverage(df) if DATE_COLUMN in df.columns else {},
        'revenue_summary': _dq_revenue_summary(df) if REVENUE_COLUMN in df.columns else {},
    }
    report['issues'] = _dq_detect_issues(df, report)
    return report


def _dq_overview(df):
    """Basic size and deep-memory footprint stats."""
    return {
        'total_rows': len(df),
        'total_columns': len(df.columns),
        'memory_usage_mb': df.memory_usage(deep=True).sum() / 1024**2
    }


def _dq_missing_values(df):
    """Missing-value counts and percentages for columns that have any."""
    missing = df.isnull().sum()
    missing_pct = (missing / len(df)) * 100
    has_missing = missing[missing > 0]
    return {
        'by_column': has_missing.to_dict(),
        'percentages': missing_pct[missing > 0].to_dict(),
        'total_missing': missing.sum(),
        'columns_with_missing': len(has_missing)
    }


def _dq_duplicates(df):
    """Fully-duplicated row count and its share of the dataset."""
    duplicate_rows = df.duplicated().sum()
    return {
        'duplicate_rows': int(duplicate_rows),
        'duplicate_percentage': (duplicate_rows / len(df)) * 100 if len(df) > 0 else 0
    }


def _dq_iqr_bounds(series):
    """Tukey fences: (Q1 - 1.5*IQR, Q3 + 1.5*IQR)."""
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    iqr = q3 - q1
    return q1 - 1.5 * iqr, q3 + 1.5 * iqr


def _dq_outliers(df):
    """Outlier stats for revenue (IQR) and quantity (config bounds or IQR)."""
    outliers = {}

    if REVENUE_COLUMN in df.columns:
        revenue = pd.to_numeric(df[REVENUE_COLUMN], errors='coerce')
        lower_bound, upper_bound = _dq_iqr_bounds(revenue)
        revenue_outliers = ((revenue < lower_bound) | (revenue > upper_bound)).sum()
        outliers['revenue'] = {
            'count': int(revenue_outliers),
            'percentage': (revenue_outliers / len(df)) * 100 if len(df) > 0 else 0,
            'lower_bound': float(lower_bound),
            'upper_bound': float(upper_bound),
            # Negative revenue values are reported separately so callers can
            # spot credits/returns or sign errors.
            'negative_values': int((revenue < 0).sum())
        }

    if QUANTITY_COLUMN in df.columns:
        quantity = pd.to_numeric(df[QUANTITY_COLUMN], errors='coerce')
        if MIN_QUANTITY is not None and MAX_QUANTITY is not None:
            # Explicit config thresholds take precedence over IQR fences.
            quantity_outliers = ((quantity < MIN_QUANTITY) | (quantity > MAX_QUANTITY)).sum()
            outliers['quantity'] = {
                'count': int(quantity_outliers),
                'percentage': (quantity_outliers / len(df)) * 100 if len(df) > 0 else 0,
                'below_min': int((quantity < MIN_QUANTITY).sum()),
                'above_max': int((quantity > MAX_QUANTITY).sum())
            }
        else:
            lower_bound, upper_bound = _dq_iqr_bounds(quantity)
            quantity_outliers = ((quantity < lower_bound) | (quantity > upper_bound)).sum()
            outliers['quantity'] = {
                'count': int(quantity_outliers),
                'percentage': (quantity_outliers / len(df)) * 100 if len(df) > 0 else 0,
                'lower_bound': float(lower_bound),
                'upper_bound': float(upper_bound)
            }

    return outliers


def _dq_data_types(df):
    """Column names grouped by broad dtype family, plus a dtype histogram."""
    return {
        'numeric_columns': list(df.select_dtypes(include=[np.number]).columns),
        'datetime_columns': list(df.select_dtypes(include=['datetime64']).columns),
        'object_columns': list(df.select_dtypes(include=['object']).columns),
        'type_summary': df.dtypes.value_counts().to_dict()
    }


def _dq_date_coverage(df):
    """Share of rows with a non-null date plus the observed date range."""
    date_coverage = df[DATE_COLUMN].notna().sum()
    return {
        'total_rows': len(df),
        'rows_with_dates': int(date_coverage),
        'coverage_percentage': (date_coverage / len(df)) * 100 if len(df) > 0 else 0,
        'min_date': str(df[DATE_COLUMN].min()) if date_coverage > 0 else None,
        'max_date': str(df[DATE_COLUMN].max()) if date_coverage > 0 else None
    }


def _dq_revenue_summary(df):
    """Descriptive stats for numeric revenue; empty dict if none is valid."""
    revenue = pd.to_numeric(df[REVENUE_COLUMN], errors='coerce')
    valid_revenue = revenue.dropna()
    if len(valid_revenue) == 0:
        return {}
    return {
        'total_revenue': float(valid_revenue.sum()),
        'mean_revenue': float(valid_revenue.mean()),
        'median_revenue': float(valid_revenue.median()),
        'min_revenue': float(valid_revenue.min()),
        'max_revenue': float(valid_revenue.max()),
        'std_revenue': float(valid_revenue.std()),
        'valid_rows': int(len(valid_revenue)),
        'invalid_rows': int(len(df) - len(valid_revenue))
    }


def _dq_detect_issues(df, report):
    """Derive critical/warning issues from the assembled report sections."""
    issues = []

    # Critical: columns that are mostly empty.
    if report['missing_values']['columns_with_missing'] > 0:
        high_missing = {k: v for k, v in report['missing_values']['percentages'].items() if v > 50}
        if high_missing:
            issues.append({
                'severity': 'critical',
                'issue': f"Columns with >50% missing values: {list(high_missing.keys())}",
                'impact': 'High'
            })

    # Critical: too few rows carry a usable date.
    if DATE_COLUMN in df.columns:
        if report['date_coverage']['coverage_percentage'] < 50:
            issues.append({
                'severity': 'critical',
                'issue': f"Date coverage is only {report['date_coverage']['coverage_percentage']:.1f}%",
                'impact': 'High - analyses may fail'
            })

    # Critical: more than 10% of rows have non-numeric revenue.
    if REVENUE_COLUMN in df.columns:
        if report['revenue_summary'].get('invalid_rows', 0) > len(df) * 0.1:
            issues.append({
                'severity': 'critical',
                'issue': f"{report['revenue_summary']['invalid_rows']} rows have invalid revenue values",
                'impact': 'High'
            })

    # Warning: noticeable share of fully duplicated rows.
    if report['duplicates']['duplicate_percentage'] > 5:
        issues.append({
            'severity': 'warning',
            'issue': f"{report['duplicates']['duplicate_rows']} duplicate rows ({report['duplicates']['duplicate_percentage']:.1f}%)",
            'impact': 'Medium'
        })

    # Warning: unusually heavy revenue outlier share.
    outliers = report['outliers']
    if 'revenue' in outliers:
        if outliers['revenue']['percentage'] > 10:
            issues.append({
                'severity': 'warning',
                'issue': f"{outliers['revenue']['count']} revenue outliers ({outliers['revenue']['percentage']:.1f}%)",
                'impact': 'Medium'
            })

    return issues
|
||||||
|
|
||||||
|
def print_data_quality_report(report):
    """
    Print a formatted, sectioned data quality report to stdout.

    Args:
        report: Dictionary from generate_data_quality_report()
    """
    bar = "=" * 70
    rule = "-" * 70

    print("\n" + bar)
    print("DATA QUALITY REPORT")
    print(bar)

    # --- Overview ----------------------------------------------------------
    overview = report['overview']
    print("\n📊 OVERVIEW")
    print(rule)
    print(f"Total Rows: {overview['total_rows']:,}")
    print(f"Total Columns: {overview['total_columns']}")
    print(f"Memory Usage: {overview['memory_usage_mb']:.2f} MB")

    # --- Missing values ----------------------------------------------------
    missing = report['missing_values']
    print("\n🔍 MISSING VALUES")
    print(rule)
    if missing['columns_with_missing'] > 0:
        print(f"Columns with missing values: {missing['columns_with_missing']}")
        print(f"Total missing values: {missing['total_missing']:,}")
        print("\nTop columns by missing values:")
        worst = sorted(missing['percentages'].items(), key=lambda item: item[1], reverse=True)[:10]
        for col, pct in worst:
            count = missing['by_column'][col]
            print(f"  {col:30s}: {count:8,} ({pct:5.1f}%)")
    else:
        print("✅ No missing values found")

    # --- Duplicates --------------------------------------------------------
    dups = report['duplicates']
    print("\n🔄 DUPLICATES")
    print(rule)
    if dups['duplicate_rows'] > 0:
        print(f"Duplicate Rows: {dups['duplicate_rows']:,} ({dups['duplicate_percentage']:.2f}%)")
    else:
        print("✅ No duplicate rows found")

    # --- Outliers ----------------------------------------------------------
    outliers = report['outliers']
    print("\n📈 OUTLIERS")
    print(rule)
    if 'revenue' in outliers:
        rev_out = outliers['revenue']
        print(f"Revenue Outliers: {rev_out['count']:,} ({rev_out['percentage']:.2f}%)")
        if rev_out.get('negative_values', 0) > 0:
            print(f"  Negative Revenue Values: {rev_out['negative_values']:,}")
    if 'quantity' in outliers:
        qty_out = outliers['quantity']
        print(f"Quantity Outliers: {qty_out['count']:,} ({qty_out['percentage']:.2f}%)")
    if not outliers:
        print("✅ No significant outliers detected")

    # --- Date coverage (only present when a date column exists) ------------
    coverage = report['date_coverage']
    if coverage:
        print("\n📅 DATE COVERAGE")
        print(rule)
        print(f"Rows with Dates: {coverage['rows_with_dates']:,} / {coverage['total_rows']:,} ({coverage['coverage_percentage']:.1f}%)")
        if coverage['min_date']:
            print(f"Date Range: {coverage['min_date']} to {coverage['max_date']}")

    # --- Revenue summary ---------------------------------------------------
    revenue = report['revenue_summary']
    if revenue:
        print("\n💰 REVENUE SUMMARY")
        print(rule)
        print(f"Total Revenue: ${revenue['total_revenue'] / 1e6:.2f}m")
        print(f"Valid Rows: {revenue['valid_rows']:,} / {revenue['valid_rows'] + revenue['invalid_rows']:,}")
        if revenue['invalid_rows'] > 0:
            print(f"Invalid Rows: {revenue['invalid_rows']:,}")
        print(f"Mean: ${revenue['mean_revenue']:,.2f}")
        print(f"Median: ${revenue['median_revenue']:,.2f}")
        print(f"Min: ${revenue['min_revenue']:,.2f}")
        print(f"Max: ${revenue['max_revenue']:,.2f}")

    # --- Issues, grouped by severity ---------------------------------------
    if report['issues']:
        print("\n⚠️ ISSUES DETECTED")
        print(rule)
        critical = [i for i in report['issues'] if i['severity'] == 'critical']
        warnings = [i for i in report['issues'] if i['severity'] == 'warning']

        if critical:
            print("❌ CRITICAL ISSUES:")
            for issue in critical:
                print(f"  • {issue['issue']}")
                print(f"    Impact: {issue['impact']}")

        if warnings:
            print("\n⚠️ WARNINGS:")
            for issue in warnings:
                print(f"  • {issue['issue']}")
                print(f"    Impact: {issue['impact']}")
    else:
        print("\n✅ NO ISSUES DETECTED")

    print("\n" + bar)
|
||||||
|
|
||||||
|
def generate_data_quality_report_simple(df):
    """
    Generate a simple data quality summary (quick check)

    Args:
        df: DataFrame to analyze

    Returns:
        str: Simple pipe-separated summary string, e.g.
            "Rows: 1,000 | Columns: 5 | Valid Revenue: 990 (99.0%)"
    """
    total = len(df)
    summary_parts = [f"Rows: {total:,}", f"Columns: {len(df.columns)}"]

    if REVENUE_COLUMN in df.columns:
        revenue = pd.to_numeric(df[REVENUE_COLUMN], errors='coerce')
        valid = revenue.notna().sum()
        # Guard against division by zero on an empty frame, consistent with
        # the `if len(df) > 0 else 0` guards used elsewhere in this module.
        pct = (valid / total * 100) if total > 0 else 0
        summary_parts.append(f"Valid Revenue: {valid:,} ({pct:.1f}%)")

    if DATE_COLUMN in df.columns:
        date_coverage = df[DATE_COLUMN].notna().sum()
        pct = (date_coverage / total * 100) if total > 0 else 0
        summary_parts.append(f"Date Coverage: {date_coverage:,} ({pct:.1f}%)")

    return " | ".join(summary_parts)
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# STANDALONE DATA QUALITY CHECK
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Standalone entry point: load the configured dataset and print a full
    # quality report. Any failure is reported rather than raised.
    from data_loader import load_sales_data
    from config import get_data_path

    print("Loading data for quality check...")
    try:
        sales_df = load_sales_data(get_data_path())
        print_data_quality_report(generate_data_quality_report(sales_df))
    except Exception as e:
        print(f"ERROR: {e}")
|
||||||
134
examples/annual_revenue_trend.py
Normal file
134
examples/annual_revenue_trend.py
Normal file
@@ -0,0 +1,134 @@
|
|||||||
|
"""
|
||||||
|
Example: Annual Revenue Trend Analysis
|
||||||
|
Simple example showing annual revenue with LTM support
|
||||||
|
|
||||||
|
This is a working example that demonstrates:
|
||||||
|
- Loading data using data_loader
|
||||||
|
- Calculating annual metrics with LTM
|
||||||
|
- Creating a revenue trend chart
|
||||||
|
- Following template best practices
|
||||||
|
"""
|
||||||
|
import pandas as pd
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Import utilities
|
||||||
|
from data_loader import load_sales_data, validate_data_structure
|
||||||
|
from validate_revenue import validate_revenue
|
||||||
|
from analysis_utils import (
|
||||||
|
get_ltm_period_config, calculate_annual_metrics,
|
||||||
|
setup_revenue_chart, save_chart,
|
||||||
|
format_currency, print_annual_summary, sort_mixed_years,
|
||||||
|
apply_exclusion_filters
|
||||||
|
)
|
||||||
|
from config import (
|
||||||
|
OUTPUT_DIR, ANALYSIS_YEARS, MAX_DATE,
|
||||||
|
CHART_SIZES, ensure_directories, get_data_path, COMPANY_NAME,
|
||||||
|
REVENUE_COLUMN, MIN_YEAR, DATE_COLUMN
|
||||||
|
)
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# CONFIGURATION
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
# Metadata for this example analysis; ANALYSIS_NAME is printed in the
# console banner by main().
ANALYSIS_NAME = "Annual Revenue Trend"
DESCRIPTION = "Simple annual revenue trend analysis with LTM support"
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# MAIN ANALYSIS FUNCTION
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
def main():
    """Main analysis function"""

    banner = '=' * 60
    print(f"\n{banner}")
    print(f"{ANALYSIS_NAME}")
    print(f"{banner}\n")

    # 1. Load the transaction data.
    print("Loading data...")
    try:
        sales = load_sales_data(get_data_path())
        print(f"Loaded {len(sales):,} transactions")
    except Exception as e:
        print(f"ERROR loading data: {e}")
        return

    # 2. Check the required structure before analyzing.
    is_valid, msg = validate_data_structure(sales)
    if not is_valid:
        print(f"ERROR: {msg}")
        return
    print("Data validation passed")

    # 3. Drop configured exclusions.
    sales = apply_exclusion_filters(sales)

    # 4. Restrict to the configured analysis window.
    sales = sales[sales['Year'] >= MIN_YEAR]
    if DATE_COLUMN in sales.columns:
        sales = sales[sales[DATE_COLUMN] <= MAX_DATE]

    # 5. Resolve the LTM window, if enabled in config.
    ltm_start, ltm_end = get_ltm_period_config()
    if ltm_start and ltm_end:
        print(f"LTM period: {ltm_start} to {ltm_end}")

    # 6. Aggregate revenue by year (plus LTM if configured).
    print("\nCalculating annual metrics...")

    def revenue_for_year(year_data):
        """Per-year metric: total revenue."""
        return {'Revenue': year_data[REVENUE_COLUMN].sum()}

    annual = calculate_annual_metrics(sales, revenue_for_year, ltm_start, ltm_end)

    # 7. Console summary table.
    print_annual_summary(annual, 'Revenue', 'Revenue')

    # 8. Line chart of annual revenue (plotted in $ millions).
    print("Generating chart...")
    ensure_directories()

    fig, ax = plt.subplots(figsize=CHART_SIZES['medium'])

    # Sort year labels safely even when they mix ints and strings (LTM).
    ordered = sort_mixed_years(annual.reset_index(), 'Year')
    year_labels = ordered['Year'].tolist()
    revenue_m = ordered['Revenue'].values / 1e6

    positions = range(len(year_labels))
    ax.plot(positions, revenue_m, marker='o', linewidth=2, markersize=8, color='#2E86AB')
    ax.set_xticks(positions)
    ax.set_xticklabels(year_labels, rotation=45, ha='right')
    setup_revenue_chart(ax)

    # Append the LTM label to the title when an LTM window is active.
    title = f'Annual Revenue Trend - {COMPANY_NAME}'
    if ltm_start and ltm_end:
        from config import get_ltm_label
        ltm_label = get_ltm_label()
        if ltm_label:
            title += f'\n({ltm_label})'
    ax.set_title(title, fontsize=14, fontweight='bold')

    plt.tight_layout()
    save_chart(fig, 'annual_revenue_trend.png')
    plt.close()

    # 9. Cross-check plotted totals against the source data.
    print("\nValidating revenue...")
    validate_revenue(sales, ANALYSIS_NAME)

    print(f"\n{ANALYSIS_NAME} complete!")
    print(f"Chart saved to: {OUTPUT_DIR}")
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# RUN ANALYSIS
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
# Script entry point.
if __name__ == "__main__":
    main()
|
||||||
218
examples/cohort_analysis.py
Normal file
218
examples/cohort_analysis.py
Normal file
@@ -0,0 +1,218 @@
|
|||||||
|
"""
|
||||||
|
Example: Cohort Analysis
|
||||||
|
Advanced example showing customer cohort retention analysis
|
||||||
|
|
||||||
|
This demonstrates:
|
||||||
|
- Cohort-based analysis
|
||||||
|
- Retention rate calculations
|
||||||
|
- Revenue retention metrics
|
||||||
|
- Advanced visualization
|
||||||
|
"""
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
import seaborn as sns
|
||||||
|
from pathlib import Path
|
||||||
|
from operator import attrgetter
|
||||||
|
|
||||||
|
# Import utilities
|
||||||
|
from data_loader import load_sales_data, validate_data_structure
|
||||||
|
from validate_revenue import validate_revenue
|
||||||
|
from analysis_utils import (
|
||||||
|
get_ltm_period_config, apply_exclusion_filters,
|
||||||
|
setup_revenue_chart, save_chart, format_currency
|
||||||
|
)
|
||||||
|
from config import (
|
||||||
|
OUTPUT_DIR, MAX_DATE, CHART_SIZES, ensure_directories,
|
||||||
|
get_data_path, COMPANY_NAME, REVENUE_COLUMN, CUSTOMER_COLUMN,
|
||||||
|
DATE_COLUMN, MIN_YEAR
|
||||||
|
)
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# CONFIGURATION
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
# Metadata for this example analysis; ANALYSIS_NAME is printed in the
# console banner by main().
ANALYSIS_NAME = "Cohort Analysis"
DESCRIPTION = "Customer cohort retention and revenue analysis"
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# COHORT ANALYSIS FUNCTIONS
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
def create_cohorts(df):
    """
    Create customer cohorts based on first purchase date

    Each customer is assigned to the year-month cohort of their first
    purchase; every transaction is then tagged with the number of months
    elapsed since that cohort month.

    Args:
        df: DataFrame with customer and date columns

    Returns:
        DataFrame: Original DataFrame with 'Cohort' and 'CohortPeriod' columns
    """
    # CUSTOMER_COLUMN / DATE_COLUMN come from the module-level config import;
    # the previous redundant function-local import was removed.

    # First purchase date per customer defines the cohort.
    first_purchase = df.groupby(CUSTOMER_COLUMN)[DATE_COLUMN].min().reset_index()
    first_purchase.columns = [CUSTOMER_COLUMN, 'FirstPurchaseDate']

    # Cohort label is the calendar month of the first purchase.
    first_purchase['Cohort'] = first_purchase['FirstPurchaseDate'].dt.to_period('M')

    # Tag every transaction with its customer's cohort.
    df_with_cohort = df.merge(first_purchase[[CUSTOMER_COLUMN, 'Cohort']], on=CUSTOMER_COLUMN)

    # Months elapsed between the transaction month and the cohort month
    # (.n extracts the integer month offset from the Period difference).
    df_with_cohort['Period'] = df_with_cohort[DATE_COLUMN].dt.to_period('M')
    df_with_cohort['CohortPeriod'] = (df_with_cohort['Period'] - df_with_cohort['Cohort']).apply(attrgetter('n'))

    return df_with_cohort
|
||||||
|
|
||||||
|
def calculate_cohort_metrics(df_with_cohort):
    """
    Calculate cohort retention metrics

    For every cohort, computes per-period unique customer counts and
    revenue, then expresses both as a percentage of the cohort's period-0
    (acquisition month) values.

    Args:
        df_with_cohort: DataFrame with Cohort and CohortPeriod columns
            (as produced by create_cohorts)

    Returns:
        DataFrame: Cohort metrics by period with columns
            ['Cohort', 'Period', 'Customers', 'Revenue',
             'Retention_Rate', 'Revenue_Retention']
    """
    # Aggregate unique customers and revenue per cohort-period.
    # (Uses the module-level config imports; the previous version re-imported
    # them locally and also computed an unused cohort_size series — both
    # removed.)
    cohort_revenue = df_with_cohort.groupby(['Cohort', 'CohortPeriod']).agg({
        CUSTOMER_COLUMN: 'nunique',
        REVENUE_COLUMN: 'sum'
    }).reset_index()
    cohort_revenue.columns = ['Cohort', 'Period', 'Customers', 'Revenue']

    # Normalize each cohort against its own period-0 values.
    cohort_retention = []
    for cohort in cohort_revenue['Cohort'].unique():
        cohort_data = cohort_revenue[cohort_revenue['Cohort'] == cohort].copy()
        # Period 0 always exists: a customer's first purchase lands there.
        base = cohort_data[cohort_data['Period'] == 0]
        initial_customers = base['Customers'].values[0]
        initial_revenue = base['Revenue'].values[0]

        cohort_data['Retention_Rate'] = (cohort_data['Customers'] / initial_customers) * 100
        cohort_data['Revenue_Retention'] = cohort_data['Revenue'] / initial_revenue * 100

        cohort_retention.append(cohort_data)

    return pd.concat(cohort_retention, ignore_index=True)
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# MAIN ANALYSIS FUNCTION
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
def main():
    """Main analysis function"""

    banner = '=' * 60
    print(f"\n{banner}")
    print(f"{ANALYSIS_NAME}")
    print(f"{banner}\n")

    # 1. Load the transaction data.
    print("Loading data...")
    try:
        sales = load_sales_data(get_data_path())
        print(f"Loaded {len(sales):,} transactions")
    except Exception as e:
        print(f"ERROR loading data: {e}")
        return

    # 2. Structural validation, plus the customer column this analysis needs.
    is_valid, msg = validate_data_structure(sales)
    if not is_valid:
        print(f"ERROR: {msg}")
        return

    if CUSTOMER_COLUMN not in sales.columns:
        print(f"ERROR: Customer column '{CUSTOMER_COLUMN}' not found")
        return

    # 3. Exclusions and the configured date window.
    sales = apply_exclusion_filters(sales)
    sales = sales[sales['Year'] >= MIN_YEAR]
    if DATE_COLUMN in sales.columns:
        sales = sales[sales[DATE_COLUMN] <= MAX_DATE]

    # 4. Assign each customer to a first-purchase cohort.
    print("\nCreating customer cohorts...")
    tagged = create_cohorts(sales)

    # 5. Per-cohort retention and revenue metrics.
    print("Calculating cohort metrics...")
    metrics = calculate_cohort_metrics(tagged)

    # 6. Console summary for the earliest five cohorts.
    print("\nCohort Summary:")
    print("-" * 60)
    for cohort in sorted(metrics['Cohort'].unique())[:5]:
        rows = metrics[metrics['Cohort'] == cohort]
        base = rows[rows['Period'] == 0]
        if len(base) > 0:
            initial_customers = base['Customers'].values[0]
            initial_revenue = base['Revenue'].values[0]
            print(f"\n{cohort}:")
            print(f"  Initial: {initial_customers:,} customers, {format_currency(initial_revenue)}")

            # One-year mark, when the cohort is old enough to have one.
            year_one = rows[rows['Period'] == 12]
            if len(year_one) > 0:
                retention = year_one['Retention_Rate'].values[0]
                revenue_ret = year_one['Revenue_Retention'].values[0]
                print(f"  Period 12: {retention:.1f}% customer retention, {revenue_ret:.1f}% revenue retention")

    # 7. Side-by-side retention heatmaps (customers, revenue).
    print("\nGenerating charts...")
    ensure_directories()

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=CHART_SIZES['wide'])

    retention_grid = metrics.pivot_table(
        index='Cohort',
        columns='Period',
        values='Retention_Rate',
        aggfunc='mean'
    )
    sns.heatmap(retention_grid, annot=True, fmt='.0f', cmap='YlOrRd', ax=ax1, cbar_kws={'label': 'Retention %'})
    ax1.set_title('Customer Retention by Cohort\n(Period 0 = 100%)', fontsize=12, fontweight='bold')
    ax1.set_xlabel('Months Since First Purchase')
    ax1.set_ylabel('Cohort')

    revenue_grid = metrics.pivot_table(
        index='Cohort',
        columns='Period',
        values='Revenue_Retention',
        aggfunc='mean'
    )
    sns.heatmap(revenue_grid, annot=True, fmt='.0f', cmap='YlGnBu', ax=ax2, cbar_kws={'label': 'Revenue Retention %'})
    ax2.set_title('Revenue Retention by Cohort\n(Period 0 = 100%)', fontsize=12, fontweight='bold')
    ax2.set_xlabel('Months Since First Purchase')
    ax2.set_ylabel('Cohort')

    plt.suptitle(f'Cohort Analysis - {COMPANY_NAME}', fontsize=14, fontweight='bold', y=1.02)
    plt.tight_layout()
    save_chart(fig, 'cohort_analysis.png')
    plt.close()

    # 8. Cross-check totals against the source data.
    print("\nValidating revenue...")
    validate_revenue(sales, ANALYSIS_NAME)

    print(f"\n{ANALYSIS_NAME} complete!")
    print(f"Charts saved to: {OUTPUT_DIR}")
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# RUN ANALYSIS
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
# Script entry point.
if __name__ == "__main__":
    main()
|
||||||
213
examples/customer_segmentation.py
Normal file
213
examples/customer_segmentation.py
Normal file
@@ -0,0 +1,213 @@
|
|||||||
|
"""
|
||||||
|
Example: Customer Segmentation (RFM) Analysis
|
||||||
|
Example showing customer segmentation using RFM methodology
|
||||||
|
|
||||||
|
This example demonstrates:
|
||||||
|
- Customer-level aggregation
|
||||||
|
- RFM segmentation (Recency, Frequency, Monetary)
|
||||||
|
- Segment analysis and visualization
|
||||||
|
"""
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Import utilities
|
||||||
|
from data_loader import load_sales_data, validate_data_structure
|
||||||
|
from validate_revenue import validate_revenue
|
||||||
|
from analysis_utils import (
|
||||||
|
get_ltm_period_config, apply_exclusion_filters,
|
||||||
|
setup_revenue_chart, save_chart, format_currency
|
||||||
|
)
|
||||||
|
from config import (
|
||||||
|
OUTPUT_DIR, MAX_DATE, CHART_SIZES, ensure_directories,
|
||||||
|
get_data_path, COMPANY_NAME, REVENUE_COLUMN, CUSTOMER_COLUMN,
|
||||||
|
DATE_COLUMN, MIN_YEAR
|
||||||
|
)
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# CONFIGURATION
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
# Metadata for this example analysis; ANALYSIS_NAME appears in the console
# banner printed by main().
ANALYSIS_NAME = "Customer Segmentation (RFM)"
DESCRIPTION = "Customer segmentation using RFM methodology"
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# RFM SEGMENTATION FUNCTIONS
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
def calculate_rfm_scores(df, analysis_date=None):
    """
    Calculate RFM scores for each customer

    Recency, Frequency, and Monetary value are each scored 1-5 via
    quintiles (5 = best), summed into an overall RFM score, and mapped
    to a named segment.

    Args:
        df: DataFrame with customer, date, and revenue columns
        analysis_date: Reference date for recency calculation (defaults to max date)

    Returns:
        DataFrame with RFM scores and segment assignment
    """
    reference_date = df[DATE_COLUMN].max() if analysis_date is None else analysis_date

    # Collapse transactions to one row per customer: last purchase date,
    # purchase count, and total revenue.
    per_customer = df.groupby(CUSTOMER_COLUMN).agg({
        DATE_COLUMN: ['max', 'count'],
        REVENUE_COLUMN: 'sum'
    }).reset_index()
    per_customer.columns = [CUSTOMER_COLUMN, 'LastPurchaseDate', 'Frequency', 'Monetary']

    # Days since the customer's most recent purchase.
    per_customer['Recency'] = (reference_date - per_customer['LastPurchaseDate']).dt.days

    def quintile_score(series, labels):
        # Rank first so qcut sees unique values. NOTE(review): with fewer
        # than five customers, qcut may still drop bins and then mismatch
        # the fixed label list — confirm minimum dataset size upstream.
        return pd.qcut(series.rank(method='first'), q=5, labels=labels, duplicates='drop').astype(int)

    # Low recency is good (recent buyers score 5); high F/M are good.
    per_customer['R_Score'] = quintile_score(per_customer['Recency'], [5, 4, 3, 2, 1])
    per_customer['F_Score'] = quintile_score(per_customer['Frequency'], [1, 2, 3, 4, 5])
    per_customer['M_Score'] = quintile_score(per_customer['Monetary'], [1, 2, 3, 4, 5])

    # Overall score is the simple sum of the three dimensions.
    per_customer['RFM_Score'] = (
        per_customer['R_Score'] + per_customer['F_Score'] + per_customer['M_Score']
    )

    def label_segment(row):
        # Rule order matters: the first matching rule wins.
        r, f, m = row['R_Score'], row['F_Score'], row['M_Score']
        if r >= 4 and f >= 4 and m >= 4:
            return 'Champions'
        if r >= 3 and f >= 3 and m >= 4:
            return 'Loyal Customers'
        if r >= 4 and f <= 2:
            return 'At Risk'
        if r <= 2:
            return 'Hibernating'
        if r >= 3 and f >= 3 and m <= 2:
            return 'Potential Loyalists'
        return 'Need Attention'

    per_customer['Segment'] = per_customer.apply(label_segment, axis=1)

    return per_customer
|
||||||
|
|
||||||
|
# ============================================================================
# MAIN ANALYSIS FUNCTION
# ============================================================================

def main():
    """Run the end-to-end customer segmentation (RFM) analysis.

    Pipeline: load -> validate -> filter -> RFM scoring -> segment summary
    -> charts -> revenue validation. Prints progress to stdout, writes
    charts to OUTPUT_DIR, and returns early (None) on load/validation
    failure.
    """

    print(f"\n{'='*60}")
    print(f"{ANALYSIS_NAME}")
    print(f"{'='*60}\n")

    # 1. Load data
    print("Loading data...")
    try:
        df = load_sales_data(get_data_path())
        print(f"Loaded {len(df):,} transactions")
    except Exception as e:
        print(f"ERROR loading data: {e}")
        return

    # 2. Validate data structure
    is_valid, msg = validate_data_structure(df)
    if not is_valid:
        print(f"ERROR: {msg}")
        return

    # RFM needs a customer identifier; bail out when it is missing
    if CUSTOMER_COLUMN not in df.columns:
        print(f"ERROR: Customer column '{CUSTOMER_COLUMN}' not found in data")
        return

    print("Data validation passed")

    # 3. Apply exclusion filters
    df = apply_exclusion_filters(df)

    # 4. Filter by date range (MIN_YEAR floor, MAX_DATE ceiling)
    df = df[df['Year'] >= MIN_YEAR]
    if DATE_COLUMN in df.columns:
        df = df[df[DATE_COLUMN] <= MAX_DATE]

    # 5. Calculate RFM scores
    print("\nCalculating RFM scores...")
    rfm_df = calculate_rfm_scores(df)

    # 6. Segment summary: customer count and total revenue per segment
    print("\nCustomer Segmentation Summary:")
    print("-" * 60)
    segment_summary = rfm_df.groupby('Segment').agg({
        CUSTOMER_COLUMN: 'count',
        'Monetary': 'sum'
    }).reset_index()
    segment_summary.columns = ['Segment', 'Customer Count', 'Total Revenue']
    segment_summary = segment_summary.sort_values('Total Revenue', ascending=False)

    for _, row in segment_summary.iterrows():
        pct_customers = (row['Customer Count'] / len(rfm_df)) * 100
        pct_revenue = (row['Total Revenue'] / rfm_df['Monetary'].sum()) * 100
        print(f"{row['Segment']:20s}: {row['Customer Count']:5d} customers ({pct_customers:5.1f}%), "
              f"{format_currency(row['Total Revenue'])} ({pct_revenue:5.1f}% of revenue)")

    # 7. Create visualizations
    print("\nGenerating charts...")
    ensure_directories()

    # Chart 1: Revenue by Segment (sorted ascending so the largest bar is on top)
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=CHART_SIZES['wide'])

    segment_summary_sorted = segment_summary.sort_values('Total Revenue', ascending=True)
    revenue_millions = segment_summary_sorted['Total Revenue'].values / 1e6

    ax1.barh(range(len(segment_summary_sorted)), revenue_millions, color='#2E86AB')
    ax1.set_yticks(range(len(segment_summary_sorted)))
    ax1.set_yticklabels(segment_summary_sorted['Segment'].values)
    ax1.set_xlabel('Revenue (Millions USD)')
    ax1.set_title('Revenue by Customer Segment', fontsize=12, fontweight='bold')
    setup_revenue_chart(ax1)
    ax1.set_ylabel('')

    # Chart 2: Customer Count by Segment (same order as Chart 1)
    customer_counts = segment_summary_sorted['Customer Count'].values
    ax2.barh(range(len(segment_summary_sorted)), customer_counts, color='#A23B72')
    ax2.set_yticks(range(len(segment_summary_sorted)))
    ax2.set_yticklabels(segment_summary_sorted['Segment'].values)
    ax2.set_xlabel('Number of Customers')
    ax2.set_title('Customer Count by Segment', fontsize=12, fontweight='bold')
    ax2.set_ylabel('')
    ax2.grid(True, alpha=0.3)

    plt.suptitle(f'Customer Segmentation Analysis - {COMPANY_NAME}',
                 fontsize=14, fontweight='bold', y=1.02)
    plt.tight_layout()
    save_chart(fig, 'customer_segmentation.png')
    plt.close()

    # 8. Validate revenue (sanity check against source totals)
    print("\nValidating revenue...")
    validate_revenue(df, ANALYSIS_NAME)

    print(f"\n{ANALYSIS_NAME} complete!")
    print(f"Charts saved to: {OUTPUT_DIR}")

# ============================================================================
# RUN ANALYSIS
# ============================================================================

if __name__ == "__main__":
    main()
|
||||||
203
examples/product_performance.py
Normal file
203
examples/product_performance.py
Normal file
@@ -0,0 +1,203 @@
|
|||||||
|
"""
|
||||||
|
Example: Product Performance Analysis
|
||||||
|
Example showing product mix and performance analysis
|
||||||
|
|
||||||
|
This example demonstrates:
|
||||||
|
- Product-level aggregation
|
||||||
|
- Product performance metrics
|
||||||
|
- Product mix visualization
|
||||||
|
"""
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Import utilities
|
||||||
|
from data_loader import load_sales_data, validate_data_structure
|
||||||
|
from validate_revenue import validate_revenue
|
||||||
|
from analysis_utils import (
|
||||||
|
get_ltm_period_config, calculate_annual_metrics,
|
||||||
|
apply_exclusion_filters, setup_revenue_chart, save_chart,
|
||||||
|
format_currency, sort_mixed_years
|
||||||
|
)
|
||||||
|
from config import (
|
||||||
|
OUTPUT_DIR, MAX_DATE, CHART_SIZES, ensure_directories,
|
||||||
|
get_data_path, COMPANY_NAME, REVENUE_COLUMN, ITEM_COLUMN,
|
||||||
|
DATE_COLUMN, MIN_YEAR, QUANTITY_COLUMN
|
||||||
|
)
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# CONFIGURATION
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
# Printed in console headers and passed to validate_revenue for labeling
ANALYSIS_NAME = "Product Performance Analysis"
# Short human-readable summary of what this example demonstrates
DESCRIPTION = "Product mix and performance analysis"
|
||||||
|
|
||||||
|
# ============================================================================
# MAIN ANALYSIS FUNCTION
# ============================================================================

def main():
    """Run the end-to-end product performance analysis.

    Pipeline: load -> validate -> filter -> rank products by revenue for the
    most recent period (LTM window when configured, else the latest year)
    -> charts -> revenue validation. Prints progress to stdout, writes
    charts to OUTPUT_DIR, and returns early (None) on load/validation
    failure.
    """
    print(f"\n{'='*60}")
    print(f"{ANALYSIS_NAME}")
    print(f"{'='*60}\n")

    # 1. Load data
    print("Loading data...")
    try:
        df = load_sales_data(get_data_path())
        print(f"Loaded {len(df):,} transactions")
    except Exception as e:
        print(f"ERROR loading data: {e}")
        return

    # 2. Validate data structure
    is_valid, msg = validate_data_structure(df)
    if not is_valid:
        print(f"ERROR: {msg}")
        return

    if ITEM_COLUMN not in df.columns:
        print(f"WARNING: Item column '{ITEM_COLUMN}' not found. Using transaction-level analysis.")
        # Create a dummy item column for demonstration
        df[ITEM_COLUMN] = 'All Products'

    print("Data validation passed")

    # 3. Apply exclusion filters
    df = apply_exclusion_filters(df)

    # 4. Filter by date range (MIN_YEAR floor, MAX_DATE ceiling)
    df = df[df['Year'] >= MIN_YEAR]
    if DATE_COLUMN in df.columns:
        df = df[df[DATE_COLUMN] <= MAX_DATE]

    # 5. Setup LTM period
    ltm_start, ltm_end = get_ltm_period_config()

    # 6. Product performance summary
    print("\nCalculating product performance...")

    # Most recent period: LTM window when configured, otherwise the latest year
    if ltm_start and ltm_end and 'YearMonth' in df.columns:
        recent_data = df[(df['YearMonth'] >= ltm_start) & (df['YearMonth'] <= ltm_end)]
        period_label = f"LTM {ltm_end}"
    else:
        recent_year = df['Year'].max()
        recent_data = df[df['Year'] == recent_year]
        period_label = str(recent_year)

    # Product-level metrics. Build the aggregation spec conditionally:
    # the previous `'sum' if QUANTITY_COLUMN in df.columns else 'count'`
    # still referenced the missing column inside agg() and raised KeyError.
    has_quantity = QUANTITY_COLUMN in df.columns
    agg_spec = {REVENUE_COLUMN: ['sum', 'count']}
    if has_quantity:
        agg_spec[QUANTITY_COLUMN] = 'sum'
    product_metrics = recent_data.groupby(ITEM_COLUMN).agg(agg_spec).reset_index()
    if has_quantity:
        product_metrics.columns = [ITEM_COLUMN, 'Revenue', 'Transaction_Count', 'Quantity']
    else:
        product_metrics.columns = [ITEM_COLUMN, 'Revenue', 'Transaction_Count']
        # Fall back to transaction count as a proxy when quantity is unavailable
        product_metrics['Quantity'] = product_metrics['Transaction_Count']

    # Average price per unit (per transaction when quantity is unavailable);
    # zero quantities become NaN to avoid division by zero
    if has_quantity:
        product_metrics['Avg_Price'] = product_metrics['Revenue'] / product_metrics['Quantity'].replace(0, np.nan)
    else:
        product_metrics['Avg_Price'] = product_metrics['Revenue'] / product_metrics['Transaction_Count']

    # Sort by revenue
    product_metrics = product_metrics.sort_values('Revenue', ascending=False)

    # Top products summary
    print(f"\nTop 10 Products by Revenue ({period_label}):")
    print("-" * 80)
    top_10 = product_metrics.head(10)
    total_revenue = product_metrics['Revenue'].sum()

    for idx, row in top_10.iterrows():
        pct = (row['Revenue'] / total_revenue) * 100
        print(f"{row[ITEM_COLUMN]:30s}: {format_currency(row['Revenue']):>12s} ({pct:5.1f}%)")

    # 7. Annual product trends (only meaningful with multiple years of data)
    if len(df['Year'].unique()) > 1:
        print("\nCalculating annual product trends...")

        def calculate_product_metrics(year_data):
            """Return the top-5 products by revenue for one year's data."""
            product_revenue = year_data.groupby(ITEM_COLUMN)[REVENUE_COLUMN].sum()
            top_5 = product_revenue.nlargest(5)
            return dict(top_5)

        annual_product_df = calculate_annual_metrics(df, calculate_product_metrics, ltm_start, ltm_end)

        # 8. Create visualizations
        print("\nGenerating charts...")
        ensure_directories()

        # Chart 1: Top Products Revenue (horizontal bars, long names truncated)
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=CHART_SIZES['wide'])

        top_10_revenue = top_10['Revenue'].values / 1e6
        top_10_names = top_10[ITEM_COLUMN].values

        ax1.barh(range(len(top_10)), top_10_revenue, color='#2E86AB')
        ax1.set_yticks(range(len(top_10)))
        ax1.set_yticklabels([name[:30] + '...' if len(name) > 30 else name for name in top_10_names])
        ax1.set_xlabel('Revenue (Millions USD)')
        ax1.set_title(f'Top 10 Products by Revenue\n({period_label})', fontsize=12, fontweight='bold')
        setup_revenue_chart(ax1)
        ax1.set_ylabel('')

        # Chart 2: Revenue Distribution pie; everything past the top 10 is 'Other'
        if len(product_metrics) > 10:
            other_revenue = product_metrics.iloc[10:]['Revenue'].sum()
            pie_data = list(top_10['Revenue'].values) + [other_revenue]
            pie_labels = list(top_10[ITEM_COLUMN].values) + ['Other']
        else:
            pie_data = product_metrics['Revenue'].values
            pie_labels = product_metrics[ITEM_COLUMN].values

        pie_data_millions = [x / 1e6 for x in pie_data]
        ax2.pie(pie_data_millions, labels=pie_labels, autopct='%1.1f%%', startangle=90)
        ax2.set_title('Revenue Distribution\n(Top Products)', fontsize=12, fontweight='bold')

        plt.suptitle(f'Product Performance Analysis - {COMPANY_NAME}',
                     fontsize=14, fontweight='bold', y=1.02)
        plt.tight_layout()
        save_chart(fig, 'product_performance.png')
        plt.close()
    else:
        # Single chart when only one year of data is available
        print("\nGenerating chart...")
        ensure_directories()

        fig, ax = plt.subplots(figsize=CHART_SIZES['medium'])

        top_10_revenue = top_10['Revenue'].values / 1e6
        top_10_names = top_10[ITEM_COLUMN].values

        ax.barh(range(len(top_10)), top_10_revenue, color='#2E86AB')
        ax.set_yticks(range(len(top_10)))
        ax.set_yticklabels([name[:40] + '...' if len(name) > 40 else name for name in top_10_names])
        ax.set_xlabel('Revenue (Millions USD)')
        ax.set_title(f'Top 10 Products by Revenue - {COMPANY_NAME}\n({period_label})',
                     fontsize=14, fontweight='bold')
        setup_revenue_chart(ax)
        ax.set_ylabel('')

        plt.tight_layout()
        save_chart(fig, 'product_performance.png')
        plt.close()

    # 9. Validate revenue (sanity check against source totals)
    print("\nValidating revenue...")
    validate_revenue(df, ANALYSIS_NAME)

    print(f"\n{ANALYSIS_NAME} complete!")
    print(f"Charts saved to: {OUTPUT_DIR}")

# ============================================================================
# RUN ANALYSIS
# ============================================================================

if __name__ == "__main__":
    main()
|
||||||
238
export_utils.py
Normal file
238
export_utils.py
Normal file
@@ -0,0 +1,238 @@
|
|||||||
|
"""
|
||||||
|
Export utilities for analysis results
|
||||||
|
Provides functions to export DataFrames and summary data to CSV and Excel
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
from export_utils import export_to_csv, export_to_excel, export_summary_table
|
||||||
|
|
||||||
|
# Export DataFrame to CSV
|
||||||
|
export_to_csv(df, 'results.csv')
|
||||||
|
|
||||||
|
# Export DataFrame to Excel
|
||||||
|
export_to_excel(df, 'results.xlsx', sheet_name='Data')
|
||||||
|
|
||||||
|
# Export summary table
|
||||||
|
export_summary_table({'Metric1': 100, 'Metric2': 200}, 'summary.xlsx')
|
||||||
|
"""
|
||||||
|
import pandas as pd
|
||||||
|
from pathlib import Path
|
||||||
|
from config import REPORTS_DIR, ensure_directories
|
||||||
|
|
||||||
|
def export_to_csv(df, filename, output_dir=None, index=True):
    """Write *df* to a CSV file and return the destination path.

    Args:
        df: DataFrame to export.
        filename: Output filename (e.g., 'results.csv').
        output_dir: Destination directory; falls back to config.REPORTS_DIR.
        index: Include the DataFrame index in the output (default: True).

    Returns:
        Path to the exported file.
    """
    target_dir = REPORTS_DIR if output_dir is None else Path(output_dir)

    ensure_directories()
    target_dir.mkdir(exist_ok=True)

    destination = target_dir / filename

    # utf-8-sig writes a BOM so Excel opens the file with the right encoding
    df.to_csv(destination, index=index, encoding='utf-8-sig')
    print(f"Exported to CSV: {destination}")

    return destination
|
||||||
|
|
||||||
|
def export_to_excel(df, filename, sheet_name='Data', output_dir=None, index=True):
    """
    Export DataFrame to Excel with auto-adjusted column widths.

    Args:
        df: DataFrame to export
        filename: Output filename (e.g., 'results.xlsx')
        sheet_name: Excel sheet name (default: 'Data')
        output_dir: Output directory (defaults to config.REPORTS_DIR)
        index: Whether to include index in export (default: True)

    Returns:
        Path to exported file

    Raises:
        ImportError: If openpyxl is not installed
    """
    try:
        import openpyxl
    except ImportError:
        raise ImportError(
            "openpyxl is required for Excel export. Install with: pip install openpyxl"
        )

    if output_dir is None:
        output_dir = REPORTS_DIR
    else:
        output_dir = Path(output_dir)

    ensure_directories()
    output_dir.mkdir(exist_ok=True)

    filepath = output_dir / filename

    # Create Excel writer
    with pd.ExcelWriter(filepath, engine='openpyxl') as writer:
        df.to_excel(writer, sheet_name=sheet_name, index=index)

        # Auto-adjust column widths. When the index is written it occupies
        # the leading column(s), shifting every data column right -- the
        # previous `chr(64 + idx)` ignored that offset and also produced
        # invalid letters past column 'Z' (> 26 columns).
        worksheet = writer.sheets[sheet_name]
        col_offset = df.index.nlevels if index else 0
        for pos, col in enumerate(df.columns, start=1 + col_offset):
            max_length = max(
                df[col].astype(str).map(len).max(),
                len(str(col))
            )
            # Cap at 50 characters for readability
            adjusted_width = min(max_length + 2, 50)
            col_letter = openpyxl.utils.get_column_letter(pos)
            worksheet.column_dimensions[col_letter].width = adjusted_width

    print(f"Exported to Excel: {filepath}")

    return filepath
|
||||||
|
|
||||||
|
def export_summary_table(data_dict, filename, output_dir=None, title=None):
    """
    Export summary statistics to a formatted table (Excel).

    Args:
        data_dict: Dictionary of {metric_name: value} pairs
        filename: Output filename (e.g., 'summary.xlsx')
        output_dir: Output directory (defaults to config.REPORTS_DIR)
        title: Optional title merged across the top row of the sheet

    Returns:
        Path to exported file

    Raises:
        ImportError: If openpyxl is not installed

    Example:
        export_summary_table({
            'Total Revenue': 1000000,
            'Customer Count': 500,
            'Average Order Value': 2000
        }, 'summary.xlsx')
    """
    try:
        import openpyxl
    except ImportError:
        raise ImportError(
            "openpyxl is required for Excel export. Install with: pip install openpyxl"
        )

    if output_dir is None:
        output_dir = REPORTS_DIR
    else:
        output_dir = Path(output_dir)

    ensure_directories()
    output_dir.mkdir(exist_ok=True)

    filepath = output_dir / filename

    # Two-column frame: metric name + raw value (column order becomes A/B)
    df = pd.DataFrame({
        'Metric': list(data_dict.keys()),
        'Value': list(data_dict.values())
    })

    # Human-readable value column: $x.xxm / $x.xxk / $x.xx.
    # NOTE(review): every numeric value is rendered as currency, including
    # non-monetary metrics such as counts -- confirm this is intended.
    def format_value(val):
        if isinstance(val, (int, float)):
            if abs(val) >= 1e6:
                return f"${val / 1e6:.2f}m"
            elif abs(val) >= 1e3:
                return f"${val / 1e3:.2f}k"
            else:
                return f"${val:.2f}"
        return str(val)

    df['Formatted_Value'] = df['Value'].apply(format_value)

    # Create Excel writer and lightly format the workbook
    with pd.ExcelWriter(filepath, engine='openpyxl') as writer:
        df.to_excel(writer, sheet_name='Summary', index=False)

        # Format worksheet
        worksheet = writer.sheets['Summary']

        # Fixed widths for the three columns: Metric / Value / Formatted_Value
        worksheet.column_dimensions['A'].width = 30
        worksheet.column_dimensions['B'].width = 20
        worksheet.column_dimensions['C'].width = 20

        # Optional bold, centered title merged across the (inserted) top row
        if title:
            worksheet.insert_rows(1)
            worksheet.merge_cells('A1:C1')
            worksheet['A1'] = title
            worksheet['A1'].font = openpyxl.styles.Font(bold=True, size=14)
            worksheet['A1'].alignment = openpyxl.styles.Alignment(horizontal='center')

    print(f"Exported summary table to Excel: {filepath}")

    return filepath
|
||||||
|
|
||||||
|
def export_multiple_sheets(data_dict, filename, output_dir=None):
    """Write several DataFrames into one Excel workbook, one sheet each.

    Args:
        data_dict: Dictionary of {sheet_name: DataFrame} pairs.
        filename: Output filename (e.g., 'results.xlsx').
        output_dir: Destination directory; falls back to config.REPORTS_DIR.

    Returns:
        Path to the exported workbook.

    Raises:
        ImportError: If openpyxl is not installed.

    Example:
        export_multiple_sheets({
            'Revenue': revenue_df,
            'Customers': customer_df,
            'Products': product_df
        }, 'analysis_results.xlsx')
    """
    try:
        import openpyxl
    except ImportError:
        raise ImportError(
            "openpyxl is required for Excel export. Install with: pip install openpyxl"
        )

    target_dir = REPORTS_DIR if output_dir is None else Path(output_dir)

    ensure_directories()
    target_dir.mkdir(exist_ok=True)

    destination = target_dir / filename

    with pd.ExcelWriter(destination, engine='openpyxl') as writer:
        for raw_name, frame in data_dict.items():
            # Excel caps sheet names at 31 characters
            sheet = raw_name[:31]
            frame.to_excel(writer, sheet_name=sheet, index=True)

            # Size each column to its widest cell or header, capped at 50
            ws = writer.sheets[sheet]
            for position, column in enumerate(frame.columns, 1):
                widest_cell = frame[column].astype(str).map(len).max()
                header_width = len(str(column))
                width = min(max(widest_cell, header_width) + 2, 50)
                letter = openpyxl.utils.get_column_letter(position)
                ws.column_dimensions[letter].width = width

    print(f"Exported {len(data_dict)} sheets to Excel: {destination}")

    return destination
|
||||||
184
generate_sample_data.py
Normal file
184
generate_sample_data.py
Normal file
@@ -0,0 +1,184 @@
|
|||||||
|
"""
|
||||||
|
Sample data generator for testing and demonstrations
|
||||||
|
Generates realistic sample sales data
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python generate_sample_data.py
|
||||||
|
|
||||||
|
# Or import and use programmatically:
|
||||||
|
from generate_sample_data import generate_sample_sales_data
|
||||||
|
df = generate_sample_sales_data(num_customers=100, num_products=50, years=[2021, 2022, 2023])
|
||||||
|
"""
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
from pathlib import Path
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
import random
|
||||||
|
|
||||||
|
def generate_sample_sales_data(
    num_customers=100,
    num_products=50,
    years=(2021, 2022, 2023, 2024, 2025),
    transactions_per_month=500,
    output_file='sample_sales_data.csv'
):
    """
    Generate realistic sample sales data and write it to CSV.

    Args:
        num_customers: Number of unique customers
        num_products: Number of unique products
        years: Iterable of years to generate data for (a tuple default
            replaces the previous mutable list default)
        transactions_per_month: Average transactions per month
        output_file: Output CSV filename

    Returns:
        DataFrame: Generated sales data (also written to *output_file*)
    """
    print(f"Generating sample sales data...")
    print(f"  Customers: {num_customers}")
    print(f"  Products: {num_products}")
    print(f"  Years: {years}")

    # Generate customer and product names
    customer_names = [f"Customer_{i:04d}" for i in range(1, num_customers + 1)]
    product_names = [f"Product_{i:04d}" for i in range(1, num_products + 1)]

    # Generate transactions
    transactions = []

    # Hoisted out of the loops: one consistent "now" cutoff for all months
    current_date = datetime.now()

    for year in years:
        for month in range(1, 13):
            # Skip future months
            if year > current_date.year or (year == current_date.year and month > current_date.month):
                continue

            # Transactions this month: normally distributed around the mean
            num_transactions = int(np.random.normal(transactions_per_month, transactions_per_month * 0.2))
            num_transactions = max(10, num_transactions)  # At least 10 transactions

            # Days in the month (Feb capped at 28; leap years deliberately ignored)
            if month == 2:
                max_day = 28
            elif month in [4, 6, 9, 11]:
                max_day = 30
            else:
                max_day = 31

            for _ in range(num_transactions):
                # Random date within month
                day = random.randint(1, max_day)
                invoice_date = datetime(year, month, day)

                # Random customer and product
                customer = random.choice(customer_names)
                product = random.choice(product_names)

                # Quantity: log-normal, so most transactions are small
                quantity = int(np.random.lognormal(mean=1.5, sigma=1.0))
                quantity = max(1, min(quantity, 100))  # clamp to [1, 100]

                # Revenue correlates with quantity via a random unit price
                base_price = np.random.lognormal(mean=5, sigma=1.5)
                revenue = base_price * quantity

                # Add some variation
                revenue *= np.random.uniform(0.8, 1.2)
                revenue = round(revenue, 2)

                transactions.append({
                    'InvoiceDate': invoice_date,
                    'Customer': customer,
                    'Item': product,
                    'Quantity': quantity,
                    'USD': revenue,
                    'Year': year,
                    'Month': month
                })

    # Create DataFrame, oldest transactions first
    df = pd.DataFrame(transactions)
    df = df.sort_values('InvoiceDate').reset_index(drop=True)

    # Blank out ~5% of dates to mimic a realistic data-quality issue
    missing_date_pct = 0.05
    num_missing = int(len(df) * missing_date_pct)
    missing_indices = np.random.choice(df.index, size=num_missing, replace=False)
    df.loc[missing_indices, 'InvoiceDate'] = pd.NaT

    # Save to CSV
    output_path = Path(output_file)
    df.to_csv(output_path, index=False)
    print(f"\n✅ Sample data generated: {output_path}")
    print(f"   Rows: {len(df):,}")
    print(f"   Date range: {df['InvoiceDate'].min()} to {df['InvoiceDate'].max()}")
    print(f"   Total revenue: ${df['USD'].sum() / 1e6:.2f}m")

    return df
|
||||||
|
|
||||||
|
def generate_sample_data_for_template():
    """
    Generate sample data whose columns match the template's expected
    structure, renaming columns to the names declared in config.py.
    """
    from config import (
        REVENUE_COLUMN, DATE_COLUMN, CUSTOMER_COLUMN, ITEM_COLUMN,
        QUANTITY_COLUMN, ANALYSIS_YEARS
    )

    print("Generating sample data for template...")

    frame = generate_sample_sales_data(
        num_customers=200,
        num_products=100,
        years=ANALYSIS_YEARS,
        transactions_per_month=1000,
        output_file='sample_sales_data.csv'
    )

    # Map generator column names onto the configured names (no-op when equal)
    column_mapping = {
        'USD': REVENUE_COLUMN,
        'InvoiceDate': DATE_COLUMN,
        'Customer': CUSTOMER_COLUMN,
        'Item': ITEM_COLUMN,
        'Quantity': QUANTITY_COLUMN
    }
    renames = {old: new for old, new in column_mapping.items()
               if old in frame.columns and old != new}
    frame = frame.rename(columns=renames)

    # Save with the final column names
    destination = Path('sample_sales_data.csv')
    frame.to_csv(destination, index=False)

    print(f"\n✅ Sample data saved to: {destination}")
    print(f"   Ready to use with sales_analysis_template")

    return frame
|
||||||
|
|
||||||
|
# ============================================================================
# MAIN
# ============================================================================

if __name__ == "__main__":
    # Generate sample data from the command line.
    # Usage: python generate_sample_data.py [num_customers] [num_products]
    # With no arguments, generates data matching the template's config.
    import sys

    if len(sys.argv) > 1:
        # Custom generation: argv[1] = customers, optional argv[2] = products.
        # (The previous `if len(sys.argv) > 1 else 100` guard inside this
        # branch was dead code -- the condition is already known true here.)
        num_customers = int(sys.argv[1])
        num_products = int(sys.argv[2]) if len(sys.argv) > 2 else 50
        generate_sample_sales_data(
            num_customers=num_customers,
            num_products=num_products
        )
    else:
        # Generate for template
        generate_sample_data_for_template()
|
||||||
197
logger_config.py
Normal file
197
logger_config.py
Normal file
@@ -0,0 +1,197 @@
|
|||||||
|
"""
|
||||||
|
Logging configuration for analysis scripts
|
||||||
|
Provides structured logging with file and console output
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
from logger_config import get_logger
|
||||||
|
|
||||||
|
logger = get_logger('my_analysis')
|
||||||
|
logger.info("Analysis started")
|
||||||
|
logger.warning("Low data quality detected")
|
||||||
|
logger.error("Failed to load data")
|
||||||
|
"""
|
||||||
|
import logging
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
from datetime import datetime
|
||||||
|
from config import COMPANY_NAME, OUTPUT_DIR
|
||||||
|
|
||||||
|
# Module-wide singleton logger, lazily created by get_logger()/setup_logging()
_logger = None
|
||||||
|
|
||||||
|
def setup_logging(log_level=logging.INFO, log_file=None, analysis_name=None):
    """
    Setup logging with a detailed file handler and a simpler console handler.

    Args:
        log_level: Logging level (DEBUG, INFO, WARNING, ERROR)
        log_file: Path to log file (defaults to logs/analysis_YYYYMMDD_HHMMSS.log)
        analysis_name: Name of analysis, used for the logger name and log file

    Returns:
        logging.Logger: Configured logger instance (also stored in the
        module-level _logger singleton)
    """
    global _logger

    # Create logs directory
    logs_dir = Path('logs')
    logs_dir.mkdir(exist_ok=True)

    # Default log file name: <analysis>_<timestamp>.log
    if log_file is None:
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        if analysis_name:
            safe_name = analysis_name.lower().replace(' ', '_').replace('/', '_')
            log_file = logs_dir / f"{safe_name}_{timestamp}.log"
        else:
            log_file = logs_dir / f"analysis_{timestamp}.log"
    else:
        log_file = Path(log_file)
        log_file.parent.mkdir(parents=True, exist_ok=True)

    # Create logger
    logger = logging.getLogger(analysis_name or 'analysis')
    logger.setLevel(log_level)

    # Close and remove existing handlers: re-assigning `handlers = []`
    # (the previous approach) drops them without closing, leaking open
    # log-file descriptors when setup is called more than once.
    for handler in list(logger.handlers):
        logger.removeHandler(handler)
        handler.close()

    # Don't bubble records up to the root logger (prevents duplicate output
    # when the root logger also has handlers configured)
    logger.propagate = False

    # Formatters: verbose for the file, terse for the console
    detailed_formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S'
    )
    console_formatter = logging.Formatter(
        '%(levelname)s - %(message)s'
    )

    # File handler (detailed)
    file_handler = logging.FileHandler(log_file, encoding='utf-8')
    file_handler.setLevel(log_level)
    file_handler.setFormatter(detailed_formatter)
    logger.addHandler(file_handler)

    # Console handler (simpler)
    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setLevel(log_level)
    console_handler.setFormatter(console_formatter)
    logger.addHandler(console_handler)

    # Log startup banner
    logger.info("=" * 60)
    logger.info(f"Analysis: {analysis_name or 'Unknown'}")
    logger.info(f"Company: {COMPANY_NAME}")
    logger.info(f"Log File: {log_file}")
    logger.info("=" * 60)

    _logger = logger
    return logger
|
||||||
|
|
||||||
|
def get_logger(analysis_name=None, log_level=logging.INFO):
    """
    Return the shared module-level logger, creating it on first use.

    Args:
        analysis_name: Name of analysis (only used when the logger is first created)
        log_level: Logging level (default: INFO; only used on first creation)

    Returns:
        logging.Logger: The cached logger instance
    """
    global _logger

    # Reuse the already-configured logger if one exists.
    if _logger is not None:
        return _logger

    _logger = setup_logging(log_level=log_level, analysis_name=analysis_name)
    return _logger
|
||||||
|
|
||||||
|
def log_analysis_start(analysis_name, logger=None):
    """
    Emit the standard "analysis started" messages.

    Args:
        analysis_name: Name of analysis
        logger: Logger instance (a shared one is created if None)
    """
    active = logger if logger is not None else get_logger(analysis_name)

    active.info(f"Starting analysis: {analysis_name}")
    active.info(f"Timestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
|
||||||
|
|
||||||
|
def log_analysis_end(analysis_name, success=True, logger=None):
    """
    Emit the standard "analysis finished" messages.

    Args:
        analysis_name: Name of analysis
        success: Whether the analysis completed successfully (failure logs at ERROR)
        logger: Logger instance (a shared one is created if None)
    """
    log = logger if logger is not None else get_logger(analysis_name)

    if success:
        log.info(f"Analysis completed successfully: {analysis_name}")
    else:
        log.error(f"Analysis failed: {analysis_name}")

    log.info(f"Timestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    log.info("="*60)
|
||||||
|
|
||||||
|
def log_data_loading(df, logger=None):
    """
    Log a one-look summary of a freshly loaded DataFrame.

    Logs row/column counts, plus total revenue and date coverage when the
    configured revenue/date columns are present.

    Args:
        df: Loaded DataFrame
        logger: Logger instance (a shared one is created if None)
    """
    if logger is None:
        logger = get_logger()

    logger.info(f"Data loaded: {len(df):,} rows, {len(df.columns)} columns")

    # Imported lazily so this module does not hard-depend on config at import time.
    from config import REVENUE_COLUMN, DATE_COLUMN

    if REVENUE_COLUMN in df.columns:
        logger.info(f"Total revenue: ${df[REVENUE_COLUMN].sum() / 1e6:.2f}m")

    if DATE_COLUMN in df.columns:
        coverage_pct = df[DATE_COLUMN].notna().sum() / len(df) * 100
        logger.info(f"Date coverage: {coverage_pct:.1f}%")
|
||||||
|
|
||||||
|
def log_error(error, logger=None, context=None):
    """
    Log an error (with traceback via exc_info) and optional context prefix.

    Args:
        error: Exception or error message
        logger: Logger instance (a shared one is created if None)
        context: Additional context string prepended as "context: message"
    """
    if logger is None:
        logger = get_logger()

    message = f"{context}: {error}" if context else str(error)
    logger.error(message, exc_info=True)
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# EXAMPLE USAGE
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    """Example usage"""
    logger = setup_logging(log_level=logging.DEBUG, analysis_name="Example Analysis")

    # Exercise every log level once.
    logger.debug("This is a debug message")
    logger.info("This is an info message")
    logger.warning("This is a warning message")
    logger.error("This is an error message")

    log_analysis_start("Example Analysis", logger)
    # BUGFIX: `log_analysis_end("Example Analysis", success=True, logger)` was a
    # SyntaxError (positional argument after keyword argument); the logger must
    # be passed by keyword.
    log_analysis_end("Example Analysis", success=True, logger=logger)
|
||||||
228
report_generator.py
Normal file
228
report_generator.py
Normal file
@@ -0,0 +1,228 @@
|
|||||||
|
"""
|
||||||
|
Report generation utility
|
||||||
|
Combines multiple charts and data into a PDF report
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
from report_generator import generate_pdf_report
|
||||||
|
|
||||||
|
# Generate PDF report
|
||||||
|
generate_pdf_report(
|
||||||
|
charts=['chart1.png', 'chart2.png'],
|
||||||
|
title='Sales Analysis Report',
|
||||||
|
summary_data={'Total Revenue': 1000000}
|
||||||
|
)
|
||||||
|
"""
|
||||||
|
from pathlib import Path
|
||||||
|
from datetime import datetime
|
||||||
|
from config import COMPANY_NAME, OUTPUT_DIR, REPORTS_DIR, ensure_directories
|
||||||
|
|
||||||
|
def generate_pdf_report(
    charts,
    title=None,
    summary_data=None,
    output_filename=None,
    output_dir=None
):
    """
    Generate PDF report from charts and summary data.

    Args:
        charts: List of chart file paths (PNG files); missing files are skipped
            with a warning
        title: Report title (defaults to "<COMPANY_NAME> Sales Analysis Report")
        summary_data: Dictionary of summary metrics; numeric values are rendered
            with $m / $k scaling, everything else via str()
        output_filename: Output PDF filename (defaults to report_YYYYMMDD_HHMMSS.pdf)
        output_dir: Output directory (defaults to config.REPORTS_DIR)

    Returns:
        Path: Path to generated PDF file

    Raises:
        ImportError: If reportlab is not installed
    """
    # reportlab is an optional dependency; import lazily so the module itself
    # can be imported without it.
    try:
        from reportlab.lib.pagesizes import letter, A4
        from reportlab.lib.units import inch
        from reportlab.lib import colors
        from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Image, Table, TableStyle, PageBreak
        from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
        from reportlab.lib.enums import TA_CENTER, TA_LEFT
    except ImportError:
        raise ImportError(
            "reportlab is required for PDF generation. Install with: pip install reportlab"
        )

    if output_dir is None:
        output_dir = REPORTS_DIR
    else:
        output_dir = Path(output_dir)

    ensure_directories()
    # BUGFIX: also create missing parent directories — a nested custom
    # output_dir previously raised FileNotFoundError (matches the
    # mkdir(parents=True, exist_ok=True) convention used elsewhere in this repo).
    output_dir.mkdir(parents=True, exist_ok=True)

    # Default filename
    if output_filename is None:
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        output_filename = f"report_{timestamp}.pdf"

    output_path = output_dir / output_filename

    # Create PDF document
    doc = SimpleDocTemplate(
        str(output_path),
        pagesize=letter,
        rightMargin=0.75*inch,
        leftMargin=0.75*inch,
        topMargin=0.75*inch,
        bottomMargin=0.75*inch
    )

    # Container for PDF elements
    story = []

    # Styles
    styles = getSampleStyleSheet()
    title_style = ParagraphStyle(
        'CustomTitle',
        parent=styles['Heading1'],
        fontSize=20,
        textColor=colors.HexColor('#2E86AB'),
        spaceAfter=30,
        alignment=TA_CENTER
    )

    heading_style = ParagraphStyle(
        'CustomHeading',
        parent=styles['Heading2'],
        fontSize=14,
        textColor=colors.HexColor('#2E86AB'),
        spaceAfter=12
    )

    # Title
    if title is None:
        title = f"{COMPANY_NAME} Sales Analysis Report"

    story.append(Paragraph(title, title_style))
    story.append(Spacer(1, 0.2*inch))

    # Report metadata
    metadata_text = f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
    story.append(Paragraph(metadata_text, styles['Normal']))
    story.append(Spacer(1, 0.3*inch))

    # Summary data table
    if summary_data:
        story.append(Paragraph("Summary", heading_style))

        # Create table: header row + one row per metric.
        table_data = [['Metric', 'Value']]
        for key, value in summary_data.items():
            # Format numeric values with dollar scaling; leave others as-is.
            if isinstance(value, (int, float)):
                if abs(value) >= 1e6:
                    formatted_value = f"${value / 1e6:.2f}m"
                elif abs(value) >= 1e3:
                    formatted_value = f"${value / 1e3:.2f}k"
                else:
                    formatted_value = f"${value:.2f}"
            else:
                formatted_value = str(value)

            table_data.append([key, formatted_value])

        table = Table(table_data, colWidths=[3*inch, 2*inch])
        table.setStyle(TableStyle([
            ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#2E86AB')),
            ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
            ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
            ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
            ('FONTSIZE', (0, 0), (-1, 0), 12),
            ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
            ('BACKGROUND', (0, 1), (-1, -1), colors.beige),
            ('GRID', (0, 0), (-1, -1), 1, colors.black),
            ('ROWBACKGROUNDS', (0, 1), (-1, -1), [colors.white, colors.lightgrey])
        ]))

        story.append(table)
        story.append(Spacer(1, 0.3*inch))

    # Add charts
    if charts:
        story.append(Paragraph("Charts", heading_style))

        for i, chart_path in enumerate(charts, 1):
            chart_path = Path(chart_path)

            # Skip (but report) charts that do not exist on disk.
            if not chart_path.exists():
                print(f"Warning: Chart not found: {chart_path}")
                continue

            # Add chart title derived from the file name.
            chart_title = f"Chart {i}: {chart_path.stem.replace('_', ' ').title()}"
            story.append(Paragraph(chart_title, styles['Heading3']))
            story.append(Spacer(1, 0.1*inch))

            # Add image (fixed 6x4 inch box; unreadable images become an
            # inline error paragraph instead of aborting the report).
            try:
                img = Image(str(chart_path), width=6*inch, height=4*inch)
                story.append(img)
            except Exception as e:
                error_msg = f"Error loading chart: {e}"
                story.append(Paragraph(error_msg, styles['Normal']))

            # Add page break between charts (except last one)
            if i < len(charts):
                story.append(PageBreak())

    # Build PDF
    doc.build(story)

    print(f"PDF report generated: {output_path}")

    return output_path
|
||||||
|
|
||||||
|
def generate_simple_report(charts, title=None, output_filename=None):
    """
    Convenience wrapper around generate_pdf_report with default settings
    (no summary table, default output directory).

    Args:
        charts: List of chart file paths
        title: Report title
        output_filename: Output filename

    Returns:
        Path: Path to generated PDF
    """
    report_path = generate_pdf_report(
        charts=charts,
        title=title,
        output_filename=output_filename,
    )
    return report_path
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# EXAMPLE USAGE
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Example usage: bundle any PNG charts found in OUTPUT_DIR into one PDF.
    from config import OUTPUT_DIR

    chart_files = list(OUTPUT_DIR.glob('*.png'))

    if not chart_files:
        print("No charts found in output directory")
    else:
        print(f"Found {len(chart_files)} charts")

        # Generate report (cap at the first five charts).
        report_path = generate_pdf_report(
            charts=[str(f) for f in chart_files[:5]],  # Limit to 5 charts
            title="Sales Analysis Report",
            summary_data={
                'Total Charts': len(chart_files),
                'Report Date': datetime.now().strftime('%Y-%m-%d')
            }
        )

        print(f"Report saved to: {report_path}")
|
||||||
30
requirements.txt
Normal file
30
requirements.txt
Normal file
@@ -0,0 +1,30 @@
|
|||||||
|
# Python dependencies for Sales Analysis Template
|
||||||
|
# Install with: pip install -r requirements.txt
|
||||||
|
|
||||||
|
# Core data analysis
|
||||||
|
pandas>=2.0.0
|
||||||
|
numpy>=1.24.0
|
||||||
|
|
||||||
|
# Visualization
|
||||||
|
matplotlib>=3.7.0
|
||||||
|
seaborn>=0.12.0
|
||||||
|
|
||||||
|
# Export utilities (optional - uncomment if needed)
|
||||||
|
# openpyxl>=3.1.0 # For Excel export (export_utils.py)
|
||||||
|
|
||||||
|
# Interactive visualizations (optional - uncomment if needed)
|
||||||
|
# plotly>=5.17.0 # For interactive charts (analysis_utils.py)
|
||||||
|
|
||||||
|
# Report generation (optional - uncomment if needed)
|
||||||
|
# reportlab>=4.0.0 # For PDF reports (report_generator.py)
|
||||||
|
|
||||||
|
# Statistical analysis (optional - uncomment if needed)
|
||||||
|
# scipy>=1.10.0 # For statistical analysis, product lifecycle (statistical_utils.py)
|
||||||
|
|
||||||
|
# Testing (optional - uncomment if needed)
|
||||||
|
# pytest>=7.4.0 # For unit tests
|
||||||
|
|
||||||
|
# Advanced analysis (optional - uncomment if needed)
|
||||||
|
# pmdarima>=2.0.0 # For time series forecasting
|
||||||
|
# mlxtend>=0.22.0 # For market basket analysis
|
||||||
|
# scikit-learn>=1.3.0 # For machine learning analyses
|
||||||
185
run_all_analyses.py
Normal file
185
run_all_analyses.py
Normal file
@@ -0,0 +1,185 @@
|
|||||||
|
"""
|
||||||
|
Batch runner for all analysis scripts
|
||||||
|
Runs all analyses in sequence and generates a summary report
|
||||||
|
|
||||||
|
To use:
|
||||||
|
1. Add your analysis scripts to the ANALYSIS_SCRIPTS list below
|
||||||
|
2. Run: python run_all_analyses.py
|
||||||
|
"""
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
from datetime import datetime
|
||||||
|
import time
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# CONFIGURATION
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
# List of analysis scripts to run
|
||||||
|
# TODO: Add your analysis scripts here
|
||||||
|
ANALYSIS_SCRIPTS = [
|
||||||
|
# Example structure - customize for your analyses:
|
||||||
|
# 'check_annual_revenue.py',
|
||||||
|
# 'revenue_analysis.py',
|
||||||
|
# 'geographic_analysis.py',
|
||||||
|
# 'customer_segmentation.py',
|
||||||
|
# 'product_analysis.py',
|
||||||
|
# Add your analysis scripts here...
|
||||||
|
]
|
||||||
|
|
||||||
|
# Timeout per script (in seconds)
|
||||||
|
SCRIPT_TIMEOUT = 600 # 10 minutes
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# HELPER FUNCTIONS
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
def run_script(script_path):
    """
    Run a single analysis script in a subprocess.

    Prints a banner, the outcome, and (on success) up to the last ten lines
    of the script's stdout.

    Args:
        script_path: Path to the script to execute

    Returns:
        tuple: (success: bool, elapsed_seconds: float, error: str | None)
    """
    script_name = Path(script_path).name
    print(f"\n{'='*60}")
    print(f"Running: {script_name}")
    print(f"{'='*60}")

    started = time.time()

    try:
        proc = subprocess.run(
            [sys.executable, script_path],
            capture_output=True,
            text=True,
            timeout=SCRIPT_TIMEOUT
        )
    except subprocess.TimeoutExpired:
        elapsed = time.time() - started
        print(f"⏱️ {script_name} timed out after {elapsed:.1f}s")
        return False, elapsed, "Timeout"
    except Exception as e:
        elapsed = time.time() - started
        print(f"❌ {script_name} error: {str(e)}")
        return False, elapsed, str(e)

    elapsed = time.time() - started

    if proc.returncode != 0:
        print(f"❌ {script_name} failed ({elapsed:.1f}s)")
        if proc.stderr:
            print(f" Error: {proc.stderr[:500]}")
        return False, elapsed, proc.stderr

    print(f"✅ {script_name} completed successfully ({elapsed:.1f}s)")
    if proc.stdout:
        # Show only the tail of long output.
        lines = proc.stdout.strip().split('\n')
        if len(lines) > 10:
            print(" ... (output truncated)")
            lines = lines[-10:]
        for line in lines:
            print(f" {line}")
    return True, elapsed, None
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# MAIN FUNCTION
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
def main():
    """
    Run every script in ANALYSIS_SCRIPTS, print a console summary, and write
    analysis_run_summary.txt with the results.

    Scripts listed but missing on disk are reported and skipped; the run
    aborts early if none of the configured scripts exist.
    """
    from config import COMPANY_NAME

    print(f"\n{'='*60}")
    print(f"{COMPANY_NAME} Sales Analysis - Batch Runner")
    print(f"Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"{'='*60}\n")

    # Check which scripts exist
    existing_scripts = []
    missing_scripts = []

    for script in ANALYSIS_SCRIPTS:
        script_path = Path(script)
        if script_path.exists():
            existing_scripts.append(script)
        else:
            missing_scripts.append(script)

    if missing_scripts:
        print(f"⚠️ Warning: {len(missing_scripts)} scripts not found:")
        for script in missing_scripts:
            print(f" - {script}")
        print()

    if not existing_scripts:
        print("❌ No analysis scripts found!")
        print(" Please add analysis scripts to ANALYSIS_SCRIPTS list in run_all_analyses.py")
        return

    print(f"Found {len(existing_scripts)} analysis scripts to run\n")

    # Run scripts
    results = []
    total_start = time.time()

    for script in existing_scripts:
        success, elapsed, error = run_script(script)
        results.append({
            'script': script,
            'success': success,
            'elapsed': elapsed,
            'error': error
        })

    total_elapsed = time.time() - total_start

    # Print summary
    print(f"\n{'='*60}")
    print("Batch Run Summary")
    print(f"{'='*60}\n")

    successful = [r for r in results if r['success']]
    failed = [r for r in results if not r['success']]

    print(f"Total scripts: {len(results)}")
    print(f"✅ Successful: {len(successful)}")
    print(f"❌ Failed: {len(failed)}")
    print(f"⏱️ Total time: {total_elapsed/60:.1f} minutes\n")

    if failed:
        print("Failed scripts:")
        for r in failed:
            print(f" ❌ {r['script']} ({r['elapsed']:.1f}s)")
            if r['error']:
                print(f" Error: {r['error'][:100]}")
        print()

    # Save summary to file
    summary_file = Path('analysis_run_summary.txt')
    # BUGFIX: force UTF-8 — the status markers (✅/❌) below raised
    # UnicodeEncodeError on platforms whose default encoding is not UTF-8
    # (e.g. Windows cp1252); the rest of this repo already opens files with
    # encoding='utf-8'.
    with open(summary_file, 'w', encoding='utf-8') as f:
        f.write(f"{COMPANY_NAME} Sales Analysis - Batch Run Summary\n")
        f.write(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
        f.write(f"{'='*60}\n\n")
        f.write(f"Total scripts: {len(results)}\n")
        f.write(f"Successful: {len(successful)}\n")
        f.write(f"Failed: {len(failed)}\n")
        f.write(f"Total time: {total_elapsed/60:.1f} minutes\n\n")

        if successful:
            f.write("Successful scripts:\n")
            for r in successful:
                f.write(f" ✅ {r['script']} ({r['elapsed']:.1f}s)\n")
            f.write("\n")

        if failed:
            f.write("Failed scripts:\n")
            for r in failed:
                f.write(f" ❌ {r['script']} ({r['elapsed']:.1f}s)\n")
                if r['error']:
                    f.write(f" Error: {r['error']}\n")

    print(f"Summary saved to: {summary_file}")


if __name__ == "__main__":
    main()
|
||||||
240
setup_wizard.py
Normal file
240
setup_wizard.py
Normal file
@@ -0,0 +1,240 @@
|
|||||||
|
"""
|
||||||
|
Interactive setup wizard for configuring the sales analysis template
|
||||||
|
Asks clarifying questions to configure config.py for your specific company and data
|
||||||
|
"""
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
def print_header(text):
    """Print *text* framed above and below by 70-character '=' rules."""
    bar = "=" * 70
    print("\n" + bar)
    print(f" {text}")
    print(bar + "\n")
|
||||||
|
|
||||||
|
def ask_question(prompt, default=None, validator=None):
    """
    Ask a question on stdin and return the answer.

    Args:
        prompt: Question to ask
        default: Default value if user just presses Enter
        validator: Optional function to validate/convert input; re-prompts
            until it succeeds

    Returns:
        The validator's return value when a validator is given, otherwise the
        raw answer (or default) string.
    """
    if default:
        full_prompt = f"{prompt} [{default}]: "
    else:
        full_prompt = f"{prompt}: "

    while True:
        answer = input(full_prompt).strip()
        if not answer:
            if default:
                # BUGFIX: the default used to be returned WITHOUT running the
                # validator, so e.g. default="no" with validate_yes_no came
                # back as the truthy string "no" instead of False, and numeric
                # defaults stayed strings. Route the default through the same
                # validation path as typed input.
                answer = default
            else:
                print(" Please provide an answer.")
                continue

        if validator:
            try:
                return validator(answer)
            except Exception as e:
                print(f" Invalid input: {e}")
                continue

        return answer
|
||||||
|
|
||||||
|
def validate_yes_no(answer):
    """Interpret *answer* as a boolean; raise ValueError if unrecognized."""
    normalized = answer.lower()
    if normalized in ('y', 'yes', 'true', '1'):
        return True
    if normalized in ('n', 'no', 'false', '0'):
        return False
    raise ValueError("Please answer 'yes' or 'no'")
|
||||||
|
|
||||||
|
def validate_int(answer):
    """Parse *answer* as a base-10 integer (int() raises ValueError otherwise)."""
    value = int(answer)
    return value
|
||||||
|
|
||||||
|
def validate_file_exists(answer):
    """Return *answer* unchanged if it names an existing path, else raise ValueError."""
    if Path(answer).exists():
        return answer
    raise ValueError(f"File not found: {answer}")
|
||||||
|
|
||||||
|
def main():
    """
    Run the interactive setup wizard.

    Collects company, data-file, column-mapping, date-range, LTM and
    exclusion-filter settings via stdin prompts, then rewrites config.py in
    place by substituting the template's known default literals with the
    collected values. If config.py is missing, the wizard aborts.

    NOTE(review): the replacement step only matches the exact default strings
    shipped in the template's config.py — a previously customized config.py
    will be left partially unchanged.
    """
    print_header("Sales Analysis Template - Setup Wizard")
    print("This wizard will help you configure the template for your company's data.")
    print("You can press Enter to accept defaults (shown in brackets).\n")

    # All collected answers, keyed by setting name.
    responses = {}

    # Company Information
    print_header("Company Information")
    responses['company_name'] = ask_question("Company Name", default="Your Company Name")
    responses['analysis_date'] = ask_question("Analysis Date (YYYY-MM-DD)", default="2026-01-12")

    # Data File
    print_header("Data File Configuration")
    print("Where is your sales data CSV file located?")
    data_file = ask_question("Data file name (e.g., sales_data.csv)", default="sales_data.csv")

    # Check if file exists (informational only; a missing file is not fatal)
    if Path(data_file).exists():
        print(f" ✓ Found: {data_file}")
    else:
        print(f" ⚠ Warning: {data_file} not found. Make sure to place it in the template directory.")

    responses['data_file'] = data_file

    # Column Mapping
    print_header("Column Mapping")
    print("What are the column names in your CSV file?")
    print("(Press Enter to accept defaults if your columns match common names)\n")

    responses['revenue_column'] = ask_question("Revenue/Amount column name", default="USD")
    responses['date_column'] = ask_question("Primary date column name", default="InvoiceDate")

    has_fallback = ask_question("Do you have fallback date columns (Month, Year)?", default="yes", validator=validate_yes_no)
    if has_fallback:
        fallback_str = ask_question("Fallback date columns (comma-separated)", default="Month, Year")
        responses['date_fallback'] = [col.strip() for col in fallback_str.split(',')]
    else:
        responses['date_fallback'] = []

    responses['customer_column'] = ask_question("Customer/Account column name", default="Customer")
    responses['item_column'] = ask_question("Item/Product column name", default="Item")

    has_quantity = ask_question("Do you have a Quantity column?", default="yes", validator=validate_yes_no)
    if has_quantity:
        responses['quantity_column'] = ask_question("Quantity column name", default="Quantity")
    else:
        # None signals "no quantity column" to the replacement table below.
        responses['quantity_column'] = None

    # Date Range
    print_header("Date Range Configuration")
    responses['min_year'] = ask_question("Minimum year to include in analysis", default="2021", validator=validate_int)
    responses['max_date'] = ask_question("Maximum date (YYYY-MM-DD)", default="2025-09-30")

    years_str = ask_question("Analysis years (comma-separated, e.g., 2021,2022,2023,2024,2025)", default="2021,2022,2023,2024,2025")
    responses['analysis_years'] = [int(y.strip()) for y in years_str.split(',')]

    # LTM Configuration
    print_header("LTM (Last Twelve Months) Configuration")
    print("LTM is used for the most recent partial year to enable apples-to-apples comparison.")
    print("Example: If your latest data is through September 2025, use Oct 2024 - Sep 2025.\n")

    use_ltm = ask_question("Do you need LTM for the most recent year?", default="yes", validator=validate_yes_no)
    responses['ltm_enabled'] = use_ltm

    if use_ltm:
        responses['ltm_start_month'] = ask_question("LTM start month (1-12)", default="10", validator=validate_int)
        responses['ltm_start_year'] = ask_question("LTM start year", default="2024", validator=validate_int)
        responses['ltm_end_month'] = ask_question("LTM end month (1-12)", default="9", validator=validate_int)
        responses['ltm_end_year'] = ask_question("LTM end year", default="2025", validator=validate_int)
    else:
        # Keep template defaults so the config constants stay well-formed even
        # when LTM is disabled.
        responses['ltm_start_month'] = 10
        responses['ltm_start_year'] = 2024
        responses['ltm_end_month'] = 9
        responses['ltm_end_year'] = 2025

    # Exclusion Filters
    print_header("Exclusion Filters (Optional)")
    use_exclusions = ask_question("Do you need to exclude specific segments (e.g., test accounts, business units)?", default="no", validator=validate_yes_no)
    responses['exclusions_enabled'] = use_exclusions

    if use_exclusions:
        responses['exclude_column'] = ask_question("Column name to filter on", default="Country")
        exclude_values_str = ask_question("Values to exclude (comma-separated)", default="")
        responses['exclude_values'] = [v.strip() for v in exclude_values_str.split(',') if v.strip()]
    else:
        responses['exclude_column'] = None
        responses['exclude_values'] = []

    # Generate config.py
    print_header("Generating Configuration")
    print("Updating config.py with your settings...")

    # Read current config.py
    config_path = Path('config.py')
    if not config_path.exists():
        print("ERROR: config.py not found!")
        return

    with open(config_path, 'r', encoding='utf-8') as f:
        config_content = f.read()

    # Replace values: maps the template's exact default assignment lines to
    # the user's new assignment lines.
    replacements = {
        "COMPANY_NAME = \"Your Company Name\"": f"COMPANY_NAME = \"{responses['company_name']}\"",
        "ANALYSIS_DATE = \"2026-01-12\"": f"ANALYSIS_DATE = \"{responses['analysis_date']}\"",
        "DATA_FILE = 'sales_data.csv'": f"DATA_FILE = '{responses['data_file']}'",
        "REVENUE_COLUMN = 'USD'": f"REVENUE_COLUMN = '{responses['revenue_column']}'",
        "DATE_COLUMN = 'InvoiceDate'": f"DATE_COLUMN = '{responses['date_column']}'",
        "DATE_FALLBACK_COLUMNS = ['Month', 'Year']": f"DATE_FALLBACK_COLUMNS = {responses['date_fallback']}",
        "CUSTOMER_COLUMN = 'Customer'": f"CUSTOMER_COLUMN = '{responses['customer_column']}'",
        "ITEM_COLUMN = 'Item'": f"ITEM_COLUMN = '{responses['item_column']}'",
        "QUANTITY_COLUMN = 'Quantity'": f"QUANTITY_COLUMN = '{responses['quantity_column']}'" if responses['quantity_column'] else "QUANTITY_COLUMN = None",
        "MIN_YEAR = 2021": f"MIN_YEAR = {responses['min_year']}",
        "MAX_DATE = pd.Timestamp('2025-09-30')": f"MAX_DATE = pd.Timestamp('{responses['max_date']}')",
        "ANALYSIS_YEARS = [2021, 2022, 2023, 2024, 2025]": f"ANALYSIS_YEARS = {responses['analysis_years']}",
        "LTM_ENABLED = True": f"LTM_ENABLED = {responses['ltm_enabled']}",
        "LTM_START_MONTH = 10": f"LTM_START_MONTH = {responses['ltm_start_month']}",
        "LTM_START_YEAR = 2024": f"LTM_START_YEAR = {responses['ltm_start_year']}",
        "LTM_END_MONTH = 9": f"LTM_END_MONTH = {responses['ltm_end_month']}",
        "LTM_END_YEAR = 2025": f"LTM_END_YEAR = {responses['ltm_end_year']}",
    }

    # Handle exclusions: the EXCLUSION_FILTERS dict is rebuilt wholesale via a
    # non-greedy regex over the existing dict literal.
    if responses['exclusions_enabled']:
        exclusions_config = f"""EXCLUSION_FILTERS = {{
    'enabled': True,
    'exclude_by_column': '{responses['exclude_column']}',
    'exclude_values': {responses['exclude_values']}
}}"""
        # Replace the exclusion filters section
        import re
        pattern = r"EXCLUSION_FILTERS = \{.*?\}"
        config_content = re.sub(pattern, exclusions_config, config_content, flags=re.DOTALL)
    else:
        exclusions_config = """EXCLUSION_FILTERS = {
    'enabled': False,
    'exclude_by_column': None,
    'exclude_values': []
}"""
        import re
        pattern = r"EXCLUSION_FILTERS = \{.*?\}"
        config_content = re.sub(pattern, exclusions_config, config_content, flags=re.DOTALL)

    # Apply replacements (silently skipped when the default literal is absent)
    for old, new in replacements.items():
        if old in config_content:
            config_content = config_content.replace(old, new)

    # Write updated config
    with open(config_path, 'w', encoding='utf-8') as f:
        f.write(config_content)

    print(" ✓ Configuration updated successfully!")

    # Summary
    print_header("Setup Complete")
    print("Your configuration has been saved to config.py")
    print("\nNext steps:")
    print("1. Place your data file in the template directory (if not already there)")
    print("2. Test data loading: python -c \"from data_loader import load_sales_data; from config import get_data_path; df = load_sales_data(get_data_path()); print(f'Loaded {len(df):,} rows')\"")
    print("3. Review config.py and adjust any settings as needed")
    print("4. Start creating your analysis scripts using analysis_template.py")
    print("\nFor help, see README.md")
||||||
|
if __name__ == "__main__":
    # Allow the wizard to be aborted with Ctrl+C without dumping a traceback.
    try:
        main()
    except KeyboardInterrupt:
        print("\n\nSetup cancelled by user.")
        sys.exit(0)
|
||||||
321
statistical_utils.py
Normal file
321
statistical_utils.py
Normal file
@@ -0,0 +1,321 @@
|
|||||||
|
"""
|
||||||
|
Statistical analysis utilities
|
||||||
|
Common statistical operations for sales analysis
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
from statistical_utils import calculate_yoy_growth, calculate_cagr, calculate_correlation
|
||||||
|
|
||||||
|
# Calculate year-over-year growth
|
||||||
|
growth = calculate_yoy_growth(current_value=100, previous_value=90)
|
||||||
|
|
||||||
|
# Calculate CAGR
|
||||||
|
cagr = calculate_cagr(start_value=100, end_value=150, periods=3)
|
||||||
|
"""
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
from scipy import stats
|
||||||
|
|
||||||
|
def calculate_yoy_growth(current, previous):
    """
    Calculate year-over-year growth percentage.

    Args:
        current: Current period value
        previous: Previous period value

    Returns:
        float: Growth percentage (negative for a decline). NaN when both
               values are zero, +inf when growing from a zero base.

    Example:
        calculate_yoy_growth(110, 100)  # Returns 10.0
        calculate_yoy_growth(90, 100)   # Returns -10.0
    """
    # Growth from a zero base is undefined: 0 -> 0 is NaN, 0 -> x is infinite.
    if previous == 0:
        if current == 0:
            return np.nan
        return np.inf

    delta = current - previous
    return (delta / previous) * 100
|
||||||
|
|
||||||
|
def calculate_cagr(start_value, end_value, periods):
    """
    Calculate Compound Annual Growth Rate (CAGR).

    Args:
        start_value: Starting value
        end_value: Ending value
        periods: Number of periods (years)

    Returns:
        float: CAGR as a percentage, or NaN for non-positive inputs.

    Example:
        calculate_cagr(100, 150, 3)  # Returns ~14.47%
    """
    # CAGR is only defined for positive endpoints over a positive horizon.
    if start_value <= 0 or periods <= 0 or end_value <= 0:
        return np.nan

    growth_factor = end_value / start_value
    return (growth_factor ** (1 / periods) - 1) * 100
|
||||||
|
|
||||||
|
def calculate_correlation(df, col1, col2):
    """
    Calculate correlation between two columns.

    Args:
        df: DataFrame
        col1: First column name
        col2: Second column name

    Returns:
        float: Pearson correlation coefficient (-1 to 1), or NaN when a
               column is missing or fewer than two valid pairs exist.
    """
    if col1 not in df.columns or col2 not in df.columns:
        return np.nan

    # Coerce both columns to numeric; non-numeric entries become NaN.
    x = pd.to_numeric(df[col1], errors='coerce')
    y = pd.to_numeric(df[col2], errors='coerce')

    # Keep only rows where both sides are present.
    keep = x.notna() & y.notna()
    if keep.sum() < 2:
        return np.nan

    return x[keep].corr(y[keep])
|
||||||
|
|
||||||
|
def calculate_trend_slope(y_values):
    """
    Calculate the slope of an ordinary least-squares linear trend.

    The x axis is the 0-based position of each observation, so the slope is
    expressed in value-units per period. NaN observations are dropped before
    fitting.

    Args:
        y_values: Array-like of y values (list, ndarray, or Series)

    Returns:
        float: Slope of the linear trend, or NaN when fewer than two valid
               points remain.
    """
    # Coerce to a float ndarray so boolean masking works for plain Python
    # lists too (indexing a list with a boolean array raised TypeError in
    # the previous implementation) and so np.isnan is well-defined.
    y = np.asarray(y_values, dtype=float)
    if y.size < 2:
        return np.nan

    x = np.arange(y.size)

    # Drop NaN observations before fitting.
    valid = ~np.isnan(y)
    if valid.sum() < 2:
        return np.nan

    result = stats.linregress(x[valid], y[valid])
    return result.slope
|
||||||
|
|
||||||
|
def calculate_percent_change(series, periods=1):
    """
    Calculate percent change over periods.

    Args:
        series: Pandas Series
        periods: Number of periods to shift (default: 1)

    Returns:
        Series: Percent change (the first `periods` entries are NaN).
    """
    fractional_change = series.pct_change(periods=periods)
    return fractional_change * 100
|
||||||
|
|
||||||
|
def calculate_moving_average(series, window=3):
    """
    Calculate moving average.

    Args:
        series: Pandas Series
        window: Window size for moving average

    Returns:
        Series: Trailing moving average (NaN until the window fills).
    """
    windowed = series.rolling(window=window, center=False)
    return windowed.mean()
|
||||||
|
|
||||||
|
def calculate_volatility(series, window=12):
    """
    Calculate rolling volatility (standard deviation).

    Args:
        series: Pandas Series
        window: Window size for rolling calculation

    Returns:
        Series: Rolling sample standard deviation (NaN until the window fills).
    """
    windowed = series.rolling(window=window, center=False)
    return windowed.std()
|
||||||
|
|
||||||
|
def calculate_z_score(value, mean, std):
    """
    Calculate z-score.

    Args:
        value: Value to score
        mean: Mean of distribution
        std: Standard deviation of distribution

    Returns:
        float: Z-score, or NaN for a degenerate (zero-spread) distribution.
    """
    # A zero standard deviation would divide by zero.
    if std == 0:
        return np.nan

    deviation = value - mean
    return deviation / std
|
||||||
|
|
||||||
|
def test_statistical_significance(group1, group2, alpha=0.05):
|
||||||
|
"""
|
||||||
|
Test statistical significance between two groups (t-test)
|
||||||
|
|
||||||
|
Args:
|
||||||
|
group1: First group (array-like)
|
||||||
|
group2: Second group (array-like)
|
||||||
|
alpha: Significance level (default: 0.05)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
dict: Test results with p-value, significant flag, etc.
|
||||||
|
"""
|
||||||
|
group1 = np.array(group1)
|
||||||
|
group2 = np.array(group2)
|
||||||
|
|
||||||
|
# Remove NaN values
|
||||||
|
group1 = group1[~np.isnan(group1)]
|
||||||
|
group2 = group2[~np.isnan(group2)]
|
||||||
|
|
||||||
|
if len(group1) < 2 or len(group2) < 2:
|
||||||
|
return {
|
||||||
|
'p_value': np.nan,
|
||||||
|
'significant': False,
|
||||||
|
'test_statistic': np.nan,
|
||||||
|
'error': 'Insufficient data'
|
||||||
|
}
|
||||||
|
|
||||||
|
# Perform t-test
|
||||||
|
t_statistic, p_value = stats.ttest_ind(group1, group2)
|
||||||
|
|
||||||
|
return {
|
||||||
|
'p_value': float(p_value),
|
||||||
|
'significant': p_value < alpha,
|
||||||
|
'test_statistic': float(t_statistic),
|
||||||
|
'alpha': alpha,
|
||||||
|
'group1_mean': float(np.mean(group1)),
|
||||||
|
'group2_mean': float(np.mean(group2)),
|
||||||
|
'group1_std': float(np.std(group1)),
|
||||||
|
'group2_std': float(np.std(group2))
|
||||||
|
}
|
||||||
|
|
||||||
|
def calculate_confidence_interval(series, confidence=0.95):
    """
    Calculate a normal-approximation confidence interval for a series mean.

    Args:
        series: Pandas Series
        confidence: Confidence level (default: 0.95 for 95%)

    Returns:
        dict: mean, lower, upper, confidence, and margin; NaN fields (and no
              'margin' key) when the series has no valid values.
    """
    values = series.dropna()

    if values.empty:
        return {
            'mean': np.nan,
            'lower': np.nan,
            'upper': np.nan,
            'confidence': confidence
        }

    center = values.mean()
    n = len(values)

    # Standard error of the mean (sample std / sqrt(n)).
    standard_error = values.std() / np.sqrt(n)

    # Two-sided critical value from the normal distribution.
    tail = (1 - confidence) / 2
    z_critical = stats.norm.ppf(1 - tail)
    margin = z_critical * standard_error

    return {
        'mean': float(center),
        'lower': float(center - margin),
        'upper': float(center + margin),
        'confidence': confidence,
        'margin': float(margin)
    }
|
||||||
|
|
||||||
|
def calculate_annual_growth_rates(values, years):
    """
    Calculate year-over-year growth rates for annual data.

    Args:
        values: Array-like of annual values
        years: Array-like of corresponding years

    Returns:
        DataFrame: Year, Value, YoY_Growth (percent), and YoY_Change
                   (absolute) columns; the first row's growth fields are NaN.
    """
    result = pd.DataFrame({
        'Year': years,
        'Value': values
    })

    # Percent growth vs the prior year (inlined percent-change helper).
    result['YoY_Growth'] = result['Value'].pct_change(periods=1) * 100
    # Absolute change vs the prior year.
    result['YoY_Change'] = result['Value'].diff()

    return result
|
||||||
|
|
||||||
|
def calculate_seasonality_index(monthly_series):
    """
    Calculate a per-month seasonality index for monthly data.

    Each calendar month's index is its average value divided by the overall
    average: 1.0 = average, >1.0 = above average, <1.0 = below average.

    Args:
        monthly_series: Series with datetime index (monthly frequency)

    Returns:
        Series: Seasonality index keyed by calendar month (1-12)

    Raises:
        ValueError: If the series does not have a DatetimeIndex.
    """
    if not isinstance(monthly_series.index, pd.DatetimeIndex):
        raise ValueError("Series must have DatetimeIndex")

    # Group by calendar month directly via the index. The previous
    # implementation assigned a 'Month' column onto a *Series* (invalid,
    # raises) and then grouped by the column name; grouping on
    # index.month achieves the intended behavior.
    monthly_avg = monthly_series.groupby(monthly_series.index.month).mean()
    overall_avg = monthly_series.mean()

    return monthly_avg / overall_avg
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# EXAMPLE USAGE
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    """Example usage"""
    # Smoke demo of the main helpers; output matches the documented examples.
    print(f"Year-over-year growth: {calculate_yoy_growth(110, 100):.2f}%")

    print(f"CAGR: {calculate_cagr(100, 150, 3):.2f}%")

    # Perfectly correlated sample data.
    demo = pd.DataFrame({
        'Revenue': [100, 110, 120, 130, 140],
        'Quantity': [10, 11, 12, 13, 14]
    })
    print(f"Correlation: {calculate_correlation(demo, 'Revenue', 'Quantity'):.2f}")
|
||||||
85
tests/test_analysis_utils.py
Normal file
85
tests/test_analysis_utils.py
Normal file
@@ -0,0 +1,85 @@
|
|||||||
|
"""
|
||||||
|
Unit tests for analysis_utils.py
|
||||||
|
"""
|
||||||
|
import pytest
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
from pathlib import Path
|
||||||
|
import sys
|
||||||
|
|
||||||
|
# Add parent directory to path
|
||||||
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||||
|
|
||||||
|
from analysis_utils import (
|
||||||
|
millions_formatter, thousands_formatter,
|
||||||
|
get_millions_formatter, get_thousands_formatter,
|
||||||
|
format_currency, calculate_price_per_unit,
|
||||||
|
sort_mixed_years, safe_year_labels
|
||||||
|
)
|
||||||
|
|
||||||
|
class TestFormatters:
    """Formatting helpers should render currency suffixes correctly."""

    def test_millions_formatter(self):
        """Millions formatter renders '$X.Xm'."""
        for value, expected in [(10.5, '$10.5m'), (0, '$0.0m'), (100.0, '$100.0m')]:
            assert millions_formatter(value, None) == expected

    def test_thousands_formatter(self):
        """Thousands formatter renders '$X.Xk'."""
        for value, expected in [(10.5, '$10.5k'), (0, '$0.0k')]:
            assert thousands_formatter(value, None) == expected

    def test_format_currency(self):
        """format_currency scales to millions/thousands and handles NaN."""
        assert format_currency(1000000) == '$1.00m'
        assert format_currency(1000, millions=False) == '$1.00k'
        assert format_currency(np.nan) == 'N/A'
|
||||||
|
|
||||||
|
class TestPriceCalculation:
    """Price-per-unit calculation behaviour."""

    def test_calculate_price_per_unit(self):
        """Aggregate revenue divided by aggregate quantity."""
        frame = pd.DataFrame({
            'Quantity': [10, 20, 30],
            'Revenue': [100, 200, 300]
        })

        # (100 + 200 + 300) / (10 + 20 + 30)
        assert calculate_price_per_unit(frame, 'Quantity', 'Revenue') == 10.0

    def test_calculate_price_per_unit_with_outliers(self):
        """Rows above the default quantity threshold are excluded."""
        frame = pd.DataFrame({
            'Quantity': [10, 20, 30, 2000],  # 2000 is an outlier
            'Revenue': [100, 200, 300, 10000]
        })

        # The 2000-unit row should be dropped (default cutoff > 1000),
        # leaving the same 10.0 average as the clean data.
        assert calculate_price_per_unit(frame, 'Quantity', 'Revenue') == 10.0
|
||||||
|
|
||||||
|
class TestYearHandling:
    """Mixed int/str year utilities."""

    def test_sort_mixed_years(self):
        """Numeric years sort first; LTM string labels sort last."""
        frame = pd.DataFrame({
            'Year': [2023, '2025 (LTM)', 2024, 2022],
            'Value': [100, 150, 120, 90]
        })

        ordered = sort_mixed_years(frame, 'Year')
        assert ordered['Year'].iloc[0] == 2022
        assert ordered['Year'].iloc[-1] == '2025 (LTM)'

    def test_safe_year_labels(self):
        """All year values become display strings."""
        labels = safe_year_labels([2021, 2022, '2025 (LTM)'])
        assert labels == ['2021', '2022', '2025 (LTM)']


if __name__ == "__main__":
    pytest.main([__file__, '-v'])
|
||||||
45
tests/test_config_validator.py
Normal file
45
tests/test_config_validator.py
Normal file
@@ -0,0 +1,45 @@
|
|||||||
|
"""
|
||||||
|
Unit tests for config_validator.py
|
||||||
|
"""
|
||||||
|
import pytest
|
||||||
|
import pandas as pd
|
||||||
|
from pathlib import Path
|
||||||
|
import sys
|
||||||
|
|
||||||
|
# Add parent directory to path
|
||||||
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||||
|
|
||||||
|
from config_validator import validate_config
|
||||||
|
|
||||||
|
class TestConfigValidator:
    """Test configuration validation."""

    def test_validate_config_missing_column(self):
        """Validation reports missing required columns as errors."""
        df = pd.DataFrame({
            'SomeColumn': [1, 2, 3]
        })

        errors, warnings = validate_config(df)

        # Should have errors for missing required columns.
        assert len(errors) > 0
        assert any('not found' in error.lower() for error in errors)

    def test_validate_config_valid_data(self):
        """A structurally sound frame produces no critical errors."""
        df = pd.DataFrame({
            'InvoiceDate': pd.to_datetime(['2023-01-01', '2023-02-01']),
            'USD': [100.0, 200.0],
            'Year': [2023, 2023]
        })

        errors, warnings = validate_config(df)

        # Warnings about optional columns are acceptable; only "not found"
        # errors for the required columns count as critical. The original
        # filter relied on `A and B or C`, which parses as `(A and B) or C`
        # and wrongly flagged any error mentioning 'InvoiceDate'; the
        # parentheses below express the intended `A and (B or C)`.
        critical_errors = [
            e for e in errors
            if 'not found' in e.lower() and ('USD' in e or 'InvoiceDate' in e)
        ]
        assert len(critical_errors) == 0


if __name__ == "__main__":
    pytest.main([__file__, '-v'])
|
||||||
68
tests/test_data_loader.py
Normal file
68
tests/test_data_loader.py
Normal file
@@ -0,0 +1,68 @@
|
|||||||
|
"""
|
||||||
|
Integration tests for data_loader.py
|
||||||
|
"""
|
||||||
|
import pytest
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
from pathlib import Path
|
||||||
|
import sys
|
||||||
|
import tempfile
|
||||||
|
import os
|
||||||
|
|
||||||
|
# Add parent directory to path
|
||||||
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||||
|
|
||||||
|
from data_loader import load_sales_data, validate_data_structure
|
||||||
|
|
||||||
|
class TestDataLoader:
    """Data loading functions."""

    def test_load_sales_data_basic(self):
        """Loading a minimal CSV adds the derived Year/YearMonth columns."""
        # Create a throwaway CSV fixture on disk.
        with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False) as tmp:
            tmp.write('InvoiceDate,USD,Customer\n'
                      '2023-01-01,100.0,Customer1\n'
                      '2023-02-01,200.0,Customer2\n')
            temp_path = tmp.name

        try:
            # Point the config at the temporary file for the duration of the test.
            import config
            original_data_file = config.DATA_FILE
            config.DATA_FILE = Path(temp_path).name

            frame = load_sales_data(Path(temp_path))

            assert len(frame) == 2
            assert 'Year' in frame.columns
            assert 'YearMonth' in frame.columns

            # Restore the original configuration.
            config.DATA_FILE = original_data_file
        finally:
            os.unlink(temp_path)

    def test_validate_data_structure(self):
        """Structure validation accepts required columns, rejects missing ones."""
        good = pd.DataFrame({
            'InvoiceDate': pd.to_datetime(['2023-01-01', '2023-02-01']),
            'USD': [100.0, 200.0]
        })

        ok, message = validate_data_structure(good)
        assert ok
        assert message == "OK"

        # Missing the required revenue column.
        bad = pd.DataFrame({
            'InvoiceDate': pd.to_datetime(['2023-01-01'])
        })

        ok, message = validate_data_structure(bad)
        assert not ok
        assert 'Missing required column' in message


if __name__ == "__main__":
    pytest.main([__file__, '-v'])
|
||||||
95
validate_revenue.py
Normal file
95
validate_revenue.py
Normal file
@@ -0,0 +1,95 @@
|
|||||||
|
"""
|
||||||
|
Revenue validation utility
|
||||||
|
Validates that revenue calculations are consistent across analyses
|
||||||
|
"""
|
||||||
|
import pandas as pd
|
||||||
|
from config import (
|
||||||
|
REVENUE_COLUMN, ANALYSIS_YEARS, VALIDATION_ENABLED,
|
||||||
|
EXPECTED_REVENUE, REVENUE_TOLERANCE_PCT, LTM_ENABLED,
|
||||||
|
get_ltm_period
|
||||||
|
)
|
||||||
|
from analysis_utils import get_annual_data
|
||||||
|
|
||||||
|
def validate_revenue(dataframe: pd.DataFrame, analysis_name: str = "Analysis") -> None:
    """
    Print annual revenue summary for validation.

    This function helps ensure that:
    1. Data loading is working correctly
    2. Revenue calculations are consistent
    3. Filters are not accidentally excluding too much data

    When validation targets are configured, each year's total is also
    checked against EXPECTED_REVENUE within REVENUE_TOLERANCE_PCT.

    Args:
        dataframe: DataFrame with revenue and date columns (should have
            REVENUE_COLUMN and Year)
        analysis_name: Name of the analysis (for logging/display)

    Example:
        >>> validate_revenue(df, "Revenue Analysis")
        >>> # Prints annual revenue summary by year
    """
    import re

    from config import DATE_COLUMN

    frame = dataframe.copy()

    # Normalize the date column; unparseable entries become NaT.
    if DATE_COLUMN in frame.columns:
        frame[DATE_COLUMN] = pd.to_datetime(frame[DATE_COLUMN], errors='coerce', format='mixed')

    # Restrict to the configured analysis window.
    frame = frame[frame['Year'].isin(ANALYSIS_YEARS)]

    # Sum revenue per year; the final year may be a partial LTM period.
    ltm_start, ltm_end = get_ltm_period() if LTM_ENABLED else (None, None)
    revenue_by_label = {}
    years_present = frame['Year'].unique()
    for year in sorted(ANALYSIS_YEARS):
        if year not in years_present:
            continue
        year_data, year_label = get_annual_data(frame, year, ltm_start, ltm_end)
        if len(year_data) > 0:
            revenue_by_label[year_label] = year_data[REVENUE_COLUMN].sum()

    print(f"\n{'='*60}")
    print(f"Annual Revenue Validation - {analysis_name}")
    print(f"{'='*60}")

    if not revenue_by_label:
        print(" No revenue data found for analysis years")
        print(f"{'='*60}\n")
        return

    for year_label, amount in revenue_by_label.items():
        formatted = f"${amount / 1e6:.2f}m"
        print(f" {year_label}: {formatted}")

    # Optional check against configured expected totals.
    if VALIDATION_ENABLED and EXPECTED_REVENUE:
        print(f"\nValidation Check:")
        all_valid = True
        for year_label, actual_revenue in revenue_by_label.items():
            # Map the display label back to a numeric year, e.g.
            # "2025 (LTM 9/2025)" -> 2025.
            if isinstance(year_label, str):
                year_match = re.search(r'(\d{4})', str(year_label))
                year_key = int(year_match.group(1)) if year_match else None
            else:
                year_key = year_label

            if year_key not in EXPECTED_REVENUE:
                continue

            expected = EXPECTED_REVENUE[year_key]
            tolerance = expected * REVENUE_TOLERANCE_PCT
            diff = abs(actual_revenue - expected)

            if diff <= tolerance:
                print(f" ✓ {year_label}: Within tolerance ({diff/1e6:.2f}m difference)")
            else:
                print(f" ✗ {year_label}: Outside tolerance (expected ${expected/1e6:.2f}m, got ${actual_revenue/1e6:.2f}m, diff: ${diff/1e6:.2f}m)")
                all_valid = False

        if all_valid:
            print(" All validations passed!")
        else:
            print(" WARNING: Some validations failed. Check data loading and filters.")

    print(f"{'='*60}\n")
|
||||||
Reference in New Issue
Block a user