Initial commit: sales analysis template
Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
134
examples/annual_revenue_trend.py
Normal file
134
examples/annual_revenue_trend.py
Normal file
@@ -0,0 +1,134 @@
|
||||
"""
|
||||
Example: Annual Revenue Trend Analysis
|
||||
Simple example showing annual revenue with LTM support
|
||||
|
||||
This is a working example that demonstrates:
|
||||
- Loading data using data_loader
|
||||
- Calculating annual metrics with LTM
|
||||
- Creating a revenue trend chart
|
||||
- Following template best practices
|
||||
"""
|
||||
import pandas as pd
|
||||
import matplotlib.pyplot as plt
|
||||
from pathlib import Path
|
||||
|
||||
# Import utilities
|
||||
from data_loader import load_sales_data, validate_data_structure
|
||||
from validate_revenue import validate_revenue
|
||||
from analysis_utils import (
|
||||
get_ltm_period_config, calculate_annual_metrics,
|
||||
setup_revenue_chart, save_chart,
|
||||
format_currency, print_annual_summary, sort_mixed_years,
|
||||
apply_exclusion_filters
|
||||
)
|
||||
from config import (
|
||||
OUTPUT_DIR, ANALYSIS_YEARS, MAX_DATE,
|
||||
CHART_SIZES, ensure_directories, get_data_path, COMPANY_NAME,
|
||||
REVENUE_COLUMN, MIN_YEAR, DATE_COLUMN
|
||||
)
|
||||
|
||||
# ============================================================================
|
||||
# CONFIGURATION
|
||||
# ============================================================================
|
||||
|
||||
ANALYSIS_NAME = "Annual Revenue Trend"
|
||||
DESCRIPTION = "Simple annual revenue trend analysis with LTM support"
|
||||
|
||||
# ============================================================================
|
||||
# MAIN ANALYSIS FUNCTION
|
||||
# ============================================================================
|
||||
|
||||
def main():
    """Run the annual revenue trend analysis from load to validation.

    Steps: load the sales data, validate its structure, apply the configured
    exclusion filters and the MIN_YEAR / MAX_DATE bounds, aggregate revenue
    per year (with the LTM period when one is configured), print the summary,
    save the trend chart, and run the revenue validation.

    Returns:
        None. Returns early, after printing an error, if the data cannot be
        loaded or does not pass structure validation.
    """
    print(f"\n{'='*60}")
    print(f"{ANALYSIS_NAME}")
    print(f"{'='*60}\n")

    # --- Load -------------------------------------------------------------
    print("Loading data...")
    try:
        df = load_sales_data(get_data_path())
        print(f"Loaded {len(df):,} transactions")
    except Exception as e:
        # Top-level boundary of a script: report and stop.
        print(f"ERROR loading data: {e}")
        return

    # --- Validate ---------------------------------------------------------
    is_valid, msg = validate_data_structure(df)
    if not is_valid:
        print(f"ERROR: {msg}")
        return
    print("Data validation passed")

    # --- Filter -----------------------------------------------------------
    df = apply_exclusion_filters(df)
    df = df[df['Year'] >= MIN_YEAR]
    if DATE_COLUMN in df.columns:
        df = df[df[DATE_COLUMN] <= MAX_DATE]

    # --- LTM configuration ------------------------------------------------
    ltm_start, ltm_end = get_ltm_period_config()
    has_ltm = bool(ltm_start and ltm_end)
    if has_ltm:
        print(f"LTM period: {ltm_start} to {ltm_end}")

    # --- Annual metrics ---------------------------------------------------
    print("\nCalculating annual metrics...")

    def _revenue_for_period(period_rows):
        """Return the metric dict (total revenue) for one period's rows."""
        return {'Revenue': period_rows[REVENUE_COLUMN].sum()}

    yearly = calculate_annual_metrics(df, _revenue_for_period, ltm_start, ltm_end)
    print_annual_summary(yearly, 'Revenue', 'Revenue')

    # --- Chart ------------------------------------------------------------
    print("Generating chart...")
    ensure_directories()

    fig, ax = plt.subplots(figsize=CHART_SIZES['medium'])

    # Year labels can mix plain years with an LTM label, hence the helper.
    ordered = sort_mixed_years(yearly.reset_index(), 'Year')
    year_labels = ordered['Year'].tolist()
    revenue_millions = ordered['Revenue'].values / 1e6
    x_positions = range(len(year_labels))

    ax.plot(x_positions, revenue_millions, marker='o', linewidth=2,
            markersize=8, color='#2E86AB')
    ax.set_xticks(x_positions)
    ax.set_xticklabels(year_labels, rotation=45, ha='right')
    setup_revenue_chart(ax)

    title = f'Annual Revenue Trend - {COMPANY_NAME}'
    if has_ltm:
        from config import get_ltm_label
        ltm_label = get_ltm_label()
        if ltm_label:
            title += f'\n({ltm_label})'
    ax.set_title(title, fontsize=14, fontweight='bold')

    plt.tight_layout()
    save_chart(fig, 'annual_revenue_trend.png')
    plt.close()

    # --- Revenue validation -------------------------------------------------
    print("\nValidating revenue...")
    validate_revenue(df, ANALYSIS_NAME)

    print(f"\n{ANALYSIS_NAME} complete!")
    print(f"Chart saved to: {OUTPUT_DIR}")
|
||||
|
||||
# ============================================================================
|
||||
# RUN ANALYSIS
|
||||
# ============================================================================
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
218
examples/cohort_analysis.py
Normal file
218
examples/cohort_analysis.py
Normal file
@@ -0,0 +1,218 @@
|
||||
"""
|
||||
Example: Cohort Analysis
|
||||
Advanced example showing customer cohort retention analysis
|
||||
|
||||
This demonstrates:
|
||||
- Cohort-based analysis
|
||||
- Retention rate calculations
|
||||
- Revenue retention metrics
|
||||
- Advanced visualization
|
||||
"""
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
import seaborn as sns
|
||||
from pathlib import Path
|
||||
from operator import attrgetter
|
||||
|
||||
# Import utilities
|
||||
from data_loader import load_sales_data, validate_data_structure
|
||||
from validate_revenue import validate_revenue
|
||||
from analysis_utils import (
|
||||
get_ltm_period_config, apply_exclusion_filters,
|
||||
setup_revenue_chart, save_chart, format_currency
|
||||
)
|
||||
from config import (
|
||||
OUTPUT_DIR, MAX_DATE, CHART_SIZES, ensure_directories,
|
||||
get_data_path, COMPANY_NAME, REVENUE_COLUMN, CUSTOMER_COLUMN,
|
||||
DATE_COLUMN, MIN_YEAR
|
||||
)
|
||||
|
||||
# ============================================================================
|
||||
# CONFIGURATION
|
||||
# ============================================================================
|
||||
|
||||
ANALYSIS_NAME = "Cohort Analysis"
|
||||
DESCRIPTION = "Customer cohort retention and revenue analysis"
|
||||
|
||||
# ============================================================================
|
||||
# COHORT ANALYSIS FUNCTIONS
|
||||
# ============================================================================
|
||||
|
||||
def create_cohorts(df):
    """
    Tag every transaction with its customer's acquisition cohort.

    A customer's cohort is the calendar month of their earliest value in the
    configured date column.

    Args:
        df: DataFrame containing the configured customer and date columns.

    Returns:
        DataFrame: A merged copy of ``df`` with three added columns:
        'Cohort' (month of the customer's first purchase), 'Period' (month of
        the transaction) and 'CohortPeriod' (whole months between the two).
    """
    from config import CUSTOMER_COLUMN, DATE_COLUMN

    # Earliest purchase date per customer.
    first_seen = (
        df.groupby(CUSTOMER_COLUMN)[DATE_COLUMN]
        .min()
        .rename('FirstPurchaseDate')
        .reset_index()
    )
    first_seen['Cohort'] = first_seen['FirstPurchaseDate'].dt.to_period('M')

    # Attach the cohort month to each transaction of that customer.
    cohort_lookup = first_seen[[CUSTOMER_COLUMN, 'Cohort']]
    tagged = df.merge(cohort_lookup, on=CUSTOMER_COLUMN)

    # Subtracting two monthly periods yields an offset; its `n` is the month count.
    tagged['Period'] = tagged[DATE_COLUMN].dt.to_period('M')
    month_offsets = tagged['Period'] - tagged['Cohort']
    tagged['CohortPeriod'] = month_offsets.apply(lambda offset: offset.n)

    return tagged
|
||||
|
||||
def calculate_cohort_metrics(df_with_cohort):
    """
    Calculate customer and revenue retention for every cohort and period.

    Each (Cohort, CohortPeriod) pair is aggregated to a distinct-customer
    count and a revenue total, and both are then expressed as a percentage of
    the same cohort's period-0 values.

    Args:
        df_with_cohort: DataFrame with 'Cohort' and 'CohortPeriod' columns
            plus the configured customer and revenue columns.

    Returns:
        DataFrame: One row per cohort and period with the columns 'Cohort',
        'Period' (the CohortPeriod value, i.e. months since first purchase),
        'Customers', 'Revenue', 'Retention_Rate' and 'Revenue_Retention'
        (both in percent). Rows are ordered by cohort, then period. An input
        with no cohort rows produces an empty frame with these columns.
        'Revenue_Retention' is NaN for a cohort whose period-0 revenue is 0,
        because no meaningful percentage exists in that case.
    """
    from config import REVENUE_COLUMN, CUSTOMER_COLUMN

    # Distinct customers and total revenue per cohort and period.
    cohort_revenue = (
        df_with_cohort.groupby(['Cohort', 'CohortPeriod'])
        .agg({CUSTOMER_COLUMN: 'nunique', REVENUE_COLUMN: 'sum'})
        .reset_index()
    )
    cohort_revenue.columns = ['Cohort', 'Period', 'Customers', 'Revenue']

    # Period-0 values act as the 100% baseline of each cohort.
    baseline = cohort_revenue.loc[
        cohort_revenue['Period'] == 0, ['Cohort', 'Customers', 'Revenue']
    ].rename(columns={'Customers': '_BaseCustomers', 'Revenue': '_BaseRevenue'})

    # A left merge keeps the (cohort, period) row order of the aggregation.
    metrics = cohort_revenue.merge(baseline, on='Cohort', how='left')

    metrics['Retention_Rate'] = (
        metrics['Customers'] / metrics['_BaseCustomers']
    ) * 100
    # Avoid dividing by a zero baseline; NaN renders as a blank heatmap cell.
    base_revenue = metrics['_BaseRevenue'].replace(0, float('nan'))
    metrics['Revenue_Retention'] = metrics['Revenue'] / base_revenue * 100

    return metrics.drop(columns=['_BaseCustomers', '_BaseRevenue'])
|
||||
|
||||
# ============================================================================
|
||||
# MAIN ANALYSIS FUNCTION
|
||||
# ============================================================================
|
||||
|
||||
def main():
    """Run the cohort retention analysis from load to validation.

    Steps: load and validate the sales data, require the customer column,
    apply the exclusion and date filters, build cohorts and their retention
    metrics, print a short summary, save the two retention heatmaps, and run
    the revenue validation.

    Returns:
        None. Returns early, after printing an error, if loading fails, the
        structure is invalid, or the customer column is missing.
    """
    print(f"\n{'='*60}")
    print(f"{ANALYSIS_NAME}")
    print(f"{'='*60}\n")

    # --- Load -------------------------------------------------------------
    print("Loading data...")
    try:
        df = load_sales_data(get_data_path())
        print(f"Loaded {len(df):,} transactions")
    except Exception as e:
        # Top-level boundary of a script: report and stop.
        print(f"ERROR loading data: {e}")
        return

    # --- Validate ---------------------------------------------------------
    is_valid, msg = validate_data_structure(df)
    if not is_valid:
        print(f"ERROR: {msg}")
        return

    if CUSTOMER_COLUMN not in df.columns:
        print(f"ERROR: Customer column '{CUSTOMER_COLUMN}' not found")
        return

    # --- Filter -----------------------------------------------------------
    df = apply_exclusion_filters(df)
    df = df[df['Year'] >= MIN_YEAR]
    if DATE_COLUMN in df.columns:
        df = df[df[DATE_COLUMN] <= MAX_DATE]

    # --- Cohorts and metrics ----------------------------------------------
    print("\nCreating customer cohorts...")
    df_cohort = create_cohorts(df)

    print("Calculating cohort metrics...")
    cohort_metrics = calculate_cohort_metrics(df_cohort)

    # --- Summary ----------------------------------------------------------
    print("\nCohort Summary:")
    print("-" * 60)
    # Only the five earliest cohorts are listed.
    earliest_cohorts = sorted(cohort_metrics['Cohort'].unique())[:5]
    for cohort in earliest_cohorts:
        cohort_data = cohort_metrics[cohort_metrics['Cohort'] == cohort]
        period_0 = cohort_data[cohort_data['Period'] == 0]
        if period_0.empty:
            continue

        initial_customers = period_0['Customers'].values[0]
        initial_revenue = period_0['Revenue'].values[0]
        print(f"\n{cohort}:")
        print(f" Initial: {initial_customers:,} customers, {format_currency(initial_revenue)}")

        # Retention one year (12 periods) after the first purchase, if observed.
        period_12 = cohort_data[cohort_data['Period'] == 12]
        if not period_12.empty:
            retention = period_12['Retention_Rate'].values[0]
            revenue_ret = period_12['Revenue_Retention'].values[0]
            print(f" Period 12: {retention:.1f}% customer retention, {revenue_ret:.1f}% revenue retention")

    # --- Charts -----------------------------------------------------------
    print("\nGenerating charts...")
    ensure_directories()

    def _pivot(metric):
        """Cohort x period matrix of the given retention metric."""
        return cohort_metrics.pivot_table(
            index='Cohort', columns='Period', values=metric, aggfunc='mean'
        )

    pivot_retention = _pivot('Retention_Rate')

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=CHART_SIZES['wide'])

    # (axis, matrix, colormap, colorbar label, title) for each heatmap.
    heatmap_specs = (
        (ax1, pivot_retention, 'YlOrRd', 'Retention %',
         'Customer Retention by Cohort\n(Period 0 = 100%)'),
        (ax2, _pivot('Revenue_Retention'), 'YlGnBu', 'Revenue Retention %',
         'Revenue Retention by Cohort\n(Period 0 = 100%)'),
    )
    for axis, matrix, colormap, bar_label, heading in heatmap_specs:
        sns.heatmap(matrix, annot=True, fmt='.0f', cmap=colormap, ax=axis,
                    cbar_kws={'label': bar_label})
        axis.set_title(heading, fontsize=12, fontweight='bold')
        axis.set_xlabel('Months Since First Purchase')
        axis.set_ylabel('Cohort')

    plt.suptitle(f'Cohort Analysis - {COMPANY_NAME}', fontsize=14, fontweight='bold', y=1.02)
    plt.tight_layout()
    save_chart(fig, 'cohort_analysis.png')
    plt.close()

    # --- Revenue validation -------------------------------------------------
    print("\nValidating revenue...")
    validate_revenue(df, ANALYSIS_NAME)

    print(f"\n{ANALYSIS_NAME} complete!")
    print(f"Charts saved to: {OUTPUT_DIR}")
|
||||
|
||||
# ============================================================================
|
||||
# RUN ANALYSIS
|
||||
# ============================================================================
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
213
examples/customer_segmentation.py
Normal file
213
examples/customer_segmentation.py
Normal file
@@ -0,0 +1,213 @@
|
||||
"""
|
||||
Example: Customer Segmentation (RFM) Analysis
|
||||
Example showing customer segmentation using RFM methodology
|
||||
|
||||
This example demonstrates:
|
||||
- Customer-level aggregation
|
||||
- RFM segmentation (Recency, Frequency, Monetary)
|
||||
- Segment analysis and visualization
|
||||
"""
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
from pathlib import Path
|
||||
|
||||
# Import utilities
|
||||
from data_loader import load_sales_data, validate_data_structure
|
||||
from validate_revenue import validate_revenue
|
||||
from analysis_utils import (
|
||||
get_ltm_period_config, apply_exclusion_filters,
|
||||
setup_revenue_chart, save_chart, format_currency
|
||||
)
|
||||
from config import (
|
||||
OUTPUT_DIR, MAX_DATE, CHART_SIZES, ensure_directories,
|
||||
get_data_path, COMPANY_NAME, REVENUE_COLUMN, CUSTOMER_COLUMN,
|
||||
DATE_COLUMN, MIN_YEAR
|
||||
)
|
||||
|
||||
# ============================================================================
|
||||
# CONFIGURATION
|
||||
# ============================================================================
|
||||
|
||||
ANALYSIS_NAME = "Customer Segmentation (RFM)"
|
||||
DESCRIPTION = "Customer segmentation using RFM methodology"
|
||||
|
||||
# ============================================================================
|
||||
# RFM SEGMENTATION FUNCTIONS
|
||||
# ============================================================================
|
||||
|
||||
def calculate_rfm_scores(df, analysis_date=None):
    """
    Calculate RFM scores and a segment label for each customer.

    Per customer the function derives Recency (days between ``analysis_date``
    and the last purchase), Frequency (number of dated transactions) and
    Monetary (total revenue). Each dimension is scored 1-5 by quintile of its
    rank, with 5 the best: lowest recency, highest frequency, highest
    monetary value.

    Args:
        df: DataFrame with the configured customer, date and revenue columns.
        analysis_date: Reference date for the recency calculation. Defaults
            to the latest date present in ``df``.

    Returns:
        DataFrame with one row per customer and the columns: the customer
        column, 'LastPurchaseDate', 'Frequency', 'Monetary', 'Recency',
        'R_Score', 'F_Score', 'M_Score', 'RFM_Score' (sum of the three
        scores) and 'Segment'. With fewer than two customers quintiles are
        undefined, so every score is set to the neutral value 3.
    """
    if analysis_date is None:
        analysis_date = df[DATE_COLUMN].max()

    # Named aggregation keeps the output column names explicit instead of
    # relying on the positional order of a flattened MultiIndex.
    customer_metrics = df.groupby(CUSTOMER_COLUMN).agg(
        LastPurchaseDate=(DATE_COLUMN, 'max'),
        Frequency=(DATE_COLUMN, 'count'),
        Monetary=(REVENUE_COLUMN, 'sum'),
    ).reset_index()

    # Recency: days since the customer's last purchase.
    customer_metrics['Recency'] = (
        analysis_date - customer_metrics['LastPurchaseDate']
    ).dt.days

    # Labels run from the lowest rank to the highest, so recency is reversed:
    # a small number of days since the last purchase is the best outcome.
    customer_metrics['R_Score'] = _quintile_scores(
        customer_metrics['Recency'], [5, 4, 3, 2, 1]
    )
    customer_metrics['F_Score'] = _quintile_scores(
        customer_metrics['Frequency'], [1, 2, 3, 4, 5]
    )
    customer_metrics['M_Score'] = _quintile_scores(
        customer_metrics['Monetary'], [1, 2, 3, 4, 5]
    )

    customer_metrics['RFM_Score'] = (
        customer_metrics['R_Score']
        + customer_metrics['F_Score']
        + customer_metrics['M_Score']
    )

    # Segment rules, evaluated in order; the first matching rule wins.
    # NOTE(review): recent customers with low frequency (r >= 4, f <= 2) are
    # labelled 'At Risk'; many RFM schemes call that group new/promising.
    # Labels are kept as-is because downstream output depends on them.
    r = customer_metrics['R_Score']
    f = customer_metrics['F_Score']
    m = customer_metrics['M_Score']
    rules = [
        ((r >= 4) & (f >= 4) & (m >= 4), 'Champions'),
        ((r >= 3) & (f >= 3) & (m >= 4), 'Loyal Customers'),
        ((r >= 4) & (f <= 2), 'At Risk'),
        (r <= 2, 'Hibernating'),
        ((r >= 3) & (f >= 3) & (m <= 2), 'Potential Loyalists'),
    ]
    customer_metrics['Segment'] = np.select(
        [condition for condition, _ in rules],
        [label for _, label in rules],
        default='Need Attention',
    )

    return customer_metrics


def _quintile_scores(values, labels):
    """Score ``values`` 1-5 by quintile of rank; ``labels`` run low to high rank."""
    if len(values) < 2:
        # With zero or one customer every quantile edge coincides and qcut
        # cannot form five bins, so fall back to a neutral score.
        return pd.Series(3, index=values.index, dtype='int64')
    # Ranking with method='first' breaks ties, so the five bin edges stay distinct.
    ranked = values.rank(method='first')
    return pd.qcut(ranked, q=5, labels=labels, duplicates='drop').astype(int)
|
||||
|
||||
# ============================================================================
|
||||
# MAIN ANALYSIS FUNCTION
|
||||
# ============================================================================
|
||||
|
||||
def main():
    """Run the RFM customer segmentation analysis from load to validation.

    Steps: load and validate the sales data, require the customer column,
    apply the exclusion and date filters, score customers with
    ``calculate_rfm_scores``, print a per-segment summary, save the revenue
    and customer-count bar charts, and run the revenue validation.

    Returns:
        None. Returns early, after printing an error, if loading fails, the
        structure is invalid, or the customer column is missing.
    """
    print(f"\n{'='*60}")
    print(f"{ANALYSIS_NAME}")
    print(f"{'='*60}\n")

    # --- Load -------------------------------------------------------------
    print("Loading data...")
    try:
        df = load_sales_data(get_data_path())
        print(f"Loaded {len(df):,} transactions")
    except Exception as e:
        # Top-level boundary of a script: report and stop.
        print(f"ERROR loading data: {e}")
        return

    # --- Validate ---------------------------------------------------------
    is_valid, msg = validate_data_structure(df)
    if not is_valid:
        print(f"ERROR: {msg}")
        return

    if CUSTOMER_COLUMN not in df.columns:
        print(f"ERROR: Customer column '{CUSTOMER_COLUMN}' not found in data")
        return

    print("Data validation passed")

    # --- Filter -----------------------------------------------------------
    df = apply_exclusion_filters(df)
    df = df[df['Year'] >= MIN_YEAR]
    if DATE_COLUMN in df.columns:
        df = df[df[DATE_COLUMN] <= MAX_DATE]

    # --- RFM scoring ------------------------------------------------------
    print("\nCalculating RFM scores...")
    rfm_df = calculate_rfm_scores(df)

    # --- Segment summary --------------------------------------------------
    print("\nCustomer Segmentation Summary:")
    print("-" * 60)
    segment_summary = (
        rfm_df.groupby('Segment')
        .agg({CUSTOMER_COLUMN: 'count', 'Monetary': 'sum'})
        .reset_index()
    )
    segment_summary.columns = ['Segment', 'Customer Count', 'Total Revenue']
    segment_summary = segment_summary.sort_values('Total Revenue', ascending=False)

    customer_total = len(rfm_df)
    revenue_total = rfm_df['Monetary'].sum()
    for _, row in segment_summary.iterrows():
        pct_customers = (row['Customer Count'] / customer_total) * 100
        pct_revenue = (row['Total Revenue'] / revenue_total) * 100
        print(f"{row['Segment']:20s}: {row['Customer Count']:5d} customers ({pct_customers:5.1f}%), "
              f"{format_currency(row['Total Revenue'])} ({pct_revenue:5.1f}% of revenue)")

    # --- Charts -----------------------------------------------------------
    print("\nGenerating charts...")
    ensure_directories()

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=CHART_SIZES['wide'])

    # Ascending order puts the largest segment at the top of a horizontal bar chart.
    by_revenue = segment_summary.sort_values('Total Revenue', ascending=True)
    bar_positions = range(len(by_revenue))
    segment_names = by_revenue['Segment'].values

    # Left panel: revenue per segment, in millions.
    ax1.barh(bar_positions, by_revenue['Total Revenue'].values / 1e6, color='#2E86AB')
    ax1.set_yticks(bar_positions)
    ax1.set_yticklabels(segment_names)
    ax1.set_xlabel('Revenue (Millions USD)')
    ax1.set_title('Revenue by Customer Segment', fontsize=12, fontweight='bold')
    setup_revenue_chart(ax1)
    ax1.set_ylabel('')

    # Right panel: number of customers per segment.
    ax2.barh(bar_positions, by_revenue['Customer Count'].values, color='#A23B72')
    ax2.set_yticks(bar_positions)
    ax2.set_yticklabels(segment_names)
    ax2.set_xlabel('Number of Customers')
    ax2.set_title('Customer Count by Segment', fontsize=12, fontweight='bold')
    ax2.set_ylabel('')
    ax2.grid(True, alpha=0.3)

    plt.suptitle(f'Customer Segmentation Analysis - {COMPANY_NAME}',
                 fontsize=14, fontweight='bold', y=1.02)
    plt.tight_layout()
    save_chart(fig, 'customer_segmentation.png')
    plt.close()

    # --- Revenue validation -------------------------------------------------
    print("\nValidating revenue...")
    validate_revenue(df, ANALYSIS_NAME)

    print(f"\n{ANALYSIS_NAME} complete!")
    print(f"Charts saved to: {OUTPUT_DIR}")
|
||||
|
||||
# ============================================================================
|
||||
# RUN ANALYSIS
|
||||
# ============================================================================
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
203
examples/product_performance.py
Normal file
203
examples/product_performance.py
Normal file
@@ -0,0 +1,203 @@
|
||||
"""
|
||||
Example: Product Performance Analysis
|
||||
Example showing product mix and performance analysis
|
||||
|
||||
This example demonstrates:
|
||||
- Product-level aggregation
|
||||
- Product performance metrics
|
||||
- Product mix visualization
|
||||
"""
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
from pathlib import Path
|
||||
|
||||
# Import utilities
|
||||
from data_loader import load_sales_data, validate_data_structure
|
||||
from validate_revenue import validate_revenue
|
||||
from analysis_utils import (
|
||||
get_ltm_period_config, calculate_annual_metrics,
|
||||
apply_exclusion_filters, setup_revenue_chart, save_chart,
|
||||
format_currency, sort_mixed_years
|
||||
)
|
||||
from config import (
|
||||
OUTPUT_DIR, MAX_DATE, CHART_SIZES, ensure_directories,
|
||||
get_data_path, COMPANY_NAME, REVENUE_COLUMN, ITEM_COLUMN,
|
||||
DATE_COLUMN, MIN_YEAR, QUANTITY_COLUMN
|
||||
)
|
||||
|
||||
# ============================================================================
|
||||
# CONFIGURATION
|
||||
# ============================================================================
|
||||
|
||||
ANALYSIS_NAME = "Product Performance Analysis"
|
||||
DESCRIPTION = "Product mix and performance analysis"
|
||||
|
||||
# ============================================================================
|
||||
# MAIN ANALYSIS FUNCTION
|
||||
# ============================================================================
|
||||
|
||||
def main():
    """Run the product performance analysis from load to validation.

    Steps: load and validate the sales data, apply the exclusion and date
    filters, aggregate revenue, transaction count and quantity per product
    for the most recent period (the LTM window when configured and a
    'YearMonth' column exists, otherwise the latest year), print the top ten
    products, save the chart(s), and run the revenue validation.

    When the configured quantity column is absent, 'Quantity' falls back to
    the transaction count and the average price is revenue per transaction.
    When the item column is absent, all rows are treated as one product,
    'All Products'.

    Returns:
        None. Returns early, after printing an error, if the data cannot be
        loaded or does not pass structure validation.
    """
    print(f"\n{'='*60}")
    print(f"{ANALYSIS_NAME}")
    print(f"{'='*60}\n")

    # 1. Load data
    print("Loading data...")
    try:
        df = load_sales_data(get_data_path())
        print(f"Loaded {len(df):,} transactions")
    except Exception as e:
        # Top-level boundary of a script: report and stop.
        print(f"ERROR loading data: {e}")
        return

    # 2. Validate data structure
    is_valid, msg = validate_data_structure(df)
    if not is_valid:
        print(f"ERROR: {msg}")
        return

    if ITEM_COLUMN not in df.columns:
        print(f"WARNING: Item column '{ITEM_COLUMN}' not found. Using transaction-level analysis.")
        # Create a dummy item column so the per-product code still runs.
        df[ITEM_COLUMN] = 'All Products'

    print("Data validation passed")

    # 3. Apply exclusion filters
    df = apply_exclusion_filters(df)

    # 4. Filter by date range
    df = df[df['Year'] >= MIN_YEAR]
    if DATE_COLUMN in df.columns:
        df = df[df[DATE_COLUMN] <= MAX_DATE]

    # 5. Setup LTM period
    ltm_start, ltm_end = get_ltm_period_config()

    # 6. Product performance summary
    print("\nCalculating product performance...")

    # Most recent period: the LTM window if usable, else the latest year.
    if ltm_start and ltm_end and 'YearMonth' in df.columns:
        in_ltm = (df['YearMonth'] >= ltm_start) & (df['YearMonth'] <= ltm_end)
        recent_data = df[in_ltm]
        period_label = f"LTM {ltm_end}"
    else:
        recent_year = df['Year'].max()
        recent_data = df[df['Year'] == recent_year]
        period_label = str(recent_year)

    # Product-level metrics. The quantity column is aggregated only when it
    # exists; naming a missing column in the aggregation raises a KeyError.
    has_quantity = QUANTITY_COLUMN in df.columns
    by_product = recent_data.groupby(ITEM_COLUMN)
    product_metrics = by_product[REVENUE_COLUMN].agg(
        Revenue='sum', Transaction_Count='count'
    )
    if has_quantity:
        product_metrics['Quantity'] = by_product[QUANTITY_COLUMN].sum()
    else:
        product_metrics['Quantity'] = product_metrics['Transaction_Count']
    product_metrics = product_metrics.reset_index()

    # Average price per unit, or per transaction without quantity data.
    if has_quantity:
        # Zero quantities become NaN so the division does not yield inf.
        units = product_metrics['Quantity'].replace(0, np.nan)
        product_metrics['Avg_Price'] = product_metrics['Revenue'] / units
    else:
        product_metrics['Avg_Price'] = (
            product_metrics['Revenue'] / product_metrics['Transaction_Count']
        )

    product_metrics = product_metrics.sort_values('Revenue', ascending=False)

    # Top products summary
    print(f"\nTop 10 Products by Revenue ({period_label}):")
    print("-" * 80)
    top_10 = product_metrics.head(10)
    total_revenue = product_metrics['Revenue'].sum()

    for _, row in top_10.iterrows():
        pct = (row['Revenue'] / total_revenue) * 100
        # str() lets numeric item codes pass the ':30s' format spec.
        print(f"{str(row[ITEM_COLUMN]):30s}: {format_currency(row['Revenue']):>12s} ({pct:5.1f}%)")

    # 7. Annual product trends (if multiple years available)
    if len(df['Year'].unique()) > 1:
        print("\nCalculating annual product trends...")

        def calculate_product_metrics(year_data):
            """Return {product: revenue} for the year's five largest products."""
            product_revenue = year_data.groupby(ITEM_COLUMN)[REVENUE_COLUMN].sum()
            return dict(product_revenue.nlargest(5))

        # NOTE(review): this result is not used by the charts below; the call
        # is kept in case calculate_annual_metrics has side effects.
        annual_product_df = calculate_annual_metrics(
            df, calculate_product_metrics, ltm_start, ltm_end
        )

        # 8. Create visualizations
        print("\nGenerating charts...")
        ensure_directories()

        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=CHART_SIZES['wide'])

        # Chart 1: top products revenue (bar chart)
        _draw_top_products_bars(
            ax1, top_10, 30,
            f'Top 10 Products by Revenue\n({period_label})', 12,
        )

        # Chart 2: revenue distribution (pie chart), rest grouped as 'Other'
        if len(product_metrics) > 10:
            other_revenue = product_metrics.iloc[10:]['Revenue'].sum()
            pie_data = list(top_10['Revenue'].values) + [other_revenue]
            pie_labels = list(top_10[ITEM_COLUMN].values) + ['Other']
        else:
            pie_data = product_metrics['Revenue'].values
            pie_labels = product_metrics[ITEM_COLUMN].values

        pie_data_millions = [x / 1e6 for x in pie_data]
        ax2.pie(pie_data_millions, labels=pie_labels, autopct='%1.1f%%', startangle=90)
        ax2.set_title('Revenue Distribution\n(Top Products)', fontsize=12, fontweight='bold')

        plt.suptitle(f'Product Performance Analysis - {COMPANY_NAME}',
                     fontsize=14, fontweight='bold', y=1.02)
        plt.tight_layout()
        save_chart(fig, 'product_performance.png')
        plt.close()
    else:
        # Single chart if only one year
        print("\nGenerating chart...")
        ensure_directories()

        fig, ax = plt.subplots(figsize=CHART_SIZES['medium'])
        _draw_top_products_bars(
            ax, top_10, 40,
            f'Top 10 Products by Revenue - {COMPANY_NAME}\n({period_label})', 14,
        )

        plt.tight_layout()
        save_chart(fig, 'product_performance.png')
        plt.close()

    # 9. Validate revenue
    print("\nValidating revenue...")
    validate_revenue(df, ANALYSIS_NAME)

    print(f"\n{ANALYSIS_NAME} complete!")
    print(f"Charts saved to: {OUTPUT_DIR}")


def _draw_top_products_bars(ax, top_products, max_label_len, title, title_fontsize):
    """Draw a horizontal bar chart of product revenue (in millions) on ``ax``."""
    labels = []
    for raw_name in top_products[ITEM_COLUMN].values:
        name = str(raw_name)  # item identifiers are not guaranteed to be text
        if len(name) > max_label_len:
            name = name[:max_label_len] + '...'
        labels.append(name)

    positions = range(len(top_products))
    ax.barh(positions, top_products['Revenue'].values / 1e6, color='#2E86AB')
    ax.set_yticks(positions)
    ax.set_yticklabels(labels)
    ax.set_xlabel('Revenue (Millions USD)')
    ax.set_title(title, fontsize=title_fontsize, fontweight='bold')
    setup_revenue_chart(ax)
    ax.set_ylabel('')
|
||||
|
||||
# ============================================================================
|
||||
# RUN ANALYSIS
|
||||
# ============================================================================
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user