""" Example: Customer Segmentation (RFM) Analysis Example showing customer segmentation using RFM methodology This example demonstrates: - Customer-level aggregation - RFM segmentation (Recency, Frequency, Monetary) - Segment analysis and visualization """ import pandas as pd import numpy as np import matplotlib.pyplot as plt from pathlib import Path # Import utilities from data_loader import load_sales_data, validate_data_structure from validate_revenue import validate_revenue from analysis_utils import ( get_ltm_period_config, apply_exclusion_filters, setup_revenue_chart, save_chart, format_currency ) from config import ( OUTPUT_DIR, MAX_DATE, CHART_SIZES, ensure_directories, get_data_path, COMPANY_NAME, REVENUE_COLUMN, CUSTOMER_COLUMN, DATE_COLUMN, MIN_YEAR ) # ============================================================================ # CONFIGURATION # ============================================================================ ANALYSIS_NAME = "Customer Segmentation (RFM)" DESCRIPTION = "Customer segmentation using RFM methodology" # ============================================================================ # RFM SEGMENTATION FUNCTIONS # ============================================================================ def calculate_rfm_scores(df, analysis_date=None): """ Calculate RFM scores for each customer Args: df: DataFrame with customer, date, and revenue columns analysis_date: Reference date for recency calculation (defaults to max date) Returns: DataFrame with RFM scores and segment assignment """ if analysis_date is None: analysis_date = df[DATE_COLUMN].max() # Calculate customer-level metrics customer_metrics = df.groupby(CUSTOMER_COLUMN).agg({ DATE_COLUMN: ['max', 'count'], REVENUE_COLUMN: 'sum' }).reset_index() customer_metrics.columns = [CUSTOMER_COLUMN, 'LastPurchaseDate', 'Frequency', 'Monetary'] # Calculate Recency (days since last purchase) customer_metrics['Recency'] = (analysis_date - customer_metrics['LastPurchaseDate']).dt.days # Score each dimension (1-5 scale, 5 = best) customer_metrics['R_Score'] = pd.qcut( customer_metrics['Recency'].rank(method='first'), q=5, labels=[5, 4, 3, 2, 1], duplicates='drop' ).astype(int) customer_metrics['F_Score'] = pd.qcut( customer_metrics['Frequency'].rank(method='first'), q=5, labels=[1, 2, 3, 4, 5], duplicates='drop' ).astype(int) customer_metrics['M_Score'] = pd.qcut( customer_metrics['Monetary'].rank(method='first'), q=5, labels=[1, 2, 3, 4, 5], duplicates='drop' ).astype(int) # Calculate RFM score (sum of R, F, M) customer_metrics['RFM_Score'] = ( customer_metrics['R_Score'] + customer_metrics['F_Score'] + customer_metrics['M_Score'] ) # Assign segments def assign_segment(row): r, f, m = row['R_Score'], row['F_Score'], row['M_Score'] if r >= 4 and f >= 4 and m >= 4: return 'Champions' elif r >= 3 and f >= 3 and m >= 4: return 'Loyal Customers' elif r >= 4 and f <= 2: return 'At Risk' elif r <= 2: return 'Hibernating' elif r >= 3 and f >= 3 and m <= 2: return 'Potential Loyalists' else: return 'Need Attention' customer_metrics['Segment'] = customer_metrics.apply(assign_segment, axis=1) return customer_metrics # ============================================================================ # MAIN ANALYSIS FUNCTION # ============================================================================ def main(): """Main analysis function""" print(f"\n{'='*60}") print(f"{ANALYSIS_NAME}") print(f"{'='*60}\n") # 1. Load data print("Loading data...") try: df = load_sales_data(get_data_path()) print(f"Loaded {len(df):,} transactions") except Exception as e: print(f"ERROR loading data: {e}") return # 2. Validate data structure is_valid, msg = validate_data_structure(df) if not is_valid: print(f"ERROR: {msg}") return if CUSTOMER_COLUMN not in df.columns: print(f"ERROR: Customer column '{CUSTOMER_COLUMN}' not found in data") return print("Data validation passed") # 3. Apply exclusion filters df = apply_exclusion_filters(df) # 4. Filter by date range df = df[df['Year'] >= MIN_YEAR] if DATE_COLUMN in df.columns: df = df[df[DATE_COLUMN] <= MAX_DATE] # 5. Calculate RFM scores print("\nCalculating RFM scores...") rfm_df = calculate_rfm_scores(df) # 6. Segment summary print("\nCustomer Segmentation Summary:") print("-" * 60) segment_summary = rfm_df.groupby('Segment').agg({ CUSTOMER_COLUMN: 'count', 'Monetary': 'sum' }).reset_index() segment_summary.columns = ['Segment', 'Customer Count', 'Total Revenue'] segment_summary = segment_summary.sort_values('Total Revenue', ascending=False) for _, row in segment_summary.iterrows(): pct_customers = (row['Customer Count'] / len(rfm_df)) * 100 pct_revenue = (row['Total Revenue'] / rfm_df['Monetary'].sum()) * 100 print(f"{row['Segment']:20s}: {row['Customer Count']:5d} customers ({pct_customers:5.1f}%), " f"{format_currency(row['Total Revenue'])} ({pct_revenue:5.1f}% of revenue)") # 7. Create visualizations print("\nGenerating charts...") ensure_directories() # Chart 1: Revenue by Segment fig, (ax1, ax2) = plt.subplots(1, 2, figsize=CHART_SIZES['wide']) segment_summary_sorted = segment_summary.sort_values('Total Revenue', ascending=True) revenue_millions = segment_summary_sorted['Total Revenue'].values / 1e6 ax1.barh(range(len(segment_summary_sorted)), revenue_millions, color='#2E86AB') ax1.set_yticks(range(len(segment_summary_sorted))) ax1.set_yticklabels(segment_summary_sorted['Segment'].values) ax1.set_xlabel('Revenue (Millions USD)') ax1.set_title('Revenue by Customer Segment', fontsize=12, fontweight='bold') setup_revenue_chart(ax1) ax1.set_ylabel('') # Chart 2: Customer Count by Segment customer_counts = segment_summary_sorted['Customer Count'].values ax2.barh(range(len(segment_summary_sorted)), customer_counts, color='#A23B72') ax2.set_yticks(range(len(segment_summary_sorted))) ax2.set_yticklabels(segment_summary_sorted['Segment'].values) ax2.set_xlabel('Number of Customers') ax2.set_title('Customer Count by Segment', fontsize=12, fontweight='bold') ax2.set_ylabel('') ax2.grid(True, alpha=0.3) plt.suptitle(f'Customer Segmentation Analysis - {COMPANY_NAME}', fontsize=14, fontweight='bold', y=1.02) plt.tight_layout() save_chart(fig, 'customer_segmentation.png') plt.close() # 8. Validate revenue print("\nValidating revenue...") validate_revenue(df, ANALYSIS_NAME) print(f"\n{ANALYSIS_NAME} complete!") print(f"Charts saved to: {OUTPUT_DIR}") # ============================================================================ # RUN ANALYSIS # ============================================================================ if __name__ == "__main__": main()