""" Sample data generator for testing and demonstrations Generates realistic sample sales data Usage: python generate_sample_data.py # Or import and use programmatically: from generate_sample_data import generate_sample_sales_data df = generate_sample_sales_data(num_customers=100, num_products=50, years=[2021, 2022, 2023]) """ import pandas as pd import numpy as np from pathlib import Path from datetime import datetime, timedelta import random def generate_sample_sales_data( num_customers=100, num_products=50, years=[2021, 2022, 2023, 2024, 2025], transactions_per_month=500, output_file='sample_sales_data.csv' ): """ Generate realistic sample sales data Args: num_customers: Number of unique customers num_products: Number of unique products years: List of years to generate data for transactions_per_month: Average transactions per month output_file: Output CSV filename Returns: DataFrame: Generated sales data """ print(f"Generating sample sales data...") print(f" Customers: {num_customers}") print(f" Products: {num_products}") print(f" Years: {years}") # Generate customer names customer_names = [f"Customer_{i:04d}" for i in range(1, num_customers + 1)] # Generate product names product_names = [f"Product_{i:04d}" for i in range(1, num_products + 1)] # Generate transactions transactions = [] for year in years: for month in range(1, 13): # Skip future months current_date = datetime.now() if year > current_date.year or (year == current_date.year and month > current_date.month): continue # Generate transactions for this month num_transactions = int(np.random.normal(transactions_per_month, transactions_per_month * 0.2)) num_transactions = max(10, num_transactions) # At least 10 transactions for _ in range(num_transactions): # Random date within month if month == 2: max_day = 28 elif month in [4, 6, 9, 11]: max_day = 30 else: max_day = 31 day = random.randint(1, max_day) invoice_date = datetime(year, month, day) # Random customer and product customer = random.choice(customer_names) product = random.choice(product_names) # Generate quantity (most transactions are small) quantity = int(np.random.lognormal(mean=1.5, sigma=1.0)) quantity = max(1, min(quantity, 100)) # Cap at 100 # Generate revenue (with some correlation to quantity) base_price = np.random.lognormal(mean=5, sigma=1.5) revenue = base_price * quantity # Add some variation revenue *= np.random.uniform(0.8, 1.2) revenue = round(revenue, 2) transactions.append({ 'InvoiceDate': invoice_date, 'Customer': customer, 'Item': product, 'Quantity': quantity, 'USD': revenue, 'Year': year, 'Month': month }) # Create DataFrame df = pd.DataFrame(transactions) # Sort by date df = df.sort_values('InvoiceDate').reset_index(drop=True) # Add some missing dates (realistic data quality issue) missing_date_pct = 0.05 # 5% missing dates num_missing = int(len(df) * missing_date_pct) missing_indices = np.random.choice(df.index, size=num_missing, replace=False) df.loc[missing_indices, 'InvoiceDate'] = pd.NaT # Save to CSV output_path = Path(output_file) df.to_csv(output_path, index=False) print(f"\n✅ Sample data generated: {output_path}") print(f" Rows: {len(df):,}") print(f" Date range: {df['InvoiceDate'].min()} to {df['InvoiceDate'].max()}") print(f" Total revenue: ${df['USD'].sum() / 1e6:.2f}m") return df def generate_sample_data_for_template(): """ Generate sample data matching template's expected structure Uses config.py column names """ from config import ( REVENUE_COLUMN, DATE_COLUMN, CUSTOMER_COLUMN, ITEM_COLUMN, QUANTITY_COLUMN, ANALYSIS_YEARS ) print("Generating sample data for template...") df = generate_sample_sales_data( num_customers=200, num_products=100, years=ANALYSIS_YEARS, transactions_per_month=1000, output_file='sample_sales_data.csv' ) # Rename columns to match config (if different) column_mapping = { 'USD': REVENUE_COLUMN, 'InvoiceDate': DATE_COLUMN, 'Customer': CUSTOMER_COLUMN, 'Item': ITEM_COLUMN, 'Quantity': QUANTITY_COLUMN } # Only rename if different for old_name, new_name in column_mapping.items(): if old_name in df.columns and old_name != new_name: df = df.rename(columns={old_name: new_name}) # Save output_path = Path('sample_sales_data.csv') df.to_csv(output_path, index=False) print(f"\n✅ Sample data saved to: {output_path}") print(f" Ready to use with sales_analysis_template") return df # ============================================================================ # MAIN # ============================================================================ if __name__ == "__main__": """Generate sample data""" import sys if len(sys.argv) > 1: # Custom generation num_customers = int(sys.argv[1]) if len(sys.argv) > 1 else 100 num_products = int(sys.argv[2]) if len(sys.argv) > 2 else 50 generate_sample_sales_data( num_customers=num_customers, num_products=num_products ) else: # Generate for template generate_sample_data_for_template()