# sales-data-analysis/generate_sample_data.py

"""
Sample data generator for testing and demonstrations
Generates realistic sample sales data

Usage:
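    # Template-ready dataset (column names and years come from config.py):
    python generate_sample_data.py

    # Custom dataset with explicit sizes:
    python generate_sample_data.py <num_customers> [num_products]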

    # Or import and use programmatically:
    from generate_sample_data import generate_sample_sales_data
    df = generate_sample_sales_data(num_customers=100, num_products=50, years=[2021, 2022, 2023])
"""
import pandas as pd
import numpy as np
from pathlib import Path
from datetime import datetime, timedelta
import random

def _days_in_month(year, month):
    """Return the number of days in ``month`` of ``year`` (leap-year aware)."""
    if month == 12:
        # December always has 31 days; returning early also avoids having to
        # build a date in ``year + 1``.
        return 31
    # The day before the first of the next month is the last day of this one.
    return (datetime(year, month + 1, 1) - timedelta(days=1)).day


def generate_sample_sales_data(
    num_customers=100,
    num_products=50,
    years=None,
    transactions_per_month=500,
    output_file='sample_sales_data.csv'
):
    """
    Generate realistic sample sales data

    One row is produced per transaction. Months later than the current
    calendar month are skipped, so no future-dated rows are generated.
    After generation, 5% of the rows have their ``InvoiceDate`` blanked
    (set to NaT) to mimic a data quality issue; the ``Year`` and ``Month``
    columns of those rows are left intact.

    Randomness comes from both the ``random`` module and ``np.random``;
    seed both beforehand if reproducible output is needed.

    Args:
        num_customers: Number of unique customers
        num_products: Number of unique products
        years: List of years to generate data for
            (defaults to 2021 through 2025 when None)
        transactions_per_month: Average transactions per month
            (each month gets at least 10)
        output_file: Output CSV filename, or None to skip writing a file

    Returns:
        DataFrame: Generated sales data with columns InvoiceDate, Customer,
        Item, Quantity, USD, Year and Month. Rows are ordered by invoice
        date (ordering is applied before dates are blanked). The frame is
        empty, but still has these columns, when no requested month is in
        the past or present.
    """
    # Avoid a mutable default argument; build the default list per call.
    if years is None:
        years = [2021, 2022, 2023, 2024, 2025]

    print("Generating sample sales data...")
    print(f"  Customers: {num_customers}")
    print(f"  Products: {num_products}")
    print(f"  Years: {years}")

    # Generate customer names
    customer_names = [f"Customer_{i:04d}" for i in range(1, num_customers + 1)]

    # Generate product names
    product_names = [f"Product_{i:04d}" for i in range(1, num_products + 1)]

    # The "today" cut-off is the same for the whole run, so read it once.
    now = datetime.now()

    # Generate transactions
    transactions = []

    for year in years:
        for month in range(1, 13):
            # Skip future months
            if (year, month) > (now.year, now.month):
                continue

            # Generate transactions for this month
            num_transactions = int(np.random.normal(transactions_per_month, transactions_per_month * 0.2))
            num_transactions = max(10, num_transactions)  # At least 10 transactions

            # Month length is constant for every transaction in the month and
            # must honour leap years (Feb 29).
            max_day = _days_in_month(year, month)

            for _ in range(num_transactions):
                # Random date within month
                day = random.randint(1, max_day)
                invoice_date = datetime(year, month, day)

                # Random customer and product
                customer = random.choice(customer_names)
                product = random.choice(product_names)

                # Generate quantity (most transactions are small)
                quantity = int(np.random.lognormal(mean=1.5, sigma=1.0))
                quantity = max(1, min(quantity, 100))  # Cap at 100

                # Generate revenue (with some correlation to quantity)
                base_price = np.random.lognormal(mean=5, sigma=1.5)
                revenue = base_price * quantity

                # Add some variation
                revenue *= np.random.uniform(0.8, 1.2)
                revenue = round(revenue, 2)

                transactions.append({
                    'InvoiceDate': invoice_date,
                    'Customer': customer,
                    'Item': product,
                    'Quantity': quantity,
                    'USD': revenue,
                    'Year': year,
                    'Month': month
                })

    # Create DataFrame. Naming the columns explicitly keeps the schema (and
    # the sort below) valid even when no transactions were generated.
    columns = ['InvoiceDate', 'Customer', 'Item', 'Quantity', 'USD', 'Year', 'Month']
    df = pd.DataFrame(transactions, columns=columns)
    # No-op for populated frames; gives the empty frame a datetime column too.
    df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])

    # Sort by date
    df = df.sort_values('InvoiceDate').reset_index(drop=True)

    # Add some missing dates (realistic data quality issue)
    missing_date_pct = 0.05  # 5% missing dates
    num_missing = int(len(df) * missing_date_pct)
    if num_missing > 0:
        missing_indices = np.random.choice(df.index, size=num_missing, replace=False)
        df.loc[missing_indices, 'InvoiceDate'] = pd.NaT

    # Save to CSV (skipped when the caller only wants the DataFrame)
    if output_file is not None:
        output_path = Path(output_file)
        df.to_csv(output_path, index=False)
        print(f"\n✅ Sample data generated: {output_path}")
    else:
        print("\n✅ Sample data generated (not written to disk)")
    print(f"   Rows: {len(df):,}")
    print(f"   Date range: {df['InvoiceDate'].min()} to {df['InvoiceDate'].max()}")
    print(f"   Total revenue: ${df['USD'].sum() / 1e6:.2f}m")

    return df

def generate_sample_data_for_template():
    """
    Generate sample data matching template's expected structure

    Reads the column names and the list of years from config.py, generates
    a dataset with generate_sample_sales_data(), renames the generator's
    default columns to the configured names and writes the result to
    'sample_sales_data.csv'. The generator writes the same file first with
    its default column names; the final write here overwrites it.

    Returns:
        DataFrame: Generated sales data using the config.py column names
    """
    from config import (
        REVENUE_COLUMN, DATE_COLUMN, CUSTOMER_COLUMN, ITEM_COLUMN,
        QUANTITY_COLUMN, ANALYSIS_YEARS
    )

    print("Generating sample data for template...")

    df = generate_sample_sales_data(
        num_customers=200,
        num_products=100,
        years=ANALYSIS_YEARS,
        transactions_per_month=1000,
        output_file='sample_sales_data.csv'
    )

    # Generator column name -> name expected by config
    column_mapping = {
        'USD': REVENUE_COLUMN,
        'InvoiceDate': DATE_COLUMN,
        'Customer': CUSTOMER_COLUMN,
        'Item': ITEM_COLUMN,
        'Quantity': QUANTITY_COLUMN
    }

    # Only rename what actually differs, and do it in ONE rename call.
    # Renaming column by column can cascade: if a configured name equals
    # another generator column name, the later rename would also hit the
    # column that was just renamed and merge two distinct columns.
    renames = {
        old_name: new_name
        for old_name, new_name in column_mapping.items()
        if old_name in df.columns and old_name != new_name
    }
    if renames:
        df = df.rename(columns=renames)

    # Save
    output_path = Path('sample_sales_data.csv')
    df.to_csv(output_path, index=False)

    print(f"\n✅ Sample data saved to: {output_path}")
    print("   Ready to use with sales_analysis_template")

    return df

# ============================================================================
# MAIN
# ============================================================================

if __name__ == "__main__":
    # Script entry point: positional arguments request a custom dataset,
    # no arguments produce the template-ready dataset.
    import sys

    cli_args = sys.argv[1:]

    if not cli_args:
        # Generate for template
        generate_sample_data_for_template()
    else:
        # Custom generation: <num_customers> [num_products]
        customer_count = int(cli_args[0])
        product_count = int(cli_args[1]) if len(cli_args) > 1 else 50
        generate_sample_sales_data(
            num_customers=customer_count,
            num_products=product_count
        )