Files
sales-data-analysis/generate_sample_data.py
Jonathan Pressnell cf0b596449 Initial commit: sales analysis template
Co-authored-by: Cursor <cursoragent@cursor.com>
2026-02-06 09:16:34 -05:00

185 lines
6.1 KiB
Python

"""
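Sample data generator for testing and demonstrations.

Generates realistic sample sales data and writes it to a CSV file.

Usage:
    # Template data (column names and years are read from config.py):
    python generate_sample_data.py

    # Custom size: <num_customers> [num_products]
    python generate_sample_data.py 100 50

    # Or import and use programmatically:
    from generate_sample_data import generate_sample_sales_data
    df = generate_sample_sales_data(num_customers=100, num_products=50, years=[2021, 2022, 2023])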
"""
import calendar
import random
from datetime import datetime, timedelta
from pathlib import Path

import numpy as np
import pandas as pd
def generate_sample_sales_data(
    num_customers=100,
    num_products=50,
    years=None,
    transactions_per_month=500,
    output_file='sample_sales_data.csv'
):
    """
    Generate realistic sample sales data and save it as a CSV file.

    Months later than the current calendar month are skipped, so no
    transactions are generated for months that have not started yet.

    Args:
        num_customers: Number of unique customers
        num_products: Number of unique products
        years: Iterable of years to generate data for. When None (the
            default), 2021 through 2025 are used.
        transactions_per_month: Average transactions per month
        output_file: Output CSV filename

    Returns:
        DataFrame: Generated sales data with the columns InvoiceDate,
        Customer, Item, Quantity, USD, Year and Month. The frame is empty
        (but still has these columns) when no requested month is eligible.
    """
    # A list literal as a default argument is shared between calls; build a
    # fresh one per call instead.
    if years is None:
        years = [2021, 2022, 2023, 2024, 2025]

    print("Generating sample sales data...")
    print(f" Customers: {num_customers}")
    print(f" Products: {num_products}")
    print(f" Years: {years}")

    # Generate customer names
    customer_names = [f"Customer_{i:04d}" for i in range(1, num_customers + 1)]

    # Generate product names
    product_names = [f"Product_{i:04d}" for i in range(1, num_products + 1)]

    # Fixed column order, also used to give an empty result its schema
    columns = ['InvoiceDate', 'Customer', 'Item', 'Quantity', 'USD', 'Year', 'Month']

    # Read the clock once so every month is compared to the same instant
    current_date = datetime.now()

    # Generate transactions
    transactions = []
    for year in years:
        for month in range(1, 13):
            # Skip future months
            if (year, month) > (current_date.year, current_date.month):
                continue

            # Generate transactions for this month
            num_transactions = int(
                np.random.normal(transactions_per_month, transactions_per_month * 0.2)
            )
            num_transactions = max(10, num_transactions)  # At least 10 transactions

            # Real month length, so Feb 29 is reachable in leap years
            max_day = calendar.monthrange(year, month)[1]

            for _ in range(num_transactions):
                # Random date within month
                day = random.randint(1, max_day)
                invoice_date = datetime(year, month, day)

                # Random customer and product
                customer = random.choice(customer_names)
                product = random.choice(product_names)

                # Generate quantity (most transactions are small)
                quantity = int(np.random.lognormal(mean=1.5, sigma=1.0))
                quantity = max(1, min(quantity, 100))  # Cap at 100

                # Generate revenue (with some correlation to quantity)
                base_price = np.random.lognormal(mean=5, sigma=1.5)
                revenue = base_price * quantity

                # Add some variation
                revenue *= np.random.uniform(0.8, 1.2)
                revenue = round(revenue, 2)

                transactions.append({
                    'InvoiceDate': invoice_date,
                    'Customer': customer,
                    'Item': product,
                    'Quantity': quantity,
                    'USD': revenue,
                    'Year': year,
                    'Month': month
                })

    # Create DataFrame; explicit columns keep the schema when there are no
    # rows (otherwise sorting by 'InvoiceDate' raises KeyError).
    df = pd.DataFrame(transactions, columns=columns)
    df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])

    # Sort by date
    df = df.sort_values('InvoiceDate').reset_index(drop=True)

    # Add some missing dates (realistic data quality issue)
    missing_date_pct = 0.05  # 5% missing dates
    num_missing = int(len(df) * missing_date_pct)
    if num_missing:
        missing_indices = np.random.choice(df.index, size=num_missing, replace=False)
        df.loc[missing_indices, 'InvoiceDate'] = pd.NaT

    # Save to CSV
    output_path = Path(output_file)
    df.to_csv(output_path, index=False)

    print(f"\n✅ Sample data generated: {output_path}")
    print(f" Rows: {len(df):,}")
    print(f" Date range: {df['InvoiceDate'].min()} to {df['InvoiceDate'].max()}")
    print(f" Total revenue: ${df['USD'].sum() / 1e6:.2f}m")
    return df
def generate_sample_data_for_template():
    """
    Build a sample dataset whose column names follow config.py.

    Generates data for 200 customers and 100 products over ANALYSIS_YEARS
    (about 1000 transactions per month), renames the generator's default
    columns to the names configured in config.py, then rewrites
    sample_sales_data.csv with the renamed columns.

    Returns:
        DataFrame: Sample data using the configured column names
    """
    from config import (
        ANALYSIS_YEARS, CUSTOMER_COLUMN, DATE_COLUMN, ITEM_COLUMN,
        QUANTITY_COLUMN, REVENUE_COLUMN
    )

    print("Generating sample data for template...")

    csv_name = 'sample_sales_data.csv'
    sales = generate_sample_sales_data(
        num_customers=200,
        num_products=100,
        years=ANALYSIS_YEARS,
        transactions_per_month=1000,
        output_file=csv_name,
    )

    # (generator column, configured column) pairs, applied one at a time
    renames = (
        ('USD', REVENUE_COLUMN),
        ('InvoiceDate', DATE_COLUMN),
        ('Customer', CUSTOMER_COLUMN),
        ('Item', ITEM_COLUMN),
        ('Quantity', QUANTITY_COLUMN),
    )
    for default_name, configured_name in renames:
        # Nothing to do when the name already matches or the column is absent
        if default_name == configured_name or default_name not in sales.columns:
            continue
        sales = sales.rename(columns={default_name: configured_name})

    # Overwrite the generator's CSV so the file carries the configured names
    destination = Path(csv_name)
    sales.to_csv(destination, index=False)

    print(f"\n✅ Sample data saved to: {destination}")
    print(" Ready to use with sales_analysis_template")
    return sales
# ============================================================================
# MAIN
# ============================================================================
if __name__ == "__main__":
    # Script entry point.
    #   no arguments                      -> template data driven by config.py
    #   <num_customers> [num_products]    -> custom generation (products default to 50)
    import sys

    cli_args = sys.argv[1:]
    if not cli_args:
        # Generate for template
        generate_sample_data_for_template()
    else:
        # Custom generation
        num_customers = int(cli_args[0])
        num_products = int(cli_args[1]) if len(cli_args) > 1 else 50
        generate_sample_sales_data(
            num_customers=num_customers,
            num_products=num_products
        )