185 lines
6.1 KiB
Python
185 lines
6.1 KiB
Python
"""
|
|
Sample data generator for testing and demonstrations.

Generates realistic sample sales data.

Usage:
    python generate_sample_data.py

    # Or import and use programmatically:
    from generate_sample_data import generate_sample_sales_data
    df = generate_sample_sales_data(num_customers=100, num_products=50, years=[2021, 2022, 2023])
|
|
"""
|
|
import pandas as pd
|
|
import numpy as np
|
|
from pathlib import Path
|
|
from datetime import datetime, timedelta
|
|
import random
|
|
|
|
def generate_sample_sales_data(
    num_customers=100,
    num_products=50,
    years=None,
    transactions_per_month=500,
    output_file='sample_sales_data.csv',
    seed=None,
):
    """
    Generate realistic sample sales data

    One row is produced per transaction. Months later than the current
    month (per ``datetime.now()``) are skipped. After generation, 5% of the
    rows get their ``InvoiceDate`` blanked to ``NaT`` to imitate a data
    quality issue. The result is written to ``output_file`` as CSV and a
    short summary is printed.

    Args:
        num_customers: Number of unique customers
        num_products: Number of unique products
        years: Iterable of years to generate data for
            (default: 2021 through 2025)
        transactions_per_month: Average transactions per month; the actual
            count is drawn from a normal distribution with a standard
            deviation of 20% of this value, floored at 10
        output_file: Output CSV filename
        seed: Optional integer seed. When given, private seeded generators
            are used so the output is reproducible; when None, the global
            ``random`` / ``np.random`` state is used.

    Returns:
        DataFrame: Generated sales data with columns InvoiceDate, Customer,
        Item, Quantity, USD, Year and Month, sorted by InvoiceDate. The
        frame is empty (but has those columns) when no month qualifies.
    """
    import calendar  # local import: only needed here for month lengths

    # Avoid a shared mutable default; None stands for the standard range.
    if years is None:
        years = [2021, 2022, 2023, 2024, 2025]

    # random.Random and np.random.RandomState expose the same methods as the
    # module-level functions, so either can be used interchangeably below.
    py_rng = random if seed is None else random.Random(seed)
    np_rng = np.random if seed is None else np.random.RandomState(seed)

    print(f"Generating sample sales data...")
    print(f" Customers: {num_customers}")
    print(f" Products: {num_products}")
    print(f" Years: {years}")

    # Generate customer names
    customer_names = [f"Customer_{i:04d}" for i in range(1, num_customers + 1)]

    # Generate product names
    product_names = [f"Product_{i:04d}" for i in range(1, num_products + 1)]

    # Generate transactions
    transactions = []
    current_date = datetime.now()  # loop-invariant "today" for the future check

    for year in years:
        for month in range(1, 13):
            # Skip future months
            if year > current_date.year or (
                year == current_date.year and month > current_date.month
            ):
                continue

            # Real month length, so leap-year February includes the 29th
            max_day = calendar.monthrange(year, month)[1]

            # Generate transactions for this month
            num_transactions = int(
                np_rng.normal(transactions_per_month, transactions_per_month * 0.2)
            )
            num_transactions = max(10, num_transactions)  # At least 10 transactions

            for _ in range(num_transactions):
                # Random date within month
                day = py_rng.randint(1, max_day)
                invoice_date = datetime(year, month, day)

                # Random customer and product
                customer = py_rng.choice(customer_names)
                product = py_rng.choice(product_names)

                # Generate quantity (most transactions are small)
                quantity = int(np_rng.lognormal(mean=1.5, sigma=1.0))
                quantity = max(1, min(quantity, 100))  # Clamp to 1..100

                # Generate revenue (with some correlation to quantity)
                base_price = np_rng.lognormal(mean=5, sigma=1.5)
                revenue = base_price * quantity

                # Add some variation
                revenue *= np_rng.uniform(0.8, 1.2)
                revenue = round(revenue, 2)

                transactions.append({
                    'InvoiceDate': invoice_date,
                    'Customer': customer,
                    'Item': product,
                    'Quantity': quantity,
                    'USD': revenue,
                    'Year': year,
                    'Month': month,
                })

    # Create DataFrame; explicit columns keep the schema even with zero rows
    columns = ['InvoiceDate', 'Customer', 'Item', 'Quantity', 'USD', 'Year', 'Month']
    df = pd.DataFrame(transactions, columns=columns)
    if df.empty:
        # Give the empty frame usable dtypes so the summary below still works
        df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
        df['USD'] = df['USD'].astype(float)

    # Sort by date
    df = df.sort_values('InvoiceDate').reset_index(drop=True)

    # Add some missing dates (realistic data quality issue)
    missing_date_pct = 0.05  # 5% missing dates
    num_missing = int(len(df) * missing_date_pct)
    if num_missing > 0:
        missing_indices = np_rng.choice(df.index, size=num_missing, replace=False)
        df.loc[missing_indices, 'InvoiceDate'] = pd.NaT

    # Save to CSV
    output_path = Path(output_file)
    df.to_csv(output_path, index=False)
    print(f"\n✅ Sample data generated: {output_path}")
    print(f" Rows: {len(df):,}")
    print(f" Date range: {df['InvoiceDate'].min()} to {df['InvoiceDate'].max()}")
    print(f" Total revenue: ${df['USD'].sum() / 1e6:.2f}m")

    return df
|
|
|
|
def generate_sample_data_for_template():
    """
    Generate sample data matching template's expected structure

    Uses config.py column names: the generated columns USD, InvoiceDate,
    Customer, Item and Quantity are renamed to the names configured there
    (Year and Month are left as generated). The renamed frame is written to
    'sample_sales_data.csv'.

    Returns:
        DataFrame: Generated sales data with config-driven column names
    """
    # Imported at call time, so importing this module does not require config.py
    from config import (
        REVENUE_COLUMN, DATE_COLUMN, CUSTOMER_COLUMN, ITEM_COLUMN,
        QUANTITY_COLUMN, ANALYSIS_YEARS
    )

    print("Generating sample data for template...")

    output_file = 'sample_sales_data.csv'

    # The generator writes the CSV itself; it is overwritten below once the
    # columns carry their configured names.
    df = generate_sample_sales_data(
        num_customers=200,
        num_products=100,
        years=ANALYSIS_YEARS,
        transactions_per_month=1000,
        output_file=output_file
    )

    # Rename columns to match config (if different)
    column_mapping = {
        'USD': REVENUE_COLUMN,
        'InvoiceDate': DATE_COLUMN,
        'Customer': CUSTOMER_COLUMN,
        'Item': ITEM_COLUMN,
        'Quantity': QUANTITY_COLUMN
    }

    # Apply every rename in one call: renaming one column at a time would
    # chain (or collide) when a configured name equals another source name.
    renames = {
        old_name: new_name
        for old_name, new_name in column_mapping.items()
        if old_name in df.columns and old_name != new_name
    }
    if renames:
        df = df.rename(columns=renames)

    # Save
    output_path = Path(output_file)
    df.to_csv(output_path, index=False)

    print(f"\n✅ Sample data saved to: {output_path}")
    print(f" Ready to use with sales_analysis_template")

    return df
|
|
|
|
# ============================================================================
|
|
# MAIN
|
|
# ============================================================================
|
|
|
|
if __name__ == "__main__":
    # Generate sample data.
    #
    # With command-line arguments: <num_customers> [<num_products>] drive a
    # custom generation run. Without arguments: generate data for the template.
    import sys

    cli_args = sys.argv[1:]

    if not cli_args:
        # Generate for template
        generate_sample_data_for_template()
    else:
        # Custom generation
        customer_count = int(cli_args[0])
        product_count = int(cli_args[1]) if len(cli_args) > 1 else 50
        generate_sample_sales_data(
            num_customers=customer_count,
            num_products=product_count
        )
|