Initial commit: sales analysis template
Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
184
generate_sample_data.py
Normal file
184
generate_sample_data.py
Normal file
@@ -0,0 +1,184 @@
|
||||
"""
|
||||
Sample data generator for testing and demonstrations
|
||||
Generates realistic sample sales data
|
||||
|
||||
Usage:
|
||||
python generate_sample_data.py
|
||||
|
||||
# Or import and use programmatically:
|
||||
from generate_sample_data import generate_sample_sales_data
|
||||
df = generate_sample_sales_data(num_customers=100, num_products=50, years=[2021, 2022, 2023])
|
||||
"""
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from pathlib import Path
|
||||
from datetime import datetime, timedelta
|
||||
import random
|
||||
|
||||
def generate_sample_sales_data(
    num_customers=100,
    num_products=50,
    years=None,
    transactions_per_month=500,
    output_file='sample_sales_data.csv',
    seed=None,
):
    """
    Generate realistic sample sales data and write it to a CSV file.

    One row is produced per transaction with the columns ``InvoiceDate``,
    ``Customer``, ``Item``, ``Quantity``, ``USD``, ``Year`` and ``Month``.
    Months (and, within the current month, days) that lie in the future are
    skipped. After generation, 5% of the ``InvoiceDate`` values are blanked
    out to mimic a typical data-quality problem.

    Args:
        num_customers: Number of unique customers
        num_products: Number of unique products
        years: Iterable of years to generate data for. Defaults to
            2021 through 2025 when omitted.
        transactions_per_month: Average transactions per month (the actual
            count is drawn from a normal distribution with a standard
            deviation of 20% of this value, floored at 10)
        output_file: Output CSV filename; missing parent directories are
            created
        seed: Optional integer used to seed both the ``random`` module and
            ``numpy.random`` so that repeated calls produce identical data.
            When ``None`` the global random state is left untouched.

    Returns:
        DataFrame: Generated sales data, sorted by invoice date
    """
    # Local import keeps this function self-contained; the module header
    # does not import ``calendar``.
    import calendar

    # Resolve the default here rather than in the signature so no mutable
    # list object is shared between calls.
    if years is None:
        years = [2021, 2022, 2023, 2024, 2025]

    # Both RNG families are used below, so both must be seeded.
    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)

    print("Generating sample sales data...")
    print(f" Customers: {num_customers}")
    print(f" Products: {num_products}")
    print(f" Years: {years}")

    # Generate customer names
    customer_names = [f"Customer_{i:04d}" for i in range(1, num_customers + 1)]

    # Generate product names
    product_names = [f"Product_{i:04d}" for i in range(1, num_products + 1)]

    columns = ['InvoiceDate', 'Customer', 'Item', 'Quantity', 'USD', 'Year', 'Month']

    # Generate transactions
    transactions = []

    # Take "now" once so every month is compared against the same instant.
    current_date = datetime.now()
    current_period = (current_date.year, current_date.month)

    for year in years:
        for month in range(1, 13):
            # Skip future months
            if (year, month) > current_period:
                continue

            # Real month length (handles leap-year Februaries).
            max_day = calendar.monthrange(year, month)[1]
            # In the current month, do not generate invoices dated after today.
            if (year, month) == current_period:
                max_day = min(max_day, current_date.day)

            # Generate transactions for this month
            num_transactions = int(
                np.random.normal(transactions_per_month, transactions_per_month * 0.2)
            )
            num_transactions = max(10, num_transactions)  # At least 10 transactions

            for _ in range(num_transactions):
                # Random date within month
                day = random.randint(1, max_day)
                invoice_date = datetime(year, month, day)

                # Random customer and product
                customer = random.choice(customer_names)
                product = random.choice(product_names)

                # Generate quantity (most transactions are small)
                quantity = int(np.random.lognormal(mean=1.5, sigma=1.0))
                quantity = max(1, min(quantity, 100))  # Cap at 100

                # Generate revenue (with some correlation to quantity)
                base_price = np.random.lognormal(mean=5, sigma=1.5)
                revenue = base_price * quantity

                # Add some variation
                revenue *= np.random.uniform(0.8, 1.2)
                revenue = round(revenue, 2)

                transactions.append({
                    'InvoiceDate': invoice_date,
                    'Customer': customer,
                    'Item': product,
                    'Quantity': quantity,
                    'USD': revenue,
                    'Year': year,
                    'Month': month,
                })

    # Create DataFrame; explicit columns keep the schema intact even when no
    # transactions were generated (e.g. empty or all-future ``years``).
    df = pd.DataFrame(transactions, columns=columns)
    df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])

    # Sort by date
    df = df.sort_values('InvoiceDate').reset_index(drop=True)

    # Add some missing dates (realistic data quality issue)
    missing_date_pct = 0.05  # 5% missing dates
    num_missing = int(len(df) * missing_date_pct)
    if num_missing > 0:
        missing_indices = np.random.choice(df.index, size=num_missing, replace=False)
        df.loc[missing_indices, 'InvoiceDate'] = pd.NaT

    # Save to CSV
    output_path = Path(output_file)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_path, index=False)

    total_revenue = float(df['USD'].sum()) if not df.empty else 0.0

    print(f"\n✅ Sample data generated: {output_path}")
    print(f" Rows: {len(df):,}")
    print(f" Date range: {df['InvoiceDate'].min()} to {df['InvoiceDate'].max()}")
    print(f" Total revenue: ${total_revenue / 1e6:.2f}m")

    return df
|
||||
|
||||
def generate_sample_data_for_template():
    """
    Generate sample data matching template's expected structure

    Calls ``generate_sample_sales_data`` with 200 customers, 100 products,
    the years listed in ``config.ANALYSIS_YEARS`` and about 1000 transactions
    per month, then renames the generated columns to the names configured in
    config.py and writes the result to ``sample_sales_data.csv`` (replacing
    the file the generator wrote with its built-in column names).

    Returns:
        DataFrame: Generated sales data using the config.py column names
    """
    from config import (
        REVENUE_COLUMN, DATE_COLUMN, CUSTOMER_COLUMN, ITEM_COLUMN,
        QUANTITY_COLUMN, ANALYSIS_YEARS
    )

    print("Generating sample data for template...")

    # Single source of truth for the filename used by both writes below.
    output_file = 'sample_sales_data.csv'

    df = generate_sample_sales_data(
        num_customers=200,
        num_products=100,
        years=ANALYSIS_YEARS,
        transactions_per_month=1000,
        output_file=output_file
    )

    # Rename columns to match config (if different)
    column_mapping = {
        'USD': REVENUE_COLUMN,
        'InvoiceDate': DATE_COLUMN,
        'Customer': CUSTOMER_COLUMN,
        'Item': ITEM_COLUMN,
        'Quantity': QUANTITY_COLUMN
    }

    # Apply every rename in one call. Renaming one column at a time would let
    # a later step pick up a column that an earlier step had just renamed
    # whenever a configured name equals another built-in name. Entries whose
    # old and new names match, or whose old name is absent, are no-ops.
    df = df.rename(columns=column_mapping)

    # Save
    output_path = Path(output_file)
    df.to_csv(output_path, index=False)

    print(f"\n✅ Sample data saved to: {output_path}")
    print(f" Ready to use with sales_analysis_template")

    return df
|
||||
|
||||
# ============================================================================
|
||||
# MAIN
|
||||
# ============================================================================
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point.
    #   no arguments      -> generate data shaped for the template (config.py)
    #   <customers> [<products>] -> custom generation; products defaults to 50
    import sys

    cli_args = sys.argv[1:]

    if not cli_args:
        # Generate for template
        generate_sample_data_for_template()
    else:
        # Custom generation
        num_customers = int(cli_args[0])
        num_products = int(cli_args[1]) if len(cli_args) > 1 else 50
        generate_sample_sales_data(
            num_customers=num_customers,
            num_products=num_products
        )
|
||||
Reference in New Issue
Block a user