agnes-the-ai-analyst/scripts/generate_sample_data.py
Petr 302494b632 Add --format parquet using project's ParquetManager
Generator now supports --format {csv,parquet,both}. Parquet mode
uses src.parquet_manager.ParquetManager for snappy compression,
proper column types (DATE, TIMESTAMP, DOUBLE), and metadata.
No more ad-hoc pandas conversion needed on the server.
2026-03-10 21:46:20 +01:00

1058 lines
40 KiB
Python

#!/usr/bin/env python3
"""
Sample data generator for AI Data Analyst demo and testing.
Generates realistic synthetic e-commerce + marketing data as CSV or Parquet.
Tables: customers, products, campaigns, web_sessions, web_leads,
orders, order_items, payments, support_tickets
Usage:
python scripts/generate_sample_data.py --size s --output data/sample
python scripts/generate_sample_data.py --size m --format parquet --output /data/src_data/parquet
python scripts/generate_sample_data.py --list-sizes
"""
import argparse
import csv
import json
import logging
import random
import sys
import time
from datetime import date, timedelta
from pathlib import Path
from typing import Any, Generator
try:
from faker import Faker
except ImportError:
print("ERROR: faker is required. Install with: pip install faker")
sys.exit(1)
logger = logging.getLogger(__name__)
# ── Size configurations ────────────────────────────────────────────────
SIZE_CONFIGS = {
"xs": {
"label": "Extra Small (demo/dev)",
"customers": 50,
"products": 30,
"campaigns": 10,
"web_sessions": 500,
"web_leads": 50,
"orders": 100,
"support_tickets": 30,
"months": 3,
"estimated_csv_mb": 1,
},
"s": {
"label": "Small (testing)",
"customers": 500,
"products": 100,
"campaigns": 30,
"web_sessions": 10_000,
"web_leads": 1_000,
"orders": 2_000,
"support_tickets": 500,
"months": 12,
"estimated_csv_mb": 15,
},
"m": {
"label": "Medium (realistic)",
"customers": 5_000,
"products": 300,
"campaigns": 80,
"web_sessions": 100_000,
"web_leads": 10_000,
"orders": 20_000,
"support_tickets": 5_000,
"months": 24,
"estimated_csv_mb": 150,
},
"l": {
"label": "Large (stress test)",
"customers": 50_000,
"products": 1_000,
"campaigns": 200,
"web_sessions": 1_000_000,
"web_leads": 100_000,
"orders": 200_000,
"support_tickets": 50_000,
"months": 36,
"estimated_csv_mb": 1500,
},
}
# ── Domain data ────────────────────────────────────────────────────────
# Monthly seasonality multipliers (index 0 = January)
MONTHLY_SEASONALITY = [0.70, 0.75, 0.85, 0.90, 0.95, 1.00,
0.90, 0.85, 1.00, 1.10, 1.30, 1.50]
# Day-of-week multipliers (Monday=0 .. Sunday=6)
DOW_MULTIPLIER = [1.0, 1.0, 1.0, 1.05, 1.15, 0.80, 0.60]
# Hour-of-day weights (24 values, peak at 10-14)
HOUR_WEIGHTS = [2, 1, 1, 1, 1, 2, 4, 8, 14, 18, 20, 19,
18, 17, 16, 15, 14, 12, 10, 8, 6, 5, 4, 3]
CUSTOMER_SEGMENTS = [
("b2c", 0.60),
("b2b_small", 0.25),
("b2b_enterprise", 0.15),
]
COUNTRIES = [
("Czech Republic", "CZ", 0.25), ("Germany", "DE", 0.15),
("United States", "US", 0.12), ("United Kingdom", "GB", 0.10),
("France", "FR", 0.08), ("Austria", "AT", 0.05),
("Poland", "PL", 0.05), ("Netherlands", "NL", 0.05),
("Slovakia", "SK", 0.05), ("Spain", "ES", 0.04),
("Italy", "IT", 0.03), ("Sweden", "SE", 0.03),
]
EMAIL_DOMAINS = [
"gmail.com", "yahoo.com", "outlook.com", "hotmail.com",
"protonmail.com", "icloud.com", "mail.com",
]
PRODUCT_CATEGORIES = {
"Electronics": {
"items": [
"Wireless Headphones", "USB-C Charger 65W", "Smart Watch",
"Webcam 4K", "Bluetooth Speaker", "Noise-Cancelling Earbuds",
"Mechanical Keyboard", "27in Monitor QHD", "Laptop Stand",
"Power Bank 20000mAh", "Smart Home Hub", "LED Desk Lamp",
"External SSD 1TB", "Wireless Charging Pad", "Action Camera",
],
"price_range": (19.99, 1299.99),
"cost_ratio": (0.40, 0.65),
},
"Clothing": {
"items": [
"Oxford Shirt Classic", "Slim Chino Pants", "Merino Sweater",
"Leather Belt Premium", "Running Sneakers", "Denim Jacket",
"Polo Shirt Casual", "Winter Down Jacket", "Cotton T-Shirt",
"Formal Dress Shoes", "Yoga Leggings", "Crossbody Bag",
"Wool Blend Coat", "Sport Shorts Quick-Dry", "Canvas Tote Bag",
],
"price_range": (9.99, 299.99),
"cost_ratio": (0.30, 0.55),
},
"Home & Garden": {
"items": [
"Ceramic Mug Set", "Bamboo Cutting Board", "Steel Water Bottle",
"Indoor Plant Pot Set", "LED String Lights 10m", "Bath Towel Set",
"Memory Foam Pillow", "Scented Candle Set", "Kitchen Knife Set 5pc",
"Garden Tool Set", "Bedside Lamp", "Throw Blanket Fleece",
"Wall Clock Minimal", "Spice Rack Organizer", "Herb Garden Kit",
],
"price_range": (7.99, 199.99),
"cost_ratio": (0.35, 0.55),
},
"Sports & Outdoors": {
"items": [
"Yoga Mat Premium", "Resistance Bands Set", "Insulated Bottle",
"Hiking Backpack 40L", "Speed Jump Rope", "Foam Roller 45cm",
"Camping Hammock", "Cycling Gloves", "Tennis Balls 4-Pack",
"Swim Goggles Anti-Fog", "Adjustable Dumbbells", "Running Armband",
"Compact Sleeping Bag", "Compression Socks", "Fishing Tackle Box",
],
"price_range": (8.99, 249.99),
"cost_ratio": (0.35, 0.60),
},
"Books & Media": {
"items": [
"Data Science Handbook", "Leadership in Practice", "Creative Writing",
"Python Programming", "World History Atlas", "Cooking Masterclass",
"Mindfulness Journal", "Photography Basics", "Financial Planning",
"Sci-Fi Novel Collection", "Art Supplies Set", "Board Game Classic",
"Puzzle 1000 Pieces", "Drawing Pencil Set 24pc", "Travel Guide Europe",
],
"price_range": (5.99, 79.99),
"cost_ratio": (0.25, 0.45),
},
"Beauty & Health": {
"items": [
"Moisturizer SPF30", "Organic Shampoo 500ml", "Electric Toothbrush",
"Vitamin D3 Supplements", "Essential Oil Set 6pk", "Hair Dryer Pro",
"Sunscreen SPF50", "Protein Powder Vanilla", "Face Mask Pack 10",
"Hand Cream Repair", "Body Lotion Hydrating", "Beard Grooming Set",
"Collagen Drink Mix", "Makeup Brush Set 12pc", "Bath Bomb Gift Set",
],
"price_range": (4.99, 149.99),
"cost_ratio": (0.20, 0.45),
},
}
PRODUCT_VARIANTS = ["Pro", "Ultra", "Lite", "Plus", "Mini", "Max"]
PRODUCT_COLORS = ["Black", "White", "Blue", "Red", "Green", "Grey"]
CAMPAIGN_CHANNELS = [
("email", 0.20),
("paid_search", 0.22),
("paid_social", 0.18),
("organic_social", 0.12),
("display", 0.12),
("affiliate", 0.08),
("retargeting", 0.08),
]
CAMPAIGN_TEMPLATES = [
"Spring Sale", "Summer Clearance", "Back to School", "Black Friday",
"Holiday Season", "New Year Push", "Flash Sale", "Product Launch",
"Loyalty Rewards", "Newsletter Blast", "Retargeting Wave",
"Brand Awareness", "Category Spotlight", "Win-Back", "Early Access",
]
LEAD_SOURCES = [
("newsletter_signup", 0.30),
("contact_form", 0.25),
("demo_request", 0.15),
("content_download", 0.20),
("webinar_registration", 0.10),
]
DEVICES = [("desktop", 0.45), ("mobile", 0.45), ("tablet", 0.10)]
BROWSERS = [("Chrome", 0.64), ("Safari", 0.19), ("Firefox", 0.08),
("Edge", 0.07), ("Other", 0.02)]
LANDING_PAGES = [
"/", "/products", "/products/electronics", "/products/clothing",
"/products/home-garden", "/sale", "/new-arrivals", "/about",
"/blog", "/blog/tips", "/blog/reviews", "/contact",
]
ORDER_STATUSES = [
("delivered", 0.58), ("shipped", 0.15), ("confirmed", 0.10),
("pending", 0.04), ("cancelled", 0.08), ("returned", 0.05),
]
ORDER_CHANNELS = [
("web", 0.55), ("mobile_app", 0.35), ("phone", 0.05), ("api", 0.05),
]
PAYMENT_METHODS = [
("credit_card", 0.38), ("debit_card", 0.20), ("paypal", 0.18),
("bank_transfer", 0.12), ("apple_pay", 0.08), ("invoice", 0.04),
]
TICKET_CATEGORIES = [
("question", 0.28), ("complaint", 0.18), ("return_request", 0.14),
("shipping", 0.16), ("technical_issue", 0.12), ("refund", 0.12),
]
TICKET_PRIORITIES = [
("low", 0.38), ("medium", 0.35), ("high", 0.20), ("critical", 0.07),
]
TICKET_SUBJECTS = {
"question": [
"Delivery time estimate", "Product compatibility", "Return policy",
"Bulk order pricing", "Warranty coverage", "Size guide help",
],
"complaint": [
"Item arrived damaged", "Wrong product received", "Poor quality",
"Missing items in order", "Packaging insufficient", "Late delivery",
],
"return_request": [
"Does not match description", "Changed my mind", "Duplicate order",
"Size does not fit", "Defective product", "Better price elsewhere",
],
"shipping": [
"Package not delivered", "Tracking not updating", "Wrong address",
"Expedited shipping request", "International shipping", "Lost package",
],
"technical_issue": [
"Cannot complete checkout", "Payment error", "Login problem",
"Page not loading", "Mobile app crash", "Coupon not working",
],
"refund": [
"Cancelled order refund", "Partial refund request", "Overcharged",
"Refund not received", "Billing discrepancy", "Double charged",
],
}
TICKET_CHANNELS = [
("email", 0.40), ("chat", 0.30), ("phone", 0.15), ("web_form", 0.15),
]
# ── Parquet schema definitions (used by ParquetManager) ────────────────
TABLE_SCHEMAS = {
"customers": {
"dtypes": {"is_active": "Int64"},
"date_columns": ["registration_date"],
},
"products": {
"dtypes": {
"price": "float64", "cost": "float64",
"weight_kg": "float64", "is_active": "Int64",
},
"date_columns": ["created_at"],
},
"campaigns": {
"dtypes": {
"budget": "float64", "spend": "float64",
"impressions": "Int64", "clicks": "Int64",
},
"date_columns": ["start_date", "end_date"],
},
"web_sessions": {
"dtypes": {
"duration_seconds": "Int64", "pages_viewed": "Int64",
"is_bounce": "Int64",
},
"parse_dates": ["started_at"],
},
"web_leads": {
"parse_dates": ["created_at", "converted_at"],
},
"orders": {
"dtypes": {
"items_total": "float64", "discount_amount": "float64",
"shipping_amount": "float64", "total_amount": "float64",
},
"parse_dates": ["created_at"],
},
"order_items": {
"dtypes": {
"quantity": "Int64", "unit_price": "float64",
"discount_percent": "Int64", "line_total": "float64",
},
},
"payments": {
"dtypes": {"amount": "float64"},
"parse_dates": ["created_at", "completed_at"],
},
"support_tickets": {
"dtypes": {"satisfaction_score": "Int64"},
"parse_dates": ["created_at", "first_response_at", "resolved_at"],
},
}
# ── Generator ──────────────────────────────────────────────────────────
class SampleDataGenerator:
"""Generates realistic synthetic e-commerce data as CSV or Parquet."""
def __init__(self, size: str, seed: int, output_dir: Path,
output_format: str = "csv"):
self.cfg = SIZE_CONFIGS[size]
self.size_name = size
self.rng = random.Random(seed)
self.fake = Faker(["en_US", "de_DE", "cs_CZ", "fr_FR"])
Faker.seed(seed)
self.output_dir = output_dir
self.output_format = output_format # "csv", "parquet", or "both"
self.row_counts: dict[str, int] = {}
# Time range
months = self.cfg["months"]
self.end_date = date(2026, 3, 1)
self.start_date = self.end_date - timedelta(days=months * 30)
self.total_days = (self.end_date - self.start_date).days
# Pre-compute day weights for temporal distribution
self._days: list[date] = []
self._day_weights: list[float] = []
for i in range(self.total_days):
d = self.start_date + timedelta(days=i)
growth = 1.0 + 0.5 * (i / max(self.total_days, 1))
season = MONTHLY_SEASONALITY[d.month - 1]
dow = DOW_MULTIPLIER[d.weekday()]
self._days.append(d)
self._day_weights.append(growth * season * dow)
# Reference data (populated during generation)
self._customer_ids: list[str] = []
self._customer_reg_dates: dict[str, date] = {}
self._customer_segments: dict[str, str] = {}
self._product_ids: list[str] = []
self._product_prices: dict[str, float] = {}
self._product_categories: dict[str, str] = {}
self._campaign_ids: list[str] = []
self._campaign_ranges: dict[str, tuple[date, date]] = {}
self._order_ids: list[str] = []
self._order_customers: dict[str, str] = {}
self._order_dates: dict[str, date] = {}
self._order_statuses: dict[str, str] = {}
self._order_totals: dict[str, float] = {}
# ── Helpers ─────────────────────────────────────────────────
def _weighted_choice(self, options: list[tuple[str, float]]) -> str:
"""Pick from [(value, weight), ...] using instance RNG."""
values, weights = zip(*options)
return self.rng.choices(values, weights=weights, k=1)[0]
def _random_date(self) -> date:
"""Random date weighted by growth + seasonality + day-of-week."""
return self.rng.choices(self._days, weights=self._day_weights, k=1)[0]
def _random_datetime(self, d: date | None = None) -> str:
"""Random datetime string. If d is None, pick a weighted random date."""
if d is None:
d = self._random_date()
hour = self.rng.choices(range(24), weights=HOUR_WEIGHTS, k=1)[0]
minute = self.rng.randint(0, 59)
second = self.rng.randint(0, 59)
return f"{d} {hour:02d}:{minute:02d}:{second:02d}"
def _random_date_after(self, start: date, max_days: int = 30) -> date:
"""Random date between start and start + max_days (capped at end_date)."""
end = min(start + timedelta(days=max_days), self.end_date)
delta = (end - start).days
if delta <= 0:
return start
return start + timedelta(days=self.rng.randint(0, delta))
def _write_table(self, name: str, fields: list[str],
rows: list[dict] | Generator) -> int:
"""Write CSV table from list or generator of dicts."""
path = self.output_dir / f"{name}.csv"
count = 0
with open(path, "w", newline="", encoding="utf-8") as f:
writer = csv.DictWriter(f, fieldnames=fields)
writer.writeheader()
for row in rows:
writer.writerow(row)
count += 1
if count % 250_000 == 0:
logger.info(f" ... {count:,} rows written")
self.row_counts[name] = count
return count
# ── Table generators ───────────────────────────────────────
def _generate_customers(self) -> None:
n = self.cfg["customers"]
logger.info(f" Generating {n:,} customers...")
country_vals = [(c[0], c[2]) for c in COUNTRIES]
rows = []
for i in range(n):
cid = f"C-{i + 1:06d}"
segment = self._weighted_choice(CUSTOMER_SEGMENTS)
reg_date = self._random_date()
first = self.fake.first_name()
last = self.fake.last_name()
country = self._weighted_choice(country_vals)
if segment.startswith("b2b"):
company = self.fake.company()
domain = company.lower().split()[0].replace(",", "") + ".com"
email = f"{first.lower()}.{last.lower()}@{domain}"
else:
company = ""
domain = self.rng.choice(EMAIL_DOMAINS)
email = f"{first.lower()}.{last.lower()}@{domain}"
rows.append({
"customer_id": cid,
"email": email,
"first_name": first,
"last_name": last,
"company": company,
"country": country,
"city": self.fake.city(),
"segment": segment,
"registration_date": str(reg_date),
"is_active": self.rng.choices([1, 0], weights=[0.85, 0.15])[0],
})
self._customer_ids.append(cid)
self._customer_reg_dates[cid] = reg_date
self._customer_segments[cid] = segment
self._write_table("customers", list(rows[0].keys()), rows)
def _generate_products(self) -> None:
n = self.cfg["products"]
logger.info(f" Generating {n:,} products...")
# Build product pool: base items + variants for larger sizes
pool: list[tuple[str, str, str]] = [] # (name, category, subcategory)
categories = list(PRODUCT_CATEGORIES.keys())
for cat in categories:
for item in PRODUCT_CATEGORIES[cat]["items"]:
pool.append((item, cat, cat))
# Add variants if we need more than base pool
while len(pool) < n:
cat = self.rng.choice(categories)
item = self.rng.choice(PRODUCT_CATEGORIES[cat]["items"])
variant = self.rng.choice(PRODUCT_VARIANTS)
color = self.rng.choice(PRODUCT_COLORS)
name = f"{item} {variant} - {color}"
pool.append((name, cat, cat))
self.rng.shuffle(pool)
pool = pool[:n]
rows = []
for i, (name, category, _subcat) in enumerate(pool):
pid = f"P-{i + 1:05d}"
cat_cfg = PRODUCT_CATEGORIES[category]
price = round(self.rng.uniform(*cat_cfg["price_range"]), 2)
cost_ratio = self.rng.uniform(*cat_cfg["cost_ratio"])
cost = round(price * cost_ratio, 2)
rows.append({
"product_id": pid,
"sku": f"SKU-{self.rng.randint(10000, 99999)}",
"name": name,
"category": category,
"price": price,
"cost": cost,
"weight_kg": round(self.rng.uniform(0.1, 15.0), 2),
"is_active": self.rng.choices([1, 0], weights=[0.90, 0.10])[0],
"created_at": str(self._random_date()),
})
self._product_ids.append(pid)
self._product_prices[pid] = price
self._product_categories[pid] = category
self._write_table("products", list(rows[0].keys()), rows)
def _generate_campaigns(self) -> None:
n = self.cfg["campaigns"]
logger.info(f" Generating {n:,} campaigns...")
rows = []
for i in range(n):
cid = f"CMP-{i + 1:04d}"
channel = self._weighted_choice(CAMPAIGN_CHANNELS)
start = self._random_date()
duration = self.rng.randint(7, 60)
end = min(start + timedelta(days=duration), self.end_date)
is_past = end < self.end_date - timedelta(days=7)
budget = round(self.rng.uniform(500, 25000), 2)
spend_ratio = self.rng.uniform(0.6, 1.1) if is_past else self.rng.uniform(0.2, 0.7)
spend = round(budget * min(spend_ratio, 1.0), 2)
impressions = int(spend * self.rng.uniform(80, 500))
ctr = self.rng.uniform(0.005, 0.08)
clicks = int(impressions * ctr)
template = self.rng.choice(CAMPAIGN_TEMPLATES)
name = f"{template} - {channel.replace('_', ' ').title()} {start.year}"
status = "completed" if is_past else self.rng.choice(["active", "paused"])
rows.append({
"campaign_id": cid,
"name": name,
"channel": channel,
"status": status,
"budget": budget,
"spend": spend,
"impressions": impressions,
"clicks": clicks,
"start_date": str(start),
"end_date": str(end),
"target_segment": self._weighted_choice(CUSTOMER_SEGMENTS),
})
self._campaign_ids.append(cid)
self._campaign_ranges[cid] = (start, end)
self._write_table("campaigns", list(rows[0].keys()), rows)
def _generate_web_sessions(self) -> None:
n = self.cfg["web_sessions"]
logger.info(f" Generating {n:,} web sessions...")
fields = [
"session_id", "visitor_id", "customer_id", "campaign_id",
"started_at", "duration_seconds", "pages_viewed",
"device_type", "browser", "country", "landing_page", "is_bounce",
]
country_vals = [(c[0], c[2]) for c in COUNTRIES]
def gen_rows() -> Generator[dict[str, Any], None, None]:
for i in range(n):
sid = f"S-{i + 1:08d}"
d = self._random_date()
# 40% sessions from logged-in customers
customer_id = ""
if self.rng.random() < 0.40 and self._customer_ids:
customer_id = self.rng.choice(self._customer_ids)
# 25% sessions attributed to a campaign
campaign_id = ""
if self.rng.random() < 0.25 and self._campaign_ids:
# Pick a campaign that was active on this date
candidates = [
c for c in self._campaign_ids
if self._campaign_ranges[c][0] <= d <= self._campaign_ranges[c][1]
]
if candidates:
campaign_id = self.rng.choice(candidates)
is_bounce = self.rng.random() < 0.35
if is_bounce:
duration = self.rng.randint(5, 30)
pages = 1
else:
duration = self.rng.randint(30, 900)
pages = self.rng.randint(2, 15)
yield {
"session_id": sid,
"visitor_id": f"V-{self.rng.randint(1, n // 3):08d}",
"customer_id": customer_id,
"campaign_id": campaign_id,
"started_at": self._random_datetime(d),
"duration_seconds": duration,
"pages_viewed": pages,
"device_type": self._weighted_choice(DEVICES),
"browser": self._weighted_choice(BROWSERS),
"country": self._weighted_choice(country_vals),
"landing_page": self.rng.choice(LANDING_PAGES),
"is_bounce": int(is_bounce),
}
self._write_table("web_sessions", fields, gen_rows())
def _generate_web_leads(self) -> None:
n = self.cfg["web_leads"]
logger.info(f" Generating {n:,} web leads...")
fields = [
"lead_id", "customer_id", "email", "source", "campaign_id",
"created_at", "status", "converted_at",
]
lead_statuses = [
("new", 0.35), ("contacted", 0.20), ("qualified", 0.15),
("converted", 0.18), ("lost", 0.12),
]
rows = []
for i in range(n):
lid = f"L-{i + 1:06d}"
d = self._random_date()
status = self._weighted_choice(lead_statuses)
# 55% from existing customers
customer_id = ""
email = self.fake.email()
if self.rng.random() < 0.55 and self._customer_ids:
customer_id = self.rng.choice(self._customer_ids)
campaign_id = ""
if self.rng.random() < 0.40 and self._campaign_ids:
campaign_id = self.rng.choice(self._campaign_ids)
converted_at = ""
if status == "converted":
converted_at = self._random_datetime(
self._random_date_after(d, max_days=14)
)
rows.append({
"lead_id": lid,
"customer_id": customer_id,
"email": email,
"source": self._weighted_choice(LEAD_SOURCES),
"campaign_id": campaign_id,
"created_at": self._random_datetime(d),
"status": status,
"converted_at": converted_at,
})
self._write_table("web_leads", fields, rows)
def _generate_orders_and_items(self) -> None:
n_orders = self.cfg["orders"]
logger.info(f" Generating {n_orders:,} orders + order items...")
# Customer activity weights (Pareto-like distribution)
activity = [self.rng.paretovariate(1.2) for _ in self._customer_ids]
order_fields = [
"order_id", "customer_id", "created_at", "status",
"items_total", "discount_amount", "shipping_amount",
"total_amount", "channel",
]
item_fields = [
"order_item_id", "order_id", "product_id", "quantity",
"unit_price", "discount_percent", "line_total",
]
order_rows = []
item_rows = []
item_seq = 0
for i in range(n_orders):
oid = f"ORD-{i + 1:07d}"
cust_id = self.rng.choices(self._customer_ids, weights=activity, k=1)[0]
reg_date = self._customer_reg_dates[cust_id]
segment = self._customer_segments[cust_id]
# Order date: after customer registration
order_date = self._random_date_after(reg_date,
max_days=(self.end_date - reg_date).days)
status = self._weighted_choice(ORDER_STATUSES)
# B2B orders tend to have more items
max_items = 8 if segment.startswith("b2b") else 5
item_weights = list(range(max_items, 0, -1)) # favor fewer items
n_items = self.rng.choices(range(1, max_items + 1), weights=item_weights, k=1)[0]
items_total = 0.0
for _j in range(n_items):
item_seq += 1
pid = self.rng.choice(self._product_ids)
qty = self.rng.choices([1, 2, 3, 4, 5],
weights=[60, 20, 10, 5, 5], k=1)[0]
if segment == "b2b_enterprise":
qty *= self.rng.randint(1, 5)
unit_price = self._product_prices[pid]
disc_pct = self.rng.choices(
[0, 5, 10, 15, 20],
weights=[50, 20, 15, 10, 5], k=1
)[0]
line_total = round(unit_price * qty * (1 - disc_pct / 100), 2)
items_total += line_total
item_rows.append({
"order_item_id": f"OI-{item_seq:08d}",
"order_id": oid,
"product_id": pid,
"quantity": qty,
"unit_price": unit_price,
"discount_percent": disc_pct,
"line_total": line_total,
})
discount_amount = round(items_total * self.rng.uniform(0, 0.05), 2)
shipping = round(self.rng.uniform(0, 15.99), 2) if items_total < 100 else 0.0
total = round(items_total - discount_amount + shipping, 2)
order_rows.append({
"order_id": oid,
"customer_id": cust_id,
"created_at": self._random_datetime(order_date),
"status": status,
"items_total": round(items_total, 2),
"discount_amount": discount_amount,
"shipping_amount": shipping,
"total_amount": total,
"channel": self._weighted_choice(ORDER_CHANNELS),
})
self._order_ids.append(oid)
self._order_customers[oid] = cust_id
self._order_dates[oid] = order_date
self._order_statuses[oid] = status
self._order_totals[oid] = total
self._write_table("orders", order_fields, order_rows)
self._write_table("order_items", item_fields, item_rows)
def _generate_payments(self) -> None:
logger.info(f" Generating payments for {len(self._order_ids):,} orders...")
fields = [
"payment_id", "order_id", "customer_id", "amount", "currency",
"method", "status", "created_at", "completed_at",
]
rows = []
seq = 0
for oid in self._order_ids:
cust_id = self._order_customers[oid]
segment = self._customer_segments[cust_id]
order_date = self._order_dates[oid]
order_status = self._order_statuses[oid]
amount = self._order_totals[oid]
# B2B more likely to use invoice/bank_transfer
if segment.startswith("b2b") and self.rng.random() < 0.40:
method = self.rng.choice(["bank_transfer", "invoice"])
else:
method = self._weighted_choice(PAYMENT_METHODS)
# 5% chance of a failed payment attempt first
if self.rng.random() < 0.05:
seq += 1
rows.append({
"payment_id": f"PAY-{seq:07d}",
"order_id": oid,
"customer_id": cust_id,
"amount": amount,
"currency": "EUR",
"method": method,
"status": "failed",
"created_at": self._random_datetime(order_date),
"completed_at": "",
})
seq += 1
if order_status == "cancelled":
pay_status = "cancelled"
completed = ""
elif order_status == "returned":
pay_status = "refunded"
completed = self._random_datetime(
self._random_date_after(order_date, max_days=14)
)
else:
pay_status = "completed"
completed = self._random_datetime(
self._random_date_after(order_date, max_days=3)
)
rows.append({
"payment_id": f"PAY-{seq:07d}",
"order_id": oid,
"customer_id": cust_id,
"amount": amount,
"currency": "EUR",
"method": method,
"status": pay_status,
"created_at": self._random_datetime(order_date),
"completed_at": completed,
})
self._write_table("payments", fields, rows)
def _generate_support_tickets(self) -> None:
n = self.cfg["support_tickets"]
logger.info(f" Generating {n:,} support tickets...")
fields = [
"ticket_id", "customer_id", "order_id", "category", "priority",
"status", "channel", "subject", "created_at", "first_response_at",
"resolved_at", "satisfaction_score",
]
rows = []
for i in range(n):
tid = f"TKT-{i + 1:06d}"
cust_id = self.rng.choice(self._customer_ids)
category = self._weighted_choice(TICKET_CATEGORIES)
priority = self._weighted_choice(TICKET_PRIORITIES)
subject = self.rng.choice(TICKET_SUBJECTS[category])
d = self._random_date()
# 60% linked to an order
order_id = ""
if self.rng.random() < 0.60 and self._order_ids:
# Pick an order from this customer if possible
cust_orders = [
o for o in self._order_ids
if self._order_customers[o] == cust_id
]
if cust_orders:
order_id = self.rng.choice(cust_orders)
else:
order_id = self.rng.choice(self._order_ids)
# Status progression
is_resolved = self.rng.random() < 0.75
if is_resolved:
status = self.rng.choice(["resolved", "closed"])
else:
status = self.rng.choice(["open", "in_progress", "waiting_customer"])
# Response and resolution times based on priority
response_hours = {
"critical": (0.5, 4), "high": (1, 12),
"medium": (4, 48), "low": (8, 96),
}
rh = response_hours[priority]
first_response = ""
resolved_at = ""
satisfaction = ""
if status not in ("open",):
resp_delta = timedelta(hours=self.rng.uniform(*rh))
first_response = self._random_datetime(
min(d + timedelta(days=int(resp_delta.total_seconds() // 86400)),
self.end_date)
)
if is_resolved:
resolve_days = self.rng.randint(1, 14)
resolved_at = self._random_datetime(
self._random_date_after(d, max_days=resolve_days)
)
# Satisfaction: skewed toward 4-5 for resolved
satisfaction = self.rng.choices(
[1, 2, 3, 4, 5],
weights=[5, 8, 15, 35, 37], k=1
)[0]
rows.append({
"ticket_id": tid,
"customer_id": cust_id,
"order_id": order_id,
"category": category,
"priority": priority,
"status": status,
"channel": self._weighted_choice(TICKET_CHANNELS),
"subject": subject,
"created_at": self._random_datetime(d),
"first_response_at": first_response,
"resolved_at": resolved_at,
"satisfaction_score": satisfaction,
})
self._write_table("support_tickets", fields, rows)
# ── Parquet conversion ─────────────────────────────────────
def _convert_to_parquet(self, parquet_dir: Path) -> None:
"""Convert generated CSVs to Parquet using project's ParquetManager."""
# Ensure project root is importable (script may run from any cwd)
project_root = Path(__file__).resolve().parent.parent
if str(project_root) not in sys.path:
sys.path.insert(0, str(project_root))
from src.parquet_manager import create_parquet_manager
manager = create_parquet_manager()
parquet_dir.mkdir(parents=True, exist_ok=True)
logger.info(f" Converting to Parquet -> {parquet_dir}/")
for csv_path in sorted(self.output_dir.glob("*.csv")):
table_name = csv_path.stem
schema = TABLE_SCHEMAS.get(table_name, {})
parquet_path = parquet_dir / f"{table_name}.parquet"
result = manager.csv_to_parquet(
csv_path=csv_path,
parquet_path=parquet_path,
dtypes=schema.get("dtypes"),
parse_dates=schema.get("parse_dates"),
date_columns=schema.get("date_columns"),
table_id=f"sample.{table_name}",
)
logger.info(
f" {table_name}: {result['rows']:,} rows, "
f"{result['parquet_size_bytes'] / 1024:.0f} KB "
f"({result['compression_ratio']:.1f}x compression)"
)
# ── Orchestration ──────────────────────────────────────────
def run(self) -> dict[str, Any]:
"""Generate all tables and return manifest data."""
self.output_dir.mkdir(parents=True, exist_ok=True)
t0 = time.time()
fmt_label = self.output_format.upper()
logger.info(f"Generating sample data (size: {self.size_name}, format: {fmt_label})")
logger.info(f" Period: {self.start_date} to {self.end_date} "
f"({self.cfg['months']} months)")
logger.info(f" Output: {self.output_dir}/")
# Phase 1: Generate CSVs (always needed as intermediate)
csv_dir = self.output_dir
if self.output_format == "parquet":
# CSVs go to a temp subdir, only Parquet files in output
csv_dir = self.output_dir / "_csv_tmp"
csv_dir.mkdir(parents=True, exist_ok=True)
self.output_dir = csv_dir # temporarily redirect CSV writes
self._generate_customers()
self._generate_products()
self._generate_campaigns()
self._generate_web_sessions()
self._generate_web_leads()
self._generate_orders_and_items()
self._generate_payments()
self._generate_support_tickets()
# Phase 2: Convert to Parquet if requested
if self.output_format == "parquet":
parquet_dir = csv_dir.parent # the original output_dir
self._convert_to_parquet(parquet_dir)
# Clean up temp CSVs
import shutil
shutil.rmtree(csv_dir)
self.output_dir = parquet_dir # restore for manifest
elif self.output_format == "both":
parquet_dir = self.output_dir / "parquet"
self._convert_to_parquet(parquet_dir)
elapsed = time.time() - t0
total_rows = sum(self.row_counts.values())
manifest = {
"generator": "generate_sample_data.py",
"size": self.size_name,
"format": self.output_format,
"seed": self.rng.getstate()[1][0],
"date_range": {
"start": str(self.start_date),
"end": str(self.end_date),
},
"tables": self.row_counts,
"total_rows": total_rows,
"elapsed_seconds": round(elapsed, 1),
}
manifest_path = self.output_dir / "_manifest.json"
with open(manifest_path, "w", encoding="utf-8") as f:
json.dump(manifest, f, indent=2)
logger.info("")
logger.info(f"Done! {len(self.row_counts)} tables, "
f"{total_rows:,} total rows in {elapsed:.1f}s")
logger.info(f"Manifest: {manifest_path}")
return manifest
# ── CLI ────────────────────────────────────────────────────────────────
def main() -> None:
parser = argparse.ArgumentParser(
description="Generate synthetic e-commerce sample data as CSV files."
)
parser.add_argument(
"--size", choices=SIZE_CONFIGS.keys(), default="s",
help="Data size preset (default: s)",
)
parser.add_argument(
"--output", type=Path, default=Path("data/sample"),
help="Output directory for CSV files (default: data/sample)",
)
parser.add_argument(
"--seed", type=int, default=42,
help="Random seed for reproducibility (default: 42)",
)
parser.add_argument(
"--format", choices=["csv", "parquet", "both"], default="csv",
help="Output format: csv, parquet (via ParquetManager), or both (default: csv)",
)
parser.add_argument(
"--list-sizes", action="store_true",
help="Show available size presets and exit",
)
args = parser.parse_args()
logging.basicConfig(level=logging.INFO, format="%(message)s")
if args.list_sizes:
print("\nAvailable size presets:\n")
print(f" {'Size':<6} {'Label':<24} {'Customers':>10} {'Products':>10} "
f"{'Sessions':>10} {'Orders':>10} {'~CSV MB':>8}")
print(f" {'' * 6} {'' * 24} {'' * 10} {'' * 10} "
f"{'' * 10} {'' * 10} {'' * 8}")
for key, cfg in SIZE_CONFIGS.items():
print(f" {key:<6} {cfg['label']:<24} {cfg['customers']:>10,} "
f"{cfg['products']:>10,} {cfg['web_sessions']:>10,} "
f"{cfg['orders']:>10,} {cfg['estimated_csv_mb']:>7,}")
print()
return
gen = SampleDataGenerator(
size=args.size, seed=args.seed,
output_dir=args.output, output_format=args.format,
)
gen.run()
if __name__ == "__main__":
main()