From 44bf43535b95ec80ef9194714788a84ee6e97a60 Mon Sep 17 00:00:00 2001 From: Petr Date: Tue, 10 Mar 2026 12:31:14 +0100 Subject: [PATCH] Add sample data generator with 9 e-commerce tables Synthetic data generator for demo/testing without real data adapter: - 9 tables: customers, products, campaigns, web_sessions, web_leads, orders, order_items, payments, support_tickets - 4 size presets: xs (1MB), s (15MB), m (150MB), l (1.5GB) - Realistic patterns: seasonality, Pareto customer distribution, segment-based behavior, referential integrity - Deterministic output via --seed parameter Also: docs/sample-data.md, updated auto-install.md with Step 6, updated CLAUDE.md (email auth provider, dual-repo architecture) --- CLAUDE.md | 17 +- docs/auto-install.md | 60 +- docs/sample-data.md | 196 ++++++ requirements.txt | 4 + scripts/generate_sample_data.py | 940 +++++++++++++++++++++++++++++ tests/test_generate_sample_data.py | 173 ++++++ 6 files changed, 1385 insertions(+), 5 deletions(-) create mode 100644 docs/sample-data.md create mode 100644 scripts/generate_sample_data.py create mode 100644 tests/test_generate_sample_data.py diff --git a/CLAUDE.md b/CLAUDE.md index 4e611b9..b2fee42 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -42,7 +42,7 @@ Ask the user for: │ └── jira/ # Jira webhook connector ├── auth/ # Authentication providers (pluggable) │ ├── google/ # Google OAuth provider -│ ├── password/ # Email/password provider +│ ├── email/ # Email magic link provider │ └── desktop/ # Desktop JWT provider (API-only) ├── services/ # Standalone services (own systemd units) │ ├── telegram_bot/ # Telegram notification bot @@ -88,6 +88,15 @@ Environment variables go in `.env` (never committed to git). Data schema is defined in `docs/data_description.md` (YAML blocks in markdown). 
+### Dual-Repo Deployment +Production uses two repos on the server: +- **OSS repo** (`/opt/data-analyst/repo/`): application code, no secrets or config +- **Instance repo** (`/opt/data-analyst/instance/`): private config, secrets template, data schema + +Symlinks bridge them: `repo/config/instance.yaml -> instance/config/instance.yaml`. +Each repo has its own SSH deploy key (github-oss / github-cfg aliases). +See `docs/auto-install.md` for full setup guide. + ## Development ```bash @@ -117,7 +126,7 @@ Pluggable data source connectors in `connectors/`: ### Authentication Pluggable auth providers in `auth/`: - **Google** (`google`): OAuth via Google -- **Password** (`password`): Email/password with magic links +- **Email** (`email`): Email magic link (itsdangerous token, no password needed) - **Desktop** (`desktop`): JWT for desktop app API - New provider = `auth//provider.py` implementing `AuthProvider` @@ -164,7 +173,9 @@ When reopening the project in Claude Code: ### Auth Provider Pattern - ABC: `AuthProvider` class in `auth/__init__.py` - Discovery: `discover_providers()` scans `auth/*/provider.py` -- Providers: google, password, desktop (each exports `provider` instance) +- Providers: google, email, desktop (each exports `provider` instance) +- Email provider: uses `itsdangerous.URLSafeTimedSerializer` for magic link tokens +- Multi-domain: `auth.allowed_domain` in instance.yaml supports comma-separated domains - Session contract: all providers set `session["user"] = {"email", "name", "picture"}` ### Service Pattern diff --git a/docs/auto-install.md b/docs/auto-install.md index 6bfafa7..7201d7e 100644 --- a/docs/auto-install.md +++ b/docs/auto-install.md @@ -341,9 +341,63 @@ After server is set up, analysts self-onboard via the webapp: 4. User runs `claude` in their project folder, pastes setup instructions 5. 
Claude Code configures SSH, rsyncs data, sets up Python + DuckDB -## Step 6: Data Source (Next) +## Step 6: Sample Data (Try Without a Data Adapter) -Configure a real data source in `instance/config/instance.yaml`: +Before connecting a real data source, you can load sample data to verify the full pipeline +(Parquet files, DuckDB, analyst rsync, Claude Code analysis). + +```bash +cd /opt/data-analyst/repo + +# Install generator dependency +/opt/data-analyst/.venv/bin/pip install faker + +# Generate synthetic e-commerce data (size m: ~20K orders, 100K sessions) +/opt/data-analyst/.venv/bin/python scripts/generate_sample_data.py \ + --size m --output /tmp/sample_csv --seed 42 + +# Convert CSVs to Parquet and deploy to data directory +/opt/data-analyst/.venv/bin/python -c " +import pandas as pd +from pathlib import Path + +csv_dir = Path('/tmp/sample_csv') +parquet_dir = Path('/data/src_data/parquet') +parquet_dir.mkdir(parents=True, exist_ok=True) + +for f in sorted(csv_dir.glob('*.csv')): + df = pd.read_csv(f) + out = parquet_dir / f'{f.stem}.parquet' + df.to_parquet(out, index=False) + print(f' {f.stem}: {len(df):,} rows -> {out}') +" + +# Set correct permissions +chown -R root:data-ops /data/src_data/parquet +chmod -R 2775 /data/src_data/parquet + +# Clean up temporary CSVs +rm -rf /tmp/sample_csv +``` + +Available sizes: `xs` (50 customers, ~1 MB), `s` (500, ~15 MB), `m` (5K, ~150 MB), `l` (50K, ~1.5 GB). + +The sample data covers 9 tables: customers, products, campaigns, web_sessions, web_leads, +orders, order_items, payments, support_tickets. See `docs/sample-data.md` for the full +data model, table reference, and built-in analytical patterns. 
+ +### Step 6 Checklist + +| # | Check | Expected | +|---|-------|----------| +| 6.1 | Parquet files | `ls /data/src_data/parquet/*.parquet` shows 9 files | +| 6.2 | Permissions | Files owned by root:data-ops, group-readable | +| 6.3 | Analyst sync | Analyst can rsync parquet files to local machine | +| 6.4 | DuckDB loads | `SELECT count(*) FROM read_parquet('orders.parquet')` returns rows | + +## Step 7: Real Data Source (Production) + +When ready, replace sample data with a real data source adapter in `instance/config/instance.yaml`: ```yaml data_source: @@ -356,6 +410,8 @@ data_source: Add the token to `.env` and create `config/data_description.md` with table schemas. +Other planned adapters: BigQuery, CSV import. + ## Deployment Workflow (Ongoing) ### Update OSS code diff --git a/docs/sample-data.md b/docs/sample-data.md new file mode 100644 index 0000000..77dcd34 --- /dev/null +++ b/docs/sample-data.md @@ -0,0 +1,196 @@ +# Sample Data Generator + +Generate realistic synthetic e-commerce and marketing data for demo, testing, and development without connecting a real data source adapter. 
+ +## Quick Start + +```bash +# Install dependency +pip install faker + +# Generate small dataset (default) +python scripts/generate_sample_data.py --size s --output data/sample + +# List available sizes +python scripts/generate_sample_data.py --list-sizes +``` + +## Data Model + +9 interrelated tables covering the full e-commerce funnel: + +``` +┌──────────────┐ ┌──────────────┐ ┌──────────────┐ +│ campaigns │ │ customers │ │ products │ +│ CMP-0001 │ │ C-000001 │ │ P-00001 │ +└──────┬───────┘ └──────┬───────┘ └──────┬───────┘ + │ │ │ + ▼ ▼ │ +┌──────────────┐ ┌──────────────┐ │ +│ web_sessions │ │ web_leads │ │ +│ S-00000001 │ │ L-000001 │ │ +└──────────────┘ └──────────────┘ │ + │ │ + ▼ ▼ + ┌──────────────┐ ┌──────────────┐ + │ orders │────▶│ order_items │ + │ ORD-0000001 │ │ OI-00000001 │ + └──────┬───────┘ └──────────────┘ + │ + ┌──────┴───────┐ + ▼ ▼ + ┌──────────────┐ ┌──────────────┐ + │ payments │ │ support │ + │ PAY-0000001 │ │ tickets │ + └──────────────┘ │ TKT-000001 │ + └──────────────┘ +``` + +### Table Reference + +| Table | Key Columns | Foreign Keys | +|-------|-------------|--------------| +| **customers** | customer_id, email, segment, country, registration_date | - | +| **products** | product_id, name, category, price, cost | - | +| **campaigns** | campaign_id, channel, budget, spend, impressions, clicks | - | +| **web_sessions** | session_id, started_at, duration_seconds, device_type | customer_id?, campaign_id? | +| **web_leads** | lead_id, source, status, converted_at | customer_id?, campaign_id? | +| **orders** | order_id, status, total_amount, channel | customer_id | +| **order_items** | order_item_id, quantity, unit_price, line_total | order_id, product_id | +| **payments** | payment_id, amount, method, status | order_id, customer_id | +| **support_tickets** | ticket_id, category, priority, satisfaction_score | customer_id, order_id? 
| + +`?` = nullable (not every record has a value) + +### Customer Segments + +- **b2c** (60%): Individual consumers, smaller order values +- **b2b_small** (25%): Small business buyers, moderate volumes +- **b2b_enterprise** (15%): Large buyers, high quantities, invoice payments + +### Product Categories + +Electronics, Clothing, Home & Garden, Sports & Outdoors, Books & Media, Beauty & Health + +Each category has distinct price ranges and cost margins for realistic profitability analysis. + +## Size Presets + +| Size | Customers | Products | Sessions | Orders | Tickets | ~CSV | ~Time | +|------|-----------|----------|----------|--------|---------|------|-------| +| **xs** | 50 | 30 | 500 | 100 | 30 | 1 MB | <1s | +| **s** | 500 | 100 | 10K | 2K | 500 | 15 MB | <1s | +| **m** | 5,000 | 300 | 100K | 20K | 5K | 150 MB | ~7s | +| **l** | 50,000 | 1,000 | 1M | 200K | 50K | 1.5 GB | ~3min | + +- **xs** - local development, quick iteration +- **s** - unit/integration testing, CI +- **m** - realistic demo, performance testing +- **l** - stress testing, production-like volumes + +## CLI Options + +``` +python scripts/generate_sample_data.py [OPTIONS] + + --size {xs,s,m,l} Data size preset (default: s) + --output PATH Output directory (default: data/sample) + --seed INT Random seed for reproducibility (default: 42) + --list-sizes Show presets and exit +``` + +## Convert to Parquet + +After generating CSVs, convert to Parquet for analytical use: + +```bash +python -c " +import pandas as pd +from pathlib import Path + +csv_dir = Path('data/sample') +parquet_dir = Path('data/sample/parquet') +parquet_dir.mkdir(exist_ok=True) + +for f in sorted(csv_dir.glob('*.csv')): + df = pd.read_csv(f) + out = parquet_dir / f'{f.stem}.parquet' + df.to_parquet(out, index=False) + print(f' {f.stem}: {len(df):,} rows -> {out}') +" +``` + +## Load into DuckDB + +```bash +python -c " +import duckdb +from pathlib import Path + +db = duckdb.connect('data/sample/analytics.duckdb') +parquet_dir = 
Path('data/sample/parquet') + +for f in sorted(parquet_dir.glob('*.parquet')): + table = f.stem + db.execute(f'CREATE OR REPLACE TABLE {table} AS SELECT * FROM read_parquet(\"{f}\")') + count = db.execute(f'SELECT count(*) FROM {table}').fetchone()[0] + print(f' {table}: {count:,} rows') + +db.close() +print('Database: data/sample/analytics.duckdb') +" +``` + +## Built-in Analytical Patterns + +The generator creates data with discoverable patterns for realistic analysis: + +- **Seasonality**: Q4 traffic and orders ~2x higher than Q1 +- **Growth trend**: 50% increase in activity over the time period +- **Channel effectiveness**: each campaign gets its own click-through rate (0.5-8%) and spend, so channels can be compared on CTR and cost per click +- **Customer lifetime**: Pareto distribution (20% of customers generate 80% of orders) +- **Segment differences**: B2B enterprise has 3-5x higher order values +- **Product mix**: Electronics = high revenue / lower margin, Books = low revenue / high margin +- **Support correlation**: 60% of tickets linked to specific orders + +## Reproducibility + +Same `--seed` always produces identical output. The default seed is 42. 
+ +```bash +# These two commands produce the same files +python scripts/generate_sample_data.py --size s --seed 42 --output run1 +python scripts/generate_sample_data.py --size s --seed 42 --output run2 +diff -r run1 run2 # no differences +``` + +## Server Deployment + +To use sample data on a deployed server (instead of connecting a data adapter): + +```bash +# On the server +cd /opt/data-analyst/repo + +# Generate CSVs +/opt/data-analyst/.venv/bin/python scripts/generate_sample_data.py --size m --output /tmp/sample_csv + +# Convert to Parquet and deploy +/opt/data-analyst/.venv/bin/python -c " +import pandas as pd +from pathlib import Path + +csv_dir = Path('/tmp/sample_csv') +parquet_dir = Path('/data/src_data/parquet') +parquet_dir.mkdir(parents=True, exist_ok=True) + +for f in sorted(csv_dir.glob('*.csv')): + df = pd.read_csv(f) + out = parquet_dir / f'{f.stem}.parquet' + df.to_parquet(out, index=False) + print(f' {f.stem}: {len(df):,} rows -> {out}') +" + +# Clean up CSVs +rm -rf /tmp/sample_csv +``` diff --git a/requirements.txt b/requirements.txt index e68ddb5..81d63f2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -52,3 +52,7 @@ sendgrid>=6.11.0 # Corporate Memory knowledge extraction # anthropic - Claude API client for HAIKU-based knowledge extraction anthropic>=0.39.0 + +# Sample data generation (development/testing) +# faker - realistic synthetic data for demo datasets +faker>=24.0.0 diff --git a/scripts/generate_sample_data.py b/scripts/generate_sample_data.py new file mode 100644 index 0000000..8f5294b --- /dev/null +++ b/scripts/generate_sample_data.py @@ -0,0 +1,940 @@ +#!/usr/bin/env python3 +""" +Sample data generator for AI Data Analyst demo and testing. + +Generates realistic synthetic e-commerce + marketing data as CSV files. 
+Tables: customers, products, campaigns, web_sessions, web_leads, + orders, order_items, payments, support_tickets + +Usage: + python scripts/generate_sample_data.py --size xs --output data/sample + python scripts/generate_sample_data.py --size m --seed 42 + python scripts/generate_sample_data.py --list-sizes +""" + +import argparse +import csv +import json +import logging +import random +import sys +import time +from datetime import date, timedelta +from pathlib import Path +from typing import Any, Generator + +try: + from faker import Faker +except ImportError: + print("ERROR: faker is required. Install with: pip install faker") + sys.exit(1) + +logger = logging.getLogger(__name__) + +# ── Size configurations ──────────────────────────────────────────────── + +SIZE_CONFIGS = { + "xs": { + "label": "Extra Small (demo/dev)", + "customers": 50, + "products": 30, + "campaigns": 10, + "web_sessions": 500, + "web_leads": 50, + "orders": 100, + "support_tickets": 30, + "months": 3, + "estimated_csv_mb": 1, + }, + "s": { + "label": "Small (testing)", + "customers": 500, + "products": 100, + "campaigns": 30, + "web_sessions": 10_000, + "web_leads": 1_000, + "orders": 2_000, + "support_tickets": 500, + "months": 12, + "estimated_csv_mb": 15, + }, + "m": { + "label": "Medium (realistic)", + "customers": 5_000, + "products": 300, + "campaigns": 80, + "web_sessions": 100_000, + "web_leads": 10_000, + "orders": 20_000, + "support_tickets": 5_000, + "months": 24, + "estimated_csv_mb": 150, + }, + "l": { + "label": "Large (stress test)", + "customers": 50_000, + "products": 1_000, + "campaigns": 200, + "web_sessions": 1_000_000, + "web_leads": 100_000, + "orders": 200_000, + "support_tickets": 50_000, + "months": 36, + "estimated_csv_mb": 1500, + }, +} + +# ── Domain data ──────────────────────────────────────────────────────── + +# Monthly seasonality multipliers (index 0 = January) +MONTHLY_SEASONALITY = [0.70, 0.75, 0.85, 0.90, 0.95, 1.00, + 0.90, 0.85, 1.00, 1.10, 1.30, 
1.50] + +# Day-of-week multipliers (Monday=0 .. Sunday=6) +DOW_MULTIPLIER = [1.0, 1.0, 1.0, 1.05, 1.15, 0.80, 0.60] + +# Hour-of-day weights (24 values, peak at 10-14) +HOUR_WEIGHTS = [2, 1, 1, 1, 1, 2, 4, 8, 14, 18, 20, 19, + 18, 17, 16, 15, 14, 12, 10, 8, 6, 5, 4, 3] + +CUSTOMER_SEGMENTS = [ + ("b2c", 0.60), + ("b2b_small", 0.25), + ("b2b_enterprise", 0.15), +] + +COUNTRIES = [ + ("Czech Republic", "CZ", 0.25), ("Germany", "DE", 0.15), + ("United States", "US", 0.12), ("United Kingdom", "GB", 0.10), + ("France", "FR", 0.08), ("Austria", "AT", 0.05), + ("Poland", "PL", 0.05), ("Netherlands", "NL", 0.05), + ("Slovakia", "SK", 0.05), ("Spain", "ES", 0.04), + ("Italy", "IT", 0.03), ("Sweden", "SE", 0.03), +] + +EMAIL_DOMAINS = [ + "gmail.com", "yahoo.com", "outlook.com", "hotmail.com", + "protonmail.com", "icloud.com", "mail.com", +] + +PRODUCT_CATEGORIES = { + "Electronics": { + "items": [ + "Wireless Headphones", "USB-C Charger 65W", "Smart Watch", + "Webcam 4K", "Bluetooth Speaker", "Noise-Cancelling Earbuds", + "Mechanical Keyboard", "27in Monitor QHD", "Laptop Stand", + "Power Bank 20000mAh", "Smart Home Hub", "LED Desk Lamp", + "External SSD 1TB", "Wireless Charging Pad", "Action Camera", + ], + "price_range": (19.99, 1299.99), + "cost_ratio": (0.40, 0.65), + }, + "Clothing": { + "items": [ + "Oxford Shirt Classic", "Slim Chino Pants", "Merino Sweater", + "Leather Belt Premium", "Running Sneakers", "Denim Jacket", + "Polo Shirt Casual", "Winter Down Jacket", "Cotton T-Shirt", + "Formal Dress Shoes", "Yoga Leggings", "Crossbody Bag", + "Wool Blend Coat", "Sport Shorts Quick-Dry", "Canvas Tote Bag", + ], + "price_range": (9.99, 299.99), + "cost_ratio": (0.30, 0.55), + }, + "Home & Garden": { + "items": [ + "Ceramic Mug Set", "Bamboo Cutting Board", "Steel Water Bottle", + "Indoor Plant Pot Set", "LED String Lights 10m", "Bath Towel Set", + "Memory Foam Pillow", "Scented Candle Set", "Kitchen Knife Set 5pc", + "Garden Tool Set", "Bedside Lamp", "Throw Blanket 
Fleece", + "Wall Clock Minimal", "Spice Rack Organizer", "Herb Garden Kit", + ], + "price_range": (7.99, 199.99), + "cost_ratio": (0.35, 0.55), + }, + "Sports & Outdoors": { + "items": [ + "Yoga Mat Premium", "Resistance Bands Set", "Insulated Bottle", + "Hiking Backpack 40L", "Speed Jump Rope", "Foam Roller 45cm", + "Camping Hammock", "Cycling Gloves", "Tennis Balls 4-Pack", + "Swim Goggles Anti-Fog", "Adjustable Dumbbells", "Running Armband", + "Compact Sleeping Bag", "Compression Socks", "Fishing Tackle Box", + ], + "price_range": (8.99, 249.99), + "cost_ratio": (0.35, 0.60), + }, + "Books & Media": { + "items": [ + "Data Science Handbook", "Leadership in Practice", "Creative Writing", + "Python Programming", "World History Atlas", "Cooking Masterclass", + "Mindfulness Journal", "Photography Basics", "Financial Planning", + "Sci-Fi Novel Collection", "Art Supplies Set", "Board Game Classic", + "Puzzle 1000 Pieces", "Drawing Pencil Set 24pc", "Travel Guide Europe", + ], + "price_range": (5.99, 79.99), + "cost_ratio": (0.25, 0.45), + }, + "Beauty & Health": { + "items": [ + "Moisturizer SPF30", "Organic Shampoo 500ml", "Electric Toothbrush", + "Vitamin D3 Supplements", "Essential Oil Set 6pk", "Hair Dryer Pro", + "Sunscreen SPF50", "Protein Powder Vanilla", "Face Mask Pack 10", + "Hand Cream Repair", "Body Lotion Hydrating", "Beard Grooming Set", + "Collagen Drink Mix", "Makeup Brush Set 12pc", "Bath Bomb Gift Set", + ], + "price_range": (4.99, 149.99), + "cost_ratio": (0.20, 0.45), + }, +} + +PRODUCT_VARIANTS = ["Pro", "Ultra", "Lite", "Plus", "Mini", "Max"] +PRODUCT_COLORS = ["Black", "White", "Blue", "Red", "Green", "Grey"] + +CAMPAIGN_CHANNELS = [ + ("email", 0.20), + ("paid_search", 0.22), + ("paid_social", 0.18), + ("organic_social", 0.12), + ("display", 0.12), + ("affiliate", 0.08), + ("retargeting", 0.08), +] + +CAMPAIGN_TEMPLATES = [ + "Spring Sale", "Summer Clearance", "Back to School", "Black Friday", + "Holiday Season", "New Year Push", "Flash Sale", 
"Product Launch", + "Loyalty Rewards", "Newsletter Blast", "Retargeting Wave", + "Brand Awareness", "Category Spotlight", "Win-Back", "Early Access", +] + +LEAD_SOURCES = [ + ("newsletter_signup", 0.30), + ("contact_form", 0.25), + ("demo_request", 0.15), + ("content_download", 0.20), + ("webinar_registration", 0.10), +] + +DEVICES = [("desktop", 0.45), ("mobile", 0.45), ("tablet", 0.10)] +BROWSERS = [("Chrome", 0.64), ("Safari", 0.19), ("Firefox", 0.08), + ("Edge", 0.07), ("Other", 0.02)] + +LANDING_PAGES = [ + "/", "/products", "/products/electronics", "/products/clothing", + "/products/home-garden", "/sale", "/new-arrivals", "/about", + "/blog", "/blog/tips", "/blog/reviews", "/contact", +] + +ORDER_STATUSES = [ + ("delivered", 0.58), ("shipped", 0.15), ("confirmed", 0.10), + ("pending", 0.04), ("cancelled", 0.08), ("returned", 0.05), +] + +ORDER_CHANNELS = [ + ("web", 0.55), ("mobile_app", 0.35), ("phone", 0.05), ("api", 0.05), +] + +PAYMENT_METHODS = [ + ("credit_card", 0.38), ("debit_card", 0.20), ("paypal", 0.18), + ("bank_transfer", 0.12), ("apple_pay", 0.08), ("invoice", 0.04), +] + +TICKET_CATEGORIES = [ + ("question", 0.28), ("complaint", 0.18), ("return_request", 0.14), + ("shipping", 0.16), ("technical_issue", 0.12), ("refund", 0.12), +] + +TICKET_PRIORITIES = [ + ("low", 0.38), ("medium", 0.35), ("high", 0.20), ("critical", 0.07), +] + +TICKET_SUBJECTS = { + "question": [ + "Delivery time estimate", "Product compatibility", "Return policy", + "Bulk order pricing", "Warranty coverage", "Size guide help", + ], + "complaint": [ + "Item arrived damaged", "Wrong product received", "Poor quality", + "Missing items in order", "Packaging insufficient", "Late delivery", + ], + "return_request": [ + "Does not match description", "Changed my mind", "Duplicate order", + "Size does not fit", "Defective product", "Better price elsewhere", + ], + "shipping": [ + "Package not delivered", "Tracking not updating", "Wrong address", + "Expedited shipping request", 
"International shipping", "Lost package", + ], + "technical_issue": [ + "Cannot complete checkout", "Payment error", "Login problem", + "Page not loading", "Mobile app crash", "Coupon not working", + ], + "refund": [ + "Cancelled order refund", "Partial refund request", "Overcharged", + "Refund not received", "Billing discrepancy", "Double charged", + ], +} + +TICKET_CHANNELS = [ + ("email", 0.40), ("chat", 0.30), ("phone", 0.15), ("web_form", 0.15), +] + + +# ── Generator ────────────────────────────────────────────────────────── + +class SampleDataGenerator: + """Generates realistic synthetic e-commerce data as CSV files.""" + + def __init__(self, size: str, seed: int, output_dir: Path): + self.cfg = SIZE_CONFIGS[size] + self.size_name = size + self.rng = random.Random(seed) + self.fake = Faker(["en_US", "de_DE", "cs_CZ", "fr_FR"]) + Faker.seed(seed) + self.output_dir = output_dir + self.row_counts: dict[str, int] = {} + + # Time range + months = self.cfg["months"] + self.end_date = date(2026, 3, 1) + self.start_date = self.end_date - timedelta(days=months * 30) + self.total_days = (self.end_date - self.start_date).days + + # Pre-compute day weights for temporal distribution + self._days: list[date] = [] + self._day_weights: list[float] = [] + for i in range(self.total_days): + d = self.start_date + timedelta(days=i) + growth = 1.0 + 0.5 * (i / max(self.total_days, 1)) + season = MONTHLY_SEASONALITY[d.month - 1] + dow = DOW_MULTIPLIER[d.weekday()] + self._days.append(d) + self._day_weights.append(growth * season * dow) + + # Reference data (populated during generation) + self._customer_ids: list[str] = [] + self._customer_reg_dates: dict[str, date] = {} + self._customer_segments: dict[str, str] = {} + self._product_ids: list[str] = [] + self._product_prices: dict[str, float] = {} + self._product_categories: dict[str, str] = {} + self._campaign_ids: list[str] = [] + self._campaign_ranges: dict[str, tuple[date, date]] = {} + self._order_ids: list[str] = [] + 
self._order_customers: dict[str, str] = {} + self._order_dates: dict[str, date] = {} + self._order_statuses: dict[str, str] = {} + self._order_totals: dict[str, float] = {} + + # ── Helpers ───────────────────────────────────────────────── + + def _weighted_choice(self, options: list[tuple[str, float]]) -> str: + """Pick from [(value, weight), ...] using instance RNG.""" + values, weights = zip(*options) + return self.rng.choices(values, weights=weights, k=1)[0] + + def _random_date(self) -> date: + """Random date weighted by growth + seasonality + day-of-week.""" + return self.rng.choices(self._days, weights=self._day_weights, k=1)[0] + + def _random_datetime(self, d: date | None = None) -> str: + """Random datetime string. If d is None, pick a weighted random date.""" + if d is None: + d = self._random_date() + hour = self.rng.choices(range(24), weights=HOUR_WEIGHTS, k=1)[0] + minute = self.rng.randint(0, 59) + second = self.rng.randint(0, 59) + return f"{d} {hour:02d}:{minute:02d}:{second:02d}" + + def _random_date_after(self, start: date, max_days: int = 30) -> date: + """Random date between start and start + max_days (capped at end_date).""" + end = min(start + timedelta(days=max_days), self.end_date) + delta = (end - start).days + if delta <= 0: + return start + return start + timedelta(days=self.rng.randint(0, delta)) + + def _write_table(self, name: str, fields: list[str], + rows: list[dict] | Generator) -> int: + """Write CSV table from list or generator of dicts.""" + path = self.output_dir / f"{name}.csv" + count = 0 + with open(path, "w", newline="", encoding="utf-8") as f: + writer = csv.DictWriter(f, fieldnames=fields) + writer.writeheader() + for row in rows: + writer.writerow(row) + count += 1 + if count % 250_000 == 0: + logger.info(f" ... 
{count:,} rows written") + self.row_counts[name] = count + return count + + # ── Table generators ─────────────────────────────────────── + + def _generate_customers(self) -> None: + n = self.cfg["customers"] + logger.info(f" Generating {n:,} customers...") + + country_vals = [(c[0], c[2]) for c in COUNTRIES] + rows = [] + for i in range(n): + cid = f"C-{i + 1:06d}" + segment = self._weighted_choice(CUSTOMER_SEGMENTS) + reg_date = self._random_date() + first = self.fake.first_name() + last = self.fake.last_name() + country = self._weighted_choice(country_vals) + + if segment.startswith("b2b"): + company = self.fake.company() + domain = company.lower().split()[0].replace(",", "") + ".com" + email = f"{first.lower()}.{last.lower()}@{domain}" + else: + company = "" + domain = self.rng.choice(EMAIL_DOMAINS) + email = f"{first.lower()}.{last.lower()}@{domain}" + + rows.append({ + "customer_id": cid, + "email": email, + "first_name": first, + "last_name": last, + "company": company, + "country": country, + "city": self.fake.city(), + "segment": segment, + "registration_date": str(reg_date), + "is_active": self.rng.choices([1, 0], weights=[0.85, 0.15])[0], + }) + self._customer_ids.append(cid) + self._customer_reg_dates[cid] = reg_date + self._customer_segments[cid] = segment + + self._write_table("customers", list(rows[0].keys()), rows) + + def _generate_products(self) -> None: + n = self.cfg["products"] + logger.info(f" Generating {n:,} products...") + + # Build product pool: base items + variants for larger sizes + pool: list[tuple[str, str, str]] = [] # (name, category, subcategory) + categories = list(PRODUCT_CATEGORIES.keys()) + for cat in categories: + for item in PRODUCT_CATEGORIES[cat]["items"]: + pool.append((item, cat, cat)) + + # Add variants if we need more than base pool + while len(pool) < n: + cat = self.rng.choice(categories) + item = self.rng.choice(PRODUCT_CATEGORIES[cat]["items"]) + variant = self.rng.choice(PRODUCT_VARIANTS) + color = 
self.rng.choice(PRODUCT_COLORS) + name = f"{item} {variant} - {color}" + pool.append((name, cat, cat)) + + self.rng.shuffle(pool) + pool = pool[:n] + + rows = [] + for i, (name, category, _subcat) in enumerate(pool): + pid = f"P-{i + 1:05d}" + cat_cfg = PRODUCT_CATEGORIES[category] + price = round(self.rng.uniform(*cat_cfg["price_range"]), 2) + cost_ratio = self.rng.uniform(*cat_cfg["cost_ratio"]) + cost = round(price * cost_ratio, 2) + + rows.append({ + "product_id": pid, + "sku": f"SKU-{self.rng.randint(10000, 99999)}", + "name": name, + "category": category, + "price": price, + "cost": cost, + "weight_kg": round(self.rng.uniform(0.1, 15.0), 2), + "is_active": self.rng.choices([1, 0], weights=[0.90, 0.10])[0], + "created_at": str(self._random_date()), + }) + self._product_ids.append(pid) + self._product_prices[pid] = price + self._product_categories[pid] = category + + self._write_table("products", list(rows[0].keys()), rows) + + def _generate_campaigns(self) -> None: + n = self.cfg["campaigns"] + logger.info(f" Generating {n:,} campaigns...") + + rows = [] + for i in range(n): + cid = f"CMP-{i + 1:04d}" + channel = self._weighted_choice(CAMPAIGN_CHANNELS) + start = self._random_date() + duration = self.rng.randint(7, 60) + end = min(start + timedelta(days=duration), self.end_date) + is_past = end < self.end_date - timedelta(days=7) + + budget = round(self.rng.uniform(500, 25000), 2) + spend_ratio = self.rng.uniform(0.6, 1.1) if is_past else self.rng.uniform(0.2, 0.7) + spend = round(budget * min(spend_ratio, 1.0), 2) + impressions = int(spend * self.rng.uniform(80, 500)) + ctr = self.rng.uniform(0.005, 0.08) + clicks = int(impressions * ctr) + + template = self.rng.choice(CAMPAIGN_TEMPLATES) + name = f"{template} - {channel.replace('_', ' ').title()} {start.year}" + + status = "completed" if is_past else self.rng.choice(["active", "paused"]) + + rows.append({ + "campaign_id": cid, + "name": name, + "channel": channel, + "status": status, + "budget": budget, + 
"spend": spend, + "impressions": impressions, + "clicks": clicks, + "start_date": str(start), + "end_date": str(end), + "target_segment": self._weighted_choice(CUSTOMER_SEGMENTS), + }) + self._campaign_ids.append(cid) + self._campaign_ranges[cid] = (start, end) + + self._write_table("campaigns", list(rows[0].keys()), rows) + + def _generate_web_sessions(self) -> None: + n = self.cfg["web_sessions"] + logger.info(f" Generating {n:,} web sessions...") + + fields = [ + "session_id", "visitor_id", "customer_id", "campaign_id", + "started_at", "duration_seconds", "pages_viewed", + "device_type", "browser", "country", "landing_page", "is_bounce", + ] + country_vals = [(c[0], c[2]) for c in COUNTRIES] + + def gen_rows() -> Generator[dict[str, Any], None, None]: + for i in range(n): + sid = f"S-{i + 1:08d}" + d = self._random_date() + + # 40% sessions from logged-in customers + customer_id = "" + if self.rng.random() < 0.40 and self._customer_ids: + customer_id = self.rng.choice(self._customer_ids) + + # 25% sessions attributed to a campaign + campaign_id = "" + if self.rng.random() < 0.25 and self._campaign_ids: + # Pick a campaign that was active on this date + candidates = [ + c for c in self._campaign_ids + if self._campaign_ranges[c][0] <= d <= self._campaign_ranges[c][1] + ] + if candidates: + campaign_id = self.rng.choice(candidates) + + is_bounce = self.rng.random() < 0.35 + if is_bounce: + duration = self.rng.randint(5, 30) + pages = 1 + else: + duration = self.rng.randint(30, 900) + pages = self.rng.randint(2, 15) + + yield { + "session_id": sid, + "visitor_id": f"V-{self.rng.randint(1, n // 3):08d}", + "customer_id": customer_id, + "campaign_id": campaign_id, + "started_at": self._random_datetime(d), + "duration_seconds": duration, + "pages_viewed": pages, + "device_type": self._weighted_choice(DEVICES), + "browser": self._weighted_choice(BROWSERS), + "country": self._weighted_choice(country_vals), + "landing_page": self.rng.choice(LANDING_PAGES), + "is_bounce": 
int(is_bounce), + } + + self._write_table("web_sessions", fields, gen_rows()) + + def _generate_web_leads(self) -> None: + n = self.cfg["web_leads"] + logger.info(f" Generating {n:,} web leads...") + + fields = [ + "lead_id", "customer_id", "email", "source", "campaign_id", + "created_at", "status", "converted_at", + ] + lead_statuses = [ + ("new", 0.35), ("contacted", 0.20), ("qualified", 0.15), + ("converted", 0.18), ("lost", 0.12), + ] + + rows = [] + for i in range(n): + lid = f"L-{i + 1:06d}" + d = self._random_date() + status = self._weighted_choice(lead_statuses) + + # 55% from existing customers + customer_id = "" + email = self.fake.email() + if self.rng.random() < 0.55 and self._customer_ids: + customer_id = self.rng.choice(self._customer_ids) + + campaign_id = "" + if self.rng.random() < 0.40 and self._campaign_ids: + campaign_id = self.rng.choice(self._campaign_ids) + + converted_at = "" + if status == "converted": + converted_at = self._random_datetime( + self._random_date_after(d, max_days=14) + ) + + rows.append({ + "lead_id": lid, + "customer_id": customer_id, + "email": email, + "source": self._weighted_choice(LEAD_SOURCES), + "campaign_id": campaign_id, + "created_at": self._random_datetime(d), + "status": status, + "converted_at": converted_at, + }) + + self._write_table("web_leads", fields, rows) + + def _generate_orders_and_items(self) -> None: + n_orders = self.cfg["orders"] + logger.info(f" Generating {n_orders:,} orders + order items...") + + # Customer activity weights (Pareto-like distribution) + activity = [self.rng.paretovariate(1.2) for _ in self._customer_ids] + + order_fields = [ + "order_id", "customer_id", "created_at", "status", + "items_total", "discount_amount", "shipping_amount", + "total_amount", "channel", + ] + item_fields = [ + "order_item_id", "order_id", "product_id", "quantity", + "unit_price", "discount_percent", "line_total", + ] + + order_rows = [] + item_rows = [] + item_seq = 0 + + for i in range(n_orders): + oid = 
def _generate_orders_and_items(self) -> None:
    """Generate the ``orders`` and ``order_items`` tables together.

    Creates ``self.cfg["orders"]`` orders. Each order picks a customer with a
    Pareto-distributed activity weight, is dated via ``_random_date_after``
    from that customer's registration date, and receives 1-5 line items
    (1-8 for segments starting with ``"b2b"``), with fewer items favoured.
    Order totals are ``items_total - discount_amount + shipping_amount``;
    shipping is only charged when ``items_total`` is below 100.

    Besides writing both tables, the method records per-order customer, date,
    status and total on the instance for ``_generate_payments`` and
    ``_generate_support_tickets``.
    """
    from itertools import accumulate

    n_orders = self.cfg["orders"]
    logger.info(f" Generating {n_orders:,} orders + order items...")

    # Customer activity weights (Pareto-like distribution)
    activity = [self.rng.paretovariate(1.2) for _ in self._customer_ids]
    # random.choices() rebuilds this cumulative list on every call when given
    # ``weights``; building it once and passing ``cum_weights`` draws the
    # exact same customers without O(customers) work per order.
    cum_activity = list(accumulate(activity))

    order_fields = [
        "order_id", "customer_id", "created_at", "status",
        "items_total", "discount_amount", "shipping_amount",
        "total_amount", "channel",
    ]
    item_fields = [
        "order_item_id", "order_id", "product_id", "quantity",
        "unit_price", "discount_percent", "line_total",
    ]

    order_rows = []
    item_rows = []
    item_seq = 0

    for i in range(n_orders):
        oid = f"ORD-{i + 1:07d}"
        cust_id = self.rng.choices(
            self._customer_ids, cum_weights=cum_activity, k=1
        )[0]
        reg_date = self._customer_reg_dates[cust_id]
        segment = self._customer_segments[cust_id]

        # Order date: after customer registration
        order_date = self._random_date_after(
            reg_date, max_days=(self.end_date - reg_date).days
        )
        status = self._weighted_choice(ORDER_STATUSES)

        # B2B orders tend to have more items
        max_items = 8 if segment.startswith("b2b") else 5
        item_weights = list(range(max_items, 0, -1))  # favor fewer items
        n_items = self.rng.choices(
            range(1, max_items + 1), weights=item_weights, k=1
        )[0]

        items_total = 0.0
        for _j in range(n_items):
            item_seq += 1
            pid = self.rng.choice(self._product_ids)
            qty = self.rng.choices([1, 2, 3, 4, 5],
                                   weights=[60, 20, 10, 5, 5], k=1)[0]
            if segment == "b2b_enterprise":
                # Enterprise buyers order in multiples of the base quantity.
                qty *= self.rng.randint(1, 5)
            unit_price = self._product_prices[pid]
            disc_pct = self.rng.choices(
                [0, 5, 10, 15, 20],
                weights=[50, 20, 15, 10, 5], k=1
            )[0]
            line_total = round(unit_price * qty * (1 - disc_pct / 100), 2)
            items_total += line_total

            item_rows.append({
                "order_item_id": f"OI-{item_seq:08d}",
                "order_id": oid,
                "product_id": pid,
                "quantity": qty,
                "unit_price": unit_price,
                "discount_percent": disc_pct,
                "line_total": line_total,
            })

        # Order-level discount of up to 5% on top of per-line discounts.
        discount_amount = round(items_total * self.rng.uniform(0, 0.05), 2)
        shipping = round(self.rng.uniform(0, 15.99), 2) if items_total < 100 else 0.0
        total = round(items_total - discount_amount + shipping, 2)

        order_rows.append({
            "order_id": oid,
            "customer_id": cust_id,
            "created_at": self._random_datetime(order_date),
            "status": status,
            "items_total": round(items_total, 2),
            "discount_amount": discount_amount,
            "shipping_amount": shipping,
            "total_amount": total,
            "channel": self._weighted_choice(ORDER_CHANNELS),
        })
        # Remembered for the payment and support-ticket generators.
        self._order_ids.append(oid)
        self._order_customers[oid] = cust_id
        self._order_dates[oid] = order_date
        self._order_statuses[oid] = status
        self._order_totals[oid] = total

    self._write_table("orders", order_fields, order_rows)
    self._write_table("order_items", item_fields, item_rows)
def _generate_payments(self) -> None:
    """Generate the ``payments`` table from the orders recorded on the instance.

    Every order gets one final payment whose status follows the order status:
    ``cancelled`` -> ``cancelled`` (never completed), ``returned`` ->
    ``refunded`` (completion date up to 14 days out), anything else ->
    ``completed`` (up to 3 days out). With probability 0.05 a ``failed``
    attempt using the same method and amount is written first.
    """
    logger.info(f" Generating payments for {len(self._order_ids):,} orders...")

    columns = [
        "payment_id", "order_id", "customer_id", "amount", "currency",
        "method", "status", "created_at", "completed_at",
    ]
    # order status -> (payment status, max days until completion).
    # ``None`` means the payment never completes.
    outcomes = {
        "cancelled": ("cancelled", None),
        "returned": ("refunded", 14),
    }

    payments = []
    counter = 0
    for order_id in self._order_ids:
        customer = self._order_customers[order_id]
        placed_on = self._order_dates[order_id]
        is_b2b = self._customer_segments[customer].startswith("b2b")

        # B2B more likely to use invoice/bank_transfer
        if is_b2b and self.rng.random() < 0.40:
            pay_method = self.rng.choice(["bank_transfer", "invoice"])
        else:
            pay_method = self._weighted_choice(PAYMENT_METHODS)

        # Columns shared by the failed attempt and the final payment.
        common = {
            "order_id": order_id,
            "customer_id": customer,
            "amount": self._order_totals[order_id],
            "currency": "EUR",
            "method": pay_method,
        }

        # 5% chance of a failed payment attempt first
        if self.rng.random() < 0.05:
            counter += 1
            payments.append({
                "payment_id": f"PAY-{counter:07d}",
                **common,
                "status": "failed",
                "created_at": self._random_datetime(placed_on),
                "completed_at": "",
            })

        counter += 1
        final_status, settle_days = outcomes.get(
            self._order_statuses[order_id], ("completed", 3)
        )
        if settle_days is None:
            finished = ""
        else:
            finished = self._random_datetime(
                self._random_date_after(placed_on, max_days=settle_days)
            )

        payments.append({
            "payment_id": f"PAY-{counter:07d}",
            **common,
            "status": final_status,
            "created_at": self._random_datetime(placed_on),
            "completed_at": finished,
        })

    self._write_table("payments", columns, payments)
def _generate_support_tickets(self) -> None:
    """Generate the ``support_tickets`` table and hand it to ``_write_table``.

    Produces ``self.cfg["support_tickets"]`` rows for randomly chosen
    customers. For each ticket:

    * with probability 0.60 it is linked to an order: one of the customer's
      own orders when they have any, otherwise any order;
    * with probability 0.75 it is resolved (``resolved``/``closed``), gets a
      ``resolved_at`` within 1-14 days and a satisfaction score skewed toward
      4-5; otherwise it stays ``open``/``in_progress``/``waiting_customer``;
    * every ticket not in status ``open`` gets a ``first_response_at`` whose
      delay range depends on priority and is capped at ``self.end_date``.
    """
    n = self.cfg["support_tickets"]
    logger.info(f" Generating {n:,} support tickets...")

    fields = [
        "ticket_id", "customer_id", "order_id", "category", "priority",
        "status", "channel", "subject", "created_at", "first_response_at",
        "resolved_at", "satisfaction_score",
    ]

    # Response time window in hours, (min, max), per priority.
    response_hours = {
        "critical": (0.5, 4), "high": (1, 12),
        "medium": (4, 48), "low": (8, 96),
    }

    # Index orders by customer once. Scanning every order for every ticket
    # was O(tickets x orders); insertion order is kept, so the draws below
    # pick exactly the same orders as the scan did.
    orders_by_customer = {}
    for oid in self._order_ids:
        orders_by_customer.setdefault(self._order_customers[oid], []).append(oid)

    rows = []
    for i in range(n):
        tid = f"TKT-{i + 1:06d}"
        cust_id = self.rng.choice(self._customer_ids)
        category = self._weighted_choice(TICKET_CATEGORIES)
        priority = self._weighted_choice(TICKET_PRIORITIES)
        subject = self.rng.choice(TICKET_SUBJECTS[category])
        d = self._random_date()

        # 60% linked to an order
        order_id = ""
        if self.rng.random() < 0.60 and self._order_ids:
            # Pick an order from this customer if possible.
            # NOTE(review): the fallback links the ticket to another
            # customer's order - confirm that is acceptable for the demo data.
            cust_orders = orders_by_customer.get(cust_id)
            order_id = self.rng.choice(cust_orders if cust_orders else self._order_ids)

        # Status progression
        is_resolved = self.rng.random() < 0.75
        if is_resolved:
            status = self.rng.choice(["resolved", "closed"])
        else:
            status = self.rng.choice(["open", "in_progress", "waiting_customer"])

        first_response = ""
        resolved_at = ""
        satisfaction = ""

        if status != "open":
            resp_delta = timedelta(hours=self.rng.uniform(*response_hours[priority]))
            # Only the whole-day part of the delay moves the date; the result
            # is capped at the end of the generated period.
            first_response = self._random_datetime(
                min(d + timedelta(days=int(resp_delta.total_seconds() // 86400)),
                    self.end_date)
            )

        if is_resolved:
            resolve_days = self.rng.randint(1, 14)
            resolved_at = self._random_datetime(
                self._random_date_after(d, max_days=resolve_days)
            )
            # Satisfaction: skewed toward 4-5 for resolved
            satisfaction = self.rng.choices(
                [1, 2, 3, 4, 5],
                weights=[5, 8, 15, 35, 37], k=1
            )[0]

        rows.append({
            "ticket_id": tid,
            "customer_id": cust_id,
            "order_id": order_id,
            "category": category,
            "priority": priority,
            "status": status,
            "channel": self._weighted_choice(TICKET_CHANNELS),
            "subject": subject,
            "created_at": self._random_datetime(d),
            "first_response_at": first_response,
            "resolved_at": resolved_at,
            "satisfaction_score": satisfaction,
        })

    self._write_table("support_tickets", fields, rows)
def run(self) -> dict[str, Any]:
    """Generate all tables, write ``_manifest.json`` and return the manifest.

    Creates ``self.output_dir`` if needed, then runs the table generators in
    dependency order: customers, products and campaigns first, then sessions
    and leads, then orders with items, and finally payments and support
    tickets, which read the order data recorded on the instance.

    Returns:
        The manifest dict that was written to ``_manifest.json``: generator
        name, size preset, seed, date range, per-table row counts, total row
        count and elapsed seconds.
    """
    self.output_dir.mkdir(parents=True, exist_ok=True)
    # Monotonic clock: an elapsed time must not be affected by wall-clock
    # adjustments made during the run.
    t0 = time.perf_counter()

    logger.info(f"Generating sample data (size: {self.size_name})")
    logger.info(f" Period: {self.start_date} to {self.end_date} "
                f"({self.cfg['months']} months)")
    logger.info(f" Output: {self.output_dir}/")

    self._generate_customers()
    self._generate_products()
    self._generate_campaigns()
    self._generate_web_sessions()
    self._generate_web_leads()
    self._generate_orders_and_items()
    self._generate_payments()
    self._generate_support_tickets()

    elapsed = time.perf_counter() - t0
    total_rows = sum(self.row_counts.values())

    # ``rng.getstate()[1][0]`` is the first word of the Mersenne Twister
    # state, not the seed: it is the same constant right after seeding and an
    # unrelated number once values have been drawn. Report the configured seed.
    # NOTE(review): assumes __init__ stores the seed as ``self.seed``; until
    # it does, the manifest records null rather than a misleading number.
    seed = getattr(self, "seed", None)

    manifest = {
        "generator": "generate_sample_data.py",
        "size": self.size_name,
        "seed": seed,
        "date_range": {
            "start": str(self.start_date),
            "end": str(self.end_date),
        },
        "tables": self.row_counts,
        "total_rows": total_rows,
        "elapsed_seconds": round(elapsed, 1),
    }
    manifest_path = self.output_dir / "_manifest.json"
    with open(manifest_path, "w", encoding="utf-8") as f:
        json.dump(manifest, f, indent=2)

    logger.info("")
    logger.info(f"Done! {len(self.row_counts)} tables, "
                f"{total_rows:,} total rows in {elapsed:.1f}s")
    logger.info(f"Manifest: {manifest_path}")
    return manifest
class TestSizeConfigs:
    """Sanity checks on the ``SIZE_CONFIGS`` preset table."""

    def test_all_sizes_have_required_keys(self):
        """Every preset defines each key the generator reads."""
        needed = {
            "customers", "products", "campaigns", "web_sessions",
            "web_leads", "orders", "support_tickets", "months",
        }
        for preset, settings in SIZE_CONFIGS.items():
            absent = needed.difference(settings)
            assert not absent, f"Size '{preset}' missing keys: {absent}"

    def test_sizes_scale_monotonically(self):
        """Volume settings never shrink from one preset to the next.

        NOTE(review): equal neighbouring values are accepted; tighten the
        comparison to ``<`` if strict growth is the intent.
        """
        presets = list(SIZE_CONFIGS)
        for key in ("customers", "products", "orders", "web_sessions"):
            series = [SIZE_CONFIGS[name][key] for name in presets]
            non_decreasing = all(
                low <= high for low, high in zip(series, series[1:])
            )
            assert non_decreasing, (
                f"{key} does not scale monotonically across sizes"
            )
class TestXSGeneration:
    """Full generation test with xs size (fast)."""

    @pytest.fixture(autouse=True)
    def generate(self, output_dir: Path):
        """Run the xs generator once per test and keep its manifest."""
        self.output_dir = output_dir
        gen = SampleDataGenerator(size="xs", seed=42, output_dir=output_dir)
        self.manifest = gen.run()

    def test_all_csv_files_created(self):
        """Exactly the nine expected tables are written as CSV files."""
        expected = {
            "customers", "products", "campaigns", "web_sessions",
            "web_leads", "orders", "order_items", "payments",
            "support_tickets",
        }
        csv_files = {p.stem for p in self.output_dir.glob("*.csv")}
        assert expected == csv_files

    def test_manifest_created(self):
        """The manifest file exists and carries the basic fields."""
        manifest_path = self.output_dir / "_manifest.json"
        assert manifest_path.exists()
        # The generator writes the manifest as UTF-8; read it back the same
        # way instead of relying on the platform default encoding.
        data = json.loads(manifest_path.read_text(encoding="utf-8"))
        assert data["size"] == "xs"
        assert "tables" in data
        assert data["total_rows"] > 0

    def test_row_counts_match_config(self):
        """Row counts for directly specified tables should match config."""
        cfg = SIZE_CONFIGS["xs"]
        for table in ["customers", "products", "campaigns", "web_sessions",
                      "web_leads", "orders", "support_tickets"]:
            assert self.manifest["tables"][table] == cfg[table], (
                f"{table}: expected {cfg[table]}, got {self.manifest['tables'][table]}"
            )

    def test_order_items_derived(self):
        """Order items should be > orders (most orders have multiple items)."""
        assert self.manifest["tables"]["order_items"] > self.manifest["tables"]["orders"]

    def test_payments_at_least_one_per_order(self):
        """Payments should be >= orders (some have failed retries)."""
        assert self.manifest["tables"]["payments"] >= self.manifest["tables"]["orders"]

    def test_csv_headers_not_empty(self):
        """Every CSV should have a header and at least one data row."""
        for csv_path in self.output_dir.glob("*.csv"):
            # The csv module expects files opened with newline="" so that it
            # handles line endings itself.
            with open(csv_path, newline="") as f:
                reader = csv.reader(f)
                header = next(reader)
                assert len(header) > 0, f"{csv_path.name}: empty header"
                first_row = next(reader, None)
                assert first_row is not None, f"{csv_path.name}: no data rows"
class TestReferentialIntegrity:
    """Verify foreign key relationships across tables."""

    @pytest.fixture(autouse=True)
    def generate(self, output_dir: Path):
        """Generate the xs dataset and load every CSV into ``self.tables``."""
        self.output_dir = output_dir
        gen = SampleDataGenerator(size="xs", seed=123, output_dir=output_dir)
        gen.run()
        self.tables = {}
        for csv_path in output_dir.glob("*.csv"):
            # newline="" lets the csv module handle line endings itself.
            with open(csv_path, newline="") as f:
                self.tables[csv_path.stem] = list(csv.DictReader(f))

    def _get_ids(self, table: str, column: str) -> set[str]:
        """Return every value of *column* in *table*."""
        return {row[column] for row in self.tables[table]}

    def _get_fk_values(self, table: str, column: str) -> set[str]:
        """Return the non-empty values of *column* in *table*."""
        return {row[column] for row in self.tables[table] if row[column]}

    def _assert_no_orphans(
        self,
        child: str,
        fk_column: str,
        parent: str,
        pk_column: str,
        message: str,
    ) -> None:
        """Fail with *message* if *child* references keys missing from *parent*."""
        orphans = self._get_fk_values(child, fk_column) - self._get_ids(parent, pk_column)
        assert not orphans, f"{message}: {orphans}"

    def test_orders_reference_valid_customers(self):
        self._assert_no_orphans(
            "orders", "customer_id", "customers", "customer_id",
            "Orders reference non-existent customers",
        )

    def test_order_items_reference_valid_orders(self):
        self._assert_no_orphans(
            "order_items", "order_id", "orders", "order_id",
            "Order items reference non-existent orders",
        )

    def test_order_items_reference_valid_products(self):
        self._assert_no_orphans(
            "order_items", "product_id", "products", "product_id",
            "Order items reference non-existent products",
        )

    def test_payments_reference_valid_orders(self):
        self._assert_no_orphans(
            "payments", "order_id", "orders", "order_id",
            "Payments reference non-existent orders",
        )

    def test_payments_reference_valid_customers(self):
        self._assert_no_orphans(
            "payments", "customer_id", "customers", "customer_id",
            "Payments reference non-existent customers",
        )

    def test_support_tickets_reference_valid_customers(self):
        self._assert_no_orphans(
            "support_tickets", "customer_id", "customers", "customer_id",
            "Tickets reference non-existent customers",
        )

    def test_support_tickets_reference_valid_orders(self):
        # order_id is optional on tickets; empty values are skipped.
        self._assert_no_orphans(
            "support_tickets", "order_id", "orders", "order_id",
            "Tickets reference non-existent orders",
        )
class TestDeterminism:
    """Verify reproducibility with same seed."""

    def test_same_seed_produces_same_output(self, tmp_path: Path):
        """Two runs with one seed write the same files with identical bytes."""
        dir1 = tmp_path / "run1"
        dir2 = tmp_path / "run2"

        SampleDataGenerator(size="xs", seed=99, output_dir=dir1).run()
        SampleDataGenerator(size="xs", seed=99, output_dir=dir2).run()

        names1 = sorted(p.name for p in dir1.glob("*.csv"))
        names2 = sorted(p.name for p in dir2.glob("*.csv"))
        # Without these two checks the loop below would pass vacuously if no
        # CSVs were written, and would miss extra files in the second run.
        assert names1, "no CSV files were generated"
        assert names1 == names2, "runs produced different sets of CSV files"

        for name in names1:
            # Compare raw bytes so the check does not depend on the platform's
            # default text encoding or newline translation.
            content1 = (dir1 / name).read_bytes()
            content2 = (dir2 / name).read_bytes()
            assert content1 == content2, f"{name} differs between runs"

    def test_different_seed_produces_different_output(self, tmp_path: Path):
        """Different seeds yield a different customers table."""
        dir1 = tmp_path / "seed1"
        dir2 = tmp_path / "seed2"

        SampleDataGenerator(size="xs", seed=1, output_dir=dir1).run()
        SampleDataGenerator(size="xs", seed=2, output_dir=dir2).run()

        content1 = (dir1 / "customers.csv").read_bytes()
        content2 = (dir2 / "customers.csv").read_bytes()
        assert content1 != content2