From 44bf43535b95ec80ef9194714788a84ee6e97a60 Mon Sep 17 00:00:00 2001
From: Petr <petr@keboola.com>
Date: Tue, 10 Mar 2026 12:31:14 +0100
Subject: [PATCH] Add sample data generator with 9 e-commerce tables

Synthetic data generator for demo/testing without real data adapter:
- 9 tables: customers, products, campaigns, web_sessions, web_leads,
  orders, order_items, payments, support_tickets
- 4 size presets: xs (1MB), s (15MB), m (150MB), l (1.5GB)
- Realistic patterns: seasonality, Pareto customer distribution,
  segment-based behavior, referential integrity
- Deterministic output via --seed parameter

Also: docs/sample-data.md, updated auto-install.md with Step 6,
updated CLAUDE.md (email auth provider, dual-repo architecture)
---
 CLAUDE.md                          |  17 +-
 docs/auto-install.md               |  60 +-
 docs/sample-data.md                | 196 ++++++
 requirements.txt                   |   4 +
 scripts/generate_sample_data.py    | 940 +++++++++++++++++++++++++++++
 tests/test_generate_sample_data.py | 173 ++++++
 6 files changed, 1385 insertions(+), 5 deletions(-)
 create mode 100644 docs/sample-data.md
 create mode 100644 scripts/generate_sample_data.py
 create mode 100644 tests/test_generate_sample_data.py
diff --git a/CLAUDE.md b/CLAUDE.md
index 4e611b9..b2fee42 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -42,7 +42,7 @@ Ask the user for:
 │   └── jira/               # Jira webhook connector
 ├── auth/                   # Authentication providers (pluggable)
 │   ├── google/             # Google OAuth provider
-│   ├── password/           # Email/password provider
+│   ├── email/              # Email magic link provider
 │   └── desktop/            # Desktop JWT provider (API-only)
 ├── services/               # Standalone services (own systemd units)
 │   ├── telegram_bot/       # Telegram notification bot
@@ -88,6 +88,15 @@ Environment variables go in `.env` (never committed to git).
 
 Data schema is defined in `docs/data_description.md` (YAML blocks in markdown).
 
+### Dual-Repo Deployment
+Production uses two repos on the server:
+- **OSS repo** (`/opt/data-analyst/repo/`): application code, no secrets or config
+- **Instance repo** (`/opt/data-analyst/instance/`): private config, secrets template, data schema
+
+Symlinks bridge them: `repo/config/instance.yaml -> instance/config/instance.yaml`.
+Each repo has its own SSH deploy key (github-oss / github-cfg aliases).
+See `docs/auto-install.md` for full setup guide.
+
 ## Development
 
 ```bash
@@ -117,7 +126,7 @@ Pluggable data source connectors in `connectors/`:
 ### Authentication
 Pluggable auth providers in `auth/`:
 - **Google** (`google`): OAuth via Google
-- **Password** (`password`): Email/password with magic links
+- **Email** (`email`): Email magic link (itsdangerous token, no password needed)
 - **Desktop** (`desktop`): JWT for desktop app API
 - New provider = `auth/<name>/provider.py` implementing `AuthProvider`
 
@@ -164,7 +173,9 @@ When reopening the project in Claude Code:
 ### Auth Provider Pattern
 - ABC: `AuthProvider` class in `auth/__init__.py`
 - Discovery: `discover_providers()` scans `auth/*/provider.py`
-- Providers: google, password, desktop (each exports `provider` instance)
+- Providers: google, email, desktop (each exports `provider` instance)
+- Email provider: uses `itsdangerous.URLSafeTimedSerializer` for magic link tokens
+- Multi-domain: `auth.allowed_domain` in instance.yaml supports comma-separated domains
 - Session contract: all providers set `session["user"] = {"email", "name", "picture"}`
 
 ### Service Pattern
diff --git a/docs/auto-install.md b/docs/auto-install.md
index 6bfafa7..7201d7e 100644
--- a/docs/auto-install.md
+++ b/docs/auto-install.md
@@ -341,9 +341,63 @@ After server is set up, analysts self-onboard via the webapp:
 4. User runs `claude` in their project folder, pastes setup instructions
 5. Claude Code configures SSH, rsyncs data, sets up Python + DuckDB
 
-## Step 6: Data Source (Next)
+## Step 6: Sample Data (Try Without a Data Adapter)
 
-Configure a real data source in `instance/config/instance.yaml`:
+Before connecting a real data source, you can load sample data to verify the full pipeline
+(Parquet files, DuckDB, analyst rsync, Claude Code analysis).
+
+```bash
+cd /opt/data-analyst/repo
+
+# Install generator dependency
+/opt/data-analyst/.venv/bin/pip install faker
+
+# Generate synthetic e-commerce data (size m: ~20K orders, 100K sessions)
+/opt/data-analyst/.venv/bin/python scripts/generate_sample_data.py \
+    --size m --output /tmp/sample_csv --seed 42
+
+# Convert CSVs to Parquet and deploy to data directory
+/opt/data-analyst/.venv/bin/python -c "
+import pandas as pd
+from pathlib import Path
+
+csv_dir = Path('/tmp/sample_csv')
+parquet_dir = Path('/data/src_data/parquet')
+parquet_dir.mkdir(parents=True, exist_ok=True)
+
+for f in sorted(csv_dir.glob('*.csv')):
+    df = pd.read_csv(f)
+    out = parquet_dir / f'{f.stem}.parquet'
+    df.to_parquet(out, index=False)
+    print(f'  {f.stem}: {len(df):,} rows -> {out}')
+"
+
+# Set correct permissions
+chown -R root:data-ops /data/src_data/parquet
+chmod -R 2775 /data/src_data/parquet
+
+# Clean up temporary CSVs
+rm -rf /tmp/sample_csv
+```
+
+Available sizes: `xs` (50 customers, ~1 MB), `s` (500, ~15 MB), `m` (5K, ~150 MB), `l` (50K, ~1.5 GB).
+
+The sample data covers 9 tables: customers, products, campaigns, web_sessions, web_leads,
+orders, order_items, payments, support_tickets. See `docs/sample-data.md` for the full
+data model, table reference, and built-in analytical patterns.
+
+### Step 6 Checklist
+
+| # | Check | Expected |
+|---|-------|----------|
+| 6.1 | Parquet files | `ls /data/src_data/parquet/*.parquet` shows 9 files |
+| 6.2 | Permissions | Files owned by root:data-ops, group-readable |
+| 6.3 | Analyst sync | Analyst can rsync parquet files to local machine |
+| 6.4 | DuckDB loads | `SELECT count(*) FROM read_parquet('orders.parquet')` returns rows |
+
+## Step 7: Real Data Source (Production)
+
+When ready, replace sample data with a real data source adapter in `instance/config/instance.yaml`:
 
 ```yaml
 data_source:
@@ -356,6 +410,8 @@ data_source:
 
 Add the token to `.env` and create `config/data_description.md` with table schemas.
 
+Other planned adapters: BigQuery, CSV import.
+
 ## Deployment Workflow (Ongoing)
 
 ### Update OSS code
diff --git a/docs/sample-data.md b/docs/sample-data.md
new file mode 100644
index 0000000..77dcd34
--- /dev/null
+++ b/docs/sample-data.md
@@ -0,0 +1,196 @@
+# Sample Data Generator
+
+Generate realistic synthetic e-commerce and marketing data for demo, testing, and development without connecting a real data source adapter.
+
+## Quick Start
+
+```bash
+# Install dependency
+pip install faker
+
+# Generate small dataset (default)
+python scripts/generate_sample_data.py --size s --output data/sample
+
+# List available sizes
+python scripts/generate_sample_data.py --list-sizes
+```
+
+## Data Model
+
+9 interrelated tables covering the full e-commerce funnel:
+
+```
+┌──────────────┐     ┌──────────────┐     ┌──────────────┐
+│  campaigns   │     │  customers   │     │   products   │
+│  CMP-0001    │     │  C-000001    │     │   P-00001    │
+└──────┬───────┘     └──────┬───────┘     └──────┬───────┘
+       │                    │                    │
+       ▼                    ▼                    │
+┌──────────────┐     ┌──────────────┐            │
+│ web_sessions │     │  web_leads   │            │
+│  S-00000001  │     │  L-000001    │            │
+└──────────────┘     └──────────────┘            │
+                            │                    │
+                            ▼                    ▼
+                     ┌──────────────┐     ┌──────────────┐
+                     │   orders     │────▶│ order_items  │
+                     │ ORD-0000001  │     │ OI-00000001  │
+                     └──────┬───────┘     └──────────────┘
+                            │
+                     ┌──────┴───────┐
+                     ▼              ▼
+              ┌──────────────┐ ┌──────────────┐
+              │  payments    │ │   support    │
+              │ PAY-0000001  │ │   tickets    │
+              └──────────────┘ │ TKT-000001   │
+                               └──────────────┘
+```
+
+### Table Reference
+
+| Table | Key Columns | Foreign Keys |
+|-------|-------------|--------------|
+| **customers** | customer_id, email, segment, country, registration_date | - |
+| **products** | product_id, name, category, price, cost | - |
+| **campaigns** | campaign_id, channel, budget, spend, impressions, clicks | - |
+| **web_sessions** | session_id, started_at, duration_seconds, device_type | customer_id?, campaign_id? |
+| **web_leads** | lead_id, source, status, converted_at | customer_id?, campaign_id? |
+| **orders** | order_id, status, total_amount, channel | customer_id |
+| **order_items** | order_item_id, quantity, unit_price, line_total | order_id, product_id |
+| **payments** | payment_id, amount, method, status | order_id, customer_id |
+| **support_tickets** | ticket_id, category, priority, satisfaction_score | customer_id, order_id? |
+
+`?` = nullable (not every record has a value)
+
+### Customer Segments
+
+- **b2c** (60%): Individual consumers, smaller order values
+- **b2b_small** (25%): Small business buyers, moderate volumes
+- **b2b_enterprise** (15%): Large buyers, high quantities, invoice payments
+
+### Product Categories
+
+Electronics, Clothing, Home & Garden, Sports & Outdoors, Books & Media, Beauty & Health
+
+Each category has distinct price ranges and cost margins for realistic profitability analysis.
+
+## Size Presets
+
+| Size | Customers | Products | Sessions | Orders | Tickets | ~CSV | ~Time |
+|------|-----------|----------|----------|--------|---------|------|-------|
+| **xs** | 50 | 30 | 500 | 100 | 30 | 1 MB | <1s |
+| **s** | 500 | 100 | 10K | 2K | 500 | 15 MB | <1s |
+| **m** | 5,000 | 300 | 100K | 20K | 5K | 150 MB | ~7s |
+| **l** | 50,000 | 1,000 | 1M | 200K | 50K | 1.5 GB | ~3min |
+
+- **xs** - local development, quick iteration
+- **s** - unit/integration testing, CI
+- **m** - realistic demo, performance testing
+- **l** - stress testing, production-like volumes
+
+## CLI Options
+
+```
+python scripts/generate_sample_data.py [OPTIONS]
+
+  --size {xs,s,m,l}   Data size preset (default: s)
+  --output PATH        Output directory (default: data/sample)
+  --seed INT           Random seed for reproducibility (default: 42)
+  --list-sizes         Show presets and exit
+```
+
+## Convert to Parquet
+
+After generating CSVs, convert to Parquet for analytical use:
+
+```bash
+python -c "
+import pandas as pd
+from pathlib import Path
+
+csv_dir = Path('data/sample')
+parquet_dir = Path('data/sample/parquet')
+parquet_dir.mkdir(exist_ok=True)
+
+for f in sorted(csv_dir.glob('*.csv')):
+    df = pd.read_csv(f)
+    out = parquet_dir / f'{f.stem}.parquet'
+    df.to_parquet(out, index=False)
+    print(f'  {f.stem}: {len(df):,} rows -> {out}')
+"
+```
+
+## Load into DuckDB
+
+```bash
+python -c "
+import duckdb
+from pathlib import Path
+
+db = duckdb.connect('data/sample/analytics.duckdb')
+parquet_dir = Path('data/sample/parquet')
+
+for f in sorted(parquet_dir.glob('*.parquet')):
+    table = f.stem
+    db.execute(f'CREATE OR REPLACE TABLE {table} AS SELECT * FROM read_parquet(\"{f}\")')
+    count = db.execute(f'SELECT count(*) FROM {table}').fetchone()[0]
+    print(f'  {table}: {count:,} rows')
+
+db.close()
+print('Database: data/sample/analytics.duckdb')
+"
+```
+
+## Built-in Analytical Patterns
+
+The generator creates data with discoverable patterns for realistic analysis:
+
+- **Seasonality**: Q4 traffic and orders ~2x higher than Q1
+- **Growth trend**: 50% increase in activity over the time period
+- **Campaign performance**: click-through rate varies per campaign (0.5-8%), drawn independently of channel
+- **Customer lifetime**: Pareto distribution (20% of customers generate 80% of orders)
+- **Segment differences**: B2B enterprise has 3-5x higher order values
+- **Product mix**: Electronics = high revenue / lower margin, Books = low revenue / high margin
+- **Support correlation**: 60% of tickets linked to specific orders
+
+## Reproducibility
+
+Same `--seed` always produces identical output. The default seed is 42.
+
+```bash
+# These two commands produce the same files
+python scripts/generate_sample_data.py --size s --seed 42 --output run1
+python scripts/generate_sample_data.py --size s --seed 42 --output run2
+diff -r run1 run2  # no differences
+```
+
+## Server Deployment
+
+To use sample data on a deployed server (instead of connecting a data adapter):
+
+```bash
+# On the server
+cd /opt/data-analyst/repo
+
+# Generate CSVs
+/opt/data-analyst/.venv/bin/python scripts/generate_sample_data.py --size m --output /tmp/sample_csv
+
+# Convert to Parquet and deploy
+/opt/data-analyst/.venv/bin/python -c "
+import pandas as pd
+from pathlib import Path
+
+csv_dir = Path('/tmp/sample_csv')
+parquet_dir = Path('/data/src_data/parquet')
+parquet_dir.mkdir(parents=True, exist_ok=True)
+
+for f in sorted(csv_dir.glob('*.csv')):
+    df = pd.read_csv(f)
+    out = parquet_dir / f'{f.stem}.parquet'
+    df.to_parquet(out, index=False)
+    print(f'  {f.stem}: {len(df):,} rows -> {out}')
+"
+
+# Clean up CSVs
+rm -rf /tmp/sample_csv
+```
diff --git a/requirements.txt b/requirements.txt
index e68ddb5..81d63f2 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -52,3 +52,7 @@ sendgrid>=6.11.0
 # Corporate Memory knowledge extraction
 # anthropic - Claude API client for HAIKU-based knowledge extraction
 anthropic>=0.39.0
+
+# Sample data generation (development/testing)
+# faker - realistic synthetic data for demo datasets
+faker>=24.0.0
diff --git a/scripts/generate_sample_data.py b/scripts/generate_sample_data.py
new file mode 100644
index 0000000..8f5294b
--- /dev/null
+++ b/scripts/generate_sample_data.py
@@ -0,0 +1,940 @@
+#!/usr/bin/env python3
+"""
+Sample data generator for AI Data Analyst demo and testing.
+
+Generates realistic synthetic e-commerce + marketing data as CSV files.
+Tables: customers, products, campaigns, web_sessions, web_leads,
+        orders, order_items, payments, support_tickets
+
+Usage:
+    python scripts/generate_sample_data.py --size xs --output data/sample
+    python scripts/generate_sample_data.py --size m --seed 42
+    python scripts/generate_sample_data.py --list-sizes
+"""
+
+import argparse
+import csv
+import json
+import logging
+import random
+import sys
+import time
+from datetime import date, timedelta
+from pathlib import Path
+from typing import Any, Generator
+
+try:
+    from faker import Faker
+except ImportError:
+    print("ERROR: faker is required. Install with: pip install faker")
+    sys.exit(1)
+
+logger = logging.getLogger(__name__)
+
+# ── Size configurations ────────────────────────────────────────────────
+
+SIZE_CONFIGS = {
+    "xs": {
+        "label": "Extra Small (demo/dev)",
+        "customers": 50,
+        "products": 30,
+        "campaigns": 10,
+        "web_sessions": 500,
+        "web_leads": 50,
+        "orders": 100,
+        "support_tickets": 30,
+        "months": 3,
+        "estimated_csv_mb": 1,
+    },
+    "s": {
+        "label": "Small (testing)",
+        "customers": 500,
+        "products": 100,
+        "campaigns": 30,
+        "web_sessions": 10_000,
+        "web_leads": 1_000,
+        "orders": 2_000,
+        "support_tickets": 500,
+        "months": 12,
+        "estimated_csv_mb": 15,
+    },
+    "m": {
+        "label": "Medium (realistic)",
+        "customers": 5_000,
+        "products": 300,
+        "campaigns": 80,
+        "web_sessions": 100_000,
+        "web_leads": 10_000,
+        "orders": 20_000,
+        "support_tickets": 5_000,
+        "months": 24,
+        "estimated_csv_mb": 150,
+    },
+    "l": {
+        "label": "Large (stress test)",
+        "customers": 50_000,
+        "products": 1_000,
+        "campaigns": 200,
+        "web_sessions": 1_000_000,
+        "web_leads": 100_000,
+        "orders": 200_000,
+        "support_tickets": 50_000,
+        "months": 36,
+        "estimated_csv_mb": 1500,
+    },
+}
+
+# ── Domain data ────────────────────────────────────────────────────────
+
+# Monthly seasonality multipliers (index 0 = January)
+MONTHLY_SEASONALITY = [0.70, 0.75, 0.85, 0.90, 0.95, 1.00,
+                       0.90, 0.85, 1.00, 1.10, 1.30, 1.50]
+
+# Day-of-week multipliers (Monday=0 .. Sunday=6)
+DOW_MULTIPLIER = [1.0, 1.0, 1.0, 1.05, 1.15, 0.80, 0.60]
+
+# Hour-of-day weights (24 values, peak at 10-14)
+HOUR_WEIGHTS = [2, 1, 1, 1, 1, 2, 4, 8, 14, 18, 20, 19,
+                18, 17, 16, 15, 14, 12, 10, 8, 6, 5, 4, 3]
+
+CUSTOMER_SEGMENTS = [
+    ("b2c", 0.60),
+    ("b2b_small", 0.25),
+    ("b2b_enterprise", 0.15),
+]
+
+COUNTRIES = [
+    ("Czech Republic", "CZ", 0.25), ("Germany", "DE", 0.15),
+    ("United States", "US", 0.12),  ("United Kingdom", "GB", 0.10),
+    ("France", "FR", 0.08),         ("Austria", "AT", 0.05),
+    ("Poland", "PL", 0.05),         ("Netherlands", "NL", 0.05),
+    ("Slovakia", "SK", 0.05),       ("Spain", "ES", 0.04),
+    ("Italy", "IT", 0.03),          ("Sweden", "SE", 0.03),
+]
+
+EMAIL_DOMAINS = [
+    "gmail.com", "yahoo.com", "outlook.com", "hotmail.com",
+    "protonmail.com", "icloud.com", "mail.com",
+]
+
+PRODUCT_CATEGORIES = {
+    "Electronics": {
+        "items": [
+            "Wireless Headphones", "USB-C Charger 65W", "Smart Watch",
+            "Webcam 4K", "Bluetooth Speaker", "Noise-Cancelling Earbuds",
+            "Mechanical Keyboard", "27in Monitor QHD", "Laptop Stand",
+            "Power Bank 20000mAh", "Smart Home Hub", "LED Desk Lamp",
+            "External SSD 1TB", "Wireless Charging Pad", "Action Camera",
+        ],
+        "price_range": (19.99, 1299.99),
+        "cost_ratio": (0.40, 0.65),
+    },
+    "Clothing": {
+        "items": [
+            "Oxford Shirt Classic", "Slim Chino Pants", "Merino Sweater",
+            "Leather Belt Premium", "Running Sneakers", "Denim Jacket",
+            "Polo Shirt Casual", "Winter Down Jacket", "Cotton T-Shirt",
+            "Formal Dress Shoes", "Yoga Leggings", "Crossbody Bag",
+            "Wool Blend Coat", "Sport Shorts Quick-Dry", "Canvas Tote Bag",
+        ],
+        "price_range": (9.99, 299.99),
+        "cost_ratio": (0.30, 0.55),
+    },
+    "Home & Garden": {
+        "items": [
+            "Ceramic Mug Set", "Bamboo Cutting Board", "Steel Water Bottle",
+            "Indoor Plant Pot Set", "LED String Lights 10m", "Bath Towel Set",
+            "Memory Foam Pillow", "Scented Candle Set", "Kitchen Knife Set 5pc",
+            "Garden Tool Set", "Bedside Lamp", "Throw Blanket Fleece",
+            "Wall Clock Minimal", "Spice Rack Organizer", "Herb Garden Kit",
+        ],
+        "price_range": (7.99, 199.99),
+        "cost_ratio": (0.35, 0.55),
+    },
+    "Sports & Outdoors": {
+        "items": [
+            "Yoga Mat Premium", "Resistance Bands Set", "Insulated Bottle",
+            "Hiking Backpack 40L", "Speed Jump Rope", "Foam Roller 45cm",
+            "Camping Hammock", "Cycling Gloves", "Tennis Balls 4-Pack",
+            "Swim Goggles Anti-Fog", "Adjustable Dumbbells", "Running Armband",
+            "Compact Sleeping Bag", "Compression Socks", "Fishing Tackle Box",
+        ],
+        "price_range": (8.99, 249.99),
+        "cost_ratio": (0.35, 0.60),
+    },
+    "Books & Media": {
+        "items": [
+            "Data Science Handbook", "Leadership in Practice", "Creative Writing",
+            "Python Programming", "World History Atlas", "Cooking Masterclass",
+            "Mindfulness Journal", "Photography Basics", "Financial Planning",
+            "Sci-Fi Novel Collection", "Art Supplies Set", "Board Game Classic",
+            "Puzzle 1000 Pieces", "Drawing Pencil Set 24pc", "Travel Guide Europe",
+        ],
+        "price_range": (5.99, 79.99),
+        "cost_ratio": (0.25, 0.45),
+    },
+    "Beauty & Health": {
+        "items": [
+            "Moisturizer SPF30", "Organic Shampoo 500ml", "Electric Toothbrush",
+            "Vitamin D3 Supplements", "Essential Oil Set 6pk", "Hair Dryer Pro",
+            "Sunscreen SPF50", "Protein Powder Vanilla", "Face Mask Pack 10",
+            "Hand Cream Repair", "Body Lotion Hydrating", "Beard Grooming Set",
+            "Collagen Drink Mix", "Makeup Brush Set 12pc", "Bath Bomb Gift Set",
+        ],
+        "price_range": (4.99, 149.99),
+        "cost_ratio": (0.20, 0.45),
+    },
+}
+
+PRODUCT_VARIANTS = ["Pro", "Ultra", "Lite", "Plus", "Mini", "Max"]
+PRODUCT_COLORS = ["Black", "White", "Blue", "Red", "Green", "Grey"]
+
+CAMPAIGN_CHANNELS = [
+    ("email", 0.20),
+    ("paid_search", 0.22),
+    ("paid_social", 0.18),
+    ("organic_social", 0.12),
+    ("display", 0.12),
+    ("affiliate", 0.08),
+    ("retargeting", 0.08),
+]
+
+CAMPAIGN_TEMPLATES = [
+    "Spring Sale", "Summer Clearance", "Back to School", "Black Friday",
+    "Holiday Season", "New Year Push", "Flash Sale", "Product Launch",
+    "Loyalty Rewards", "Newsletter Blast", "Retargeting Wave",
+    "Brand Awareness", "Category Spotlight", "Win-Back", "Early Access",
+]
+
+LEAD_SOURCES = [
+    ("newsletter_signup", 0.30),
+    ("contact_form", 0.25),
+    ("demo_request", 0.15),
+    ("content_download", 0.20),
+    ("webinar_registration", 0.10),
+]
+
+DEVICES = [("desktop", 0.45), ("mobile", 0.45), ("tablet", 0.10)]
+BROWSERS = [("Chrome", 0.64), ("Safari", 0.19), ("Firefox", 0.08),
+            ("Edge", 0.07), ("Other", 0.02)]
+
+LANDING_PAGES = [
+    "/", "/products", "/products/electronics", "/products/clothing",
+    "/products/home-garden", "/sale", "/new-arrivals", "/about",
+    "/blog", "/blog/tips", "/blog/reviews", "/contact",
+]
+
+ORDER_STATUSES = [
+    ("delivered", 0.58), ("shipped", 0.15), ("confirmed", 0.10),
+    ("pending", 0.04), ("cancelled", 0.08), ("returned", 0.05),
+]
+
+ORDER_CHANNELS = [
+    ("web", 0.55), ("mobile_app", 0.35), ("phone", 0.05), ("api", 0.05),
+]
+
+PAYMENT_METHODS = [
+    ("credit_card", 0.38), ("debit_card", 0.20), ("paypal", 0.18),
+    ("bank_transfer", 0.12), ("apple_pay", 0.08), ("invoice", 0.04),
+]
+
+TICKET_CATEGORIES = [
+    ("question", 0.28), ("complaint", 0.18), ("return_request", 0.14),
+    ("shipping", 0.16), ("technical_issue", 0.12), ("refund", 0.12),
+]
+
+TICKET_PRIORITIES = [
+    ("low", 0.38), ("medium", 0.35), ("high", 0.20), ("critical", 0.07),
+]
+
+TICKET_SUBJECTS = {
+    "question": [
+        "Delivery time estimate", "Product compatibility", "Return policy",
+        "Bulk order pricing", "Warranty coverage", "Size guide help",
+    ],
+    "complaint": [
+        "Item arrived damaged", "Wrong product received", "Poor quality",
+        "Missing items in order", "Packaging insufficient", "Late delivery",
+    ],
+    "return_request": [
+        "Does not match description", "Changed my mind", "Duplicate order",
+        "Size does not fit", "Defective product", "Better price elsewhere",
+    ],
+    "shipping": [
+        "Package not delivered", "Tracking not updating", "Wrong address",
+        "Expedited shipping request", "International shipping", "Lost package",
+    ],
+    "technical_issue": [
+        "Cannot complete checkout", "Payment error", "Login problem",
+        "Page not loading", "Mobile app crash", "Coupon not working",
+    ],
+    "refund": [
+        "Cancelled order refund", "Partial refund request", "Overcharged",
+        "Refund not received", "Billing discrepancy", "Double charged",
+    ],
+}
+
+TICKET_CHANNELS = [
+    ("email", 0.40), ("chat", 0.30), ("phone", 0.15), ("web_form", 0.15),
+]
+
+
+# ── Generator ──────────────────────────────────────────────────────────
+
+class SampleDataGenerator:
+    """Generates realistic synthetic e-commerce data as CSV files."""
+
+    def __init__(self, size: str, seed: int, output_dir: Path) -> None:
+        """Set up the RNGs, the date window and empty lookup state for one run."""
+        self.cfg = SIZE_CONFIGS[size]
+        self.size_name = size
+        self.rng = random.Random(seed)
+        self.fake = Faker(["en_US", "de_DE", "cs_CZ", "fr_FR"])
+        Faker.seed(seed)  # seeded via the Faker class, not via self.fake
+        self.output_dir = output_dir
+        self.row_counts: dict[str, int] = {}
+        # Time range: `months` blocks of 30 days ending at a fixed anchor date
+        months = self.cfg["months"]
+        self.end_date = date(2026, 3, 1)
+        self.start_date = self.end_date - timedelta(days=months * 30)
+        self.total_days = (self.end_date - self.start_date).days
+
+        # Per-day sampling weight = growth ramp (1.0 -> ~1.5) * month * weekday factor
+        self._days: list[date] = []
+        self._day_weights: list[float] = []
+        for i in range(self.total_days):
+            d = self.start_date + timedelta(days=i)
+            growth = 1.0 + 0.5 * (i / max(self.total_days, 1))
+            season = MONTHLY_SEASONALITY[d.month - 1]
+            dow = DOW_MULTIPLIER[d.weekday()]
+            self._days.append(d)
+            self._day_weights.append(growth * season * dow)
+
+        # Reference data (filled in by the table generators as they run)
+        self._customer_ids: list[str] = []
+        self._customer_reg_dates: dict[str, date] = {}
+        self._customer_segments: dict[str, str] = {}
+        self._product_ids: list[str] = []
+        self._product_prices: dict[str, float] = {}
+        self._product_categories: dict[str, str] = {}
+        self._campaign_ids: list[str] = []
+        self._campaign_ranges: dict[str, tuple[date, date]] = {}
+        self._order_ids: list[str] = []
+        self._order_customers: dict[str, str] = {}
+        self._order_dates: dict[str, date] = {}
+        self._order_statuses: dict[str, str] = {}
+        self._order_totals: dict[str, float] = {}
+
+    # ── Helpers ─────────────────────────────────────────────────
+
+    def _weighted_choice(self, options: list[tuple[str, float]]) -> str:
+        """Pick from [(value, weight), ...] using instance RNG."""
+        values, weights = zip(*options)
+        return self.rng.choices(values, weights=weights, k=1)[0]
+
+    def _random_date(self) -> date:
+        """Random date weighted by growth + seasonality + day-of-week."""
+        return self.rng.choices(self._days, weights=self._day_weights, k=1)[0]
+
+    def _random_datetime(self, d: date | None = None) -> str:
+        """Random datetime string. If d is None, pick a weighted random date."""
+        if d is None:
+            d = self._random_date()
+        hour = self.rng.choices(range(24), weights=HOUR_WEIGHTS, k=1)[0]
+        minute = self.rng.randint(0, 59)
+        second = self.rng.randint(0, 59)
+        return f"{d} {hour:02d}:{minute:02d}:{second:02d}"
+
+    def _random_date_after(self, start: date, max_days: int = 30) -> date:
+        """Random date between start and start + max_days (capped at end_date)."""
+        end = min(start + timedelta(days=max_days), self.end_date)
+        delta = (end - start).days
+        if delta <= 0:
+            return start
+        return start + timedelta(days=self.rng.randint(0, delta))
+
+    def _write_table(self, name: str, fields: list[str],
+                     rows: list[dict] | Generator) -> int:
+        """Write CSV table from list or generator of dicts."""
+        path = self.output_dir / f"{name}.csv"
+        count = 0
+        with open(path, "w", newline="", encoding="utf-8") as f:
+            writer = csv.DictWriter(f, fieldnames=fields)
+            writer.writeheader()
+            for row in rows:
+                writer.writerow(row)
+                count += 1
+                if count % 250_000 == 0:
+                    logger.info(f"  ... {count:,} rows written")
+        self.row_counts[name] = count
+        return count
+
+    # ── Table generators ───────────────────────────────────────
+
+    def _generate_customers(self) -> None:
+        n = self.cfg["customers"]
+        logger.info(f"  Generating {n:,} customers...")
+
+        country_vals = [(c[0], c[2]) for c in COUNTRIES]
+        rows = []
+        for i in range(n):
+            cid = f"C-{i + 1:06d}"
+            segment = self._weighted_choice(CUSTOMER_SEGMENTS)
+            reg_date = self._random_date()
+            first = self.fake.first_name()
+            last = self.fake.last_name()
+            country = self._weighted_choice(country_vals)
+
+            if segment.startswith("b2b"):
+                company = self.fake.company()
+                domain = company.lower().split()[0].replace(",", "") + ".com"
+                email = f"{first.lower()}.{last.lower()}@{domain}"
+            else:
+                company = ""
+                domain = self.rng.choice(EMAIL_DOMAINS)
+                email = f"{first.lower()}.{last.lower()}@{domain}"
+
+            rows.append({
+                "customer_id": cid,
+                "email": email,
+                "first_name": first,
+                "last_name": last,
+                "company": company,
+                "country": country,
+                "city": self.fake.city(),
+                "segment": segment,
+                "registration_date": str(reg_date),
+                "is_active": self.rng.choices([1, 0], weights=[0.85, 0.15])[0],
+            })
+            self._customer_ids.append(cid)
+            self._customer_reg_dates[cid] = reg_date
+            self._customer_segments[cid] = segment
+
+        self._write_table("customers", list(rows[0].keys()), rows)
+
+    def _generate_products(self) -> None:
+        n = self.cfg["products"]
+        logger.info(f"  Generating {n:,} products...")
+
+        # Build product pool: base items + variants for larger sizes
+        pool: list[tuple[str, str, str]] = []  # (name, category, subcategory)
+        categories = list(PRODUCT_CATEGORIES.keys())
+        for cat in categories:
+            for item in PRODUCT_CATEGORIES[cat]["items"]:
+                pool.append((item, cat, cat))
+
+        # Add variants if we need more than base pool
+        while len(pool) < n:
+            cat = self.rng.choice(categories)
+            item = self.rng.choice(PRODUCT_CATEGORIES[cat]["items"])
+            variant = self.rng.choice(PRODUCT_VARIANTS)
+            color = self.rng.choice(PRODUCT_COLORS)
+            name = f"{item} {variant} - {color}"
+            pool.append((name, cat, cat))
+
+        self.rng.shuffle(pool)
+        pool = pool[:n]
+
+        rows = []
+        for i, (name, category, _subcat) in enumerate(pool):
+            pid = f"P-{i + 1:05d}"
+            cat_cfg = PRODUCT_CATEGORIES[category]
+            price = round(self.rng.uniform(*cat_cfg["price_range"]), 2)
+            cost_ratio = self.rng.uniform(*cat_cfg["cost_ratio"])
+            cost = round(price * cost_ratio, 2)
+
+            rows.append({
+                "product_id": pid,
+                "sku": f"SKU-{self.rng.randint(10000, 99999)}",
+                "name": name,
+                "category": category,
+                "price": price,
+                "cost": cost,
+                "weight_kg": round(self.rng.uniform(0.1, 15.0), 2),
+                "is_active": self.rng.choices([1, 0], weights=[0.90, 0.10])[0],
+                "created_at": str(self._random_date()),
+            })
+            self._product_ids.append(pid)
+            self._product_prices[pid] = price
+            self._product_categories[pid] = category
+
+        self._write_table("products", list(rows[0].keys()), rows)
+
+    def _generate_campaigns(self) -> None:
+        """Generate campaign rows, write campaigns.csv and cache id -> date range."""
+        n = self.cfg["campaigns"]
+        logger.info(f"  Generating {n:,} campaigns...")
+        rows = []
+        for i in range(n):
+            cid = f"CMP-{i + 1:04d}"
+            channel = self._weighted_choice(CAMPAIGN_CHANNELS)
+            start = self._random_date()
+            duration = self.rng.randint(7, 60)
+            end = min(start + timedelta(days=duration), self.end_date)
+            is_past = end < self.end_date - timedelta(days=7)
+            # Past campaigns spend 60-100% of budget (capped below); others 20-70%.
+            budget = round(self.rng.uniform(500, 25000), 2)
+            spend_ratio = self.rng.uniform(0.6, 1.1) if is_past else self.rng.uniform(0.2, 0.7)
+            spend = round(budget * min(spend_ratio, 1.0), 2)
+            impressions = int(spend * self.rng.uniform(80, 500))
+            ctr = self.rng.uniform(0.005, 0.08)  # same range for every channel
+            clicks = int(impressions * ctr)
+
+            template = self.rng.choice(CAMPAIGN_TEMPLATES)
+            name = f"{template} - {channel.replace('_', ' ').title()} {start.year}"
+
+            status = "completed" if is_past else self.rng.choice(["active", "paused"])
+
+            rows.append({
+                "campaign_id": cid,
+                "name": name,
+                "channel": channel,
+                "status": status,
+                "budget": budget,
+                "spend": spend,
+                "impressions": impressions,
+                "clicks": clicks,
+                "start_date": str(start),
+                "end_date": str(end),
+                "target_segment": self._weighted_choice(CUSTOMER_SEGMENTS),
+            })
+            self._campaign_ids.append(cid)
+            self._campaign_ranges[cid] = (start, end)
+
+        self._write_table("campaigns", list(rows[0].keys()), rows)
+
+    def _generate_web_sessions(self) -> None:
+        n = self.cfg["web_sessions"]
+        logger.info(f"  Generating {n:,} web sessions...")
+
+        fields = [
+            "session_id", "visitor_id", "customer_id", "campaign_id",
+            "started_at", "duration_seconds", "pages_viewed",
+            "device_type", "browser", "country", "landing_page", "is_bounce",
+        ]
+        country_vals = [(c[0], c[2]) for c in COUNTRIES]
+
+        def gen_rows() -> Generator[dict[str, Any], None, None]:
+            for i in range(n):
+                sid = f"S-{i + 1:08d}"
+                d = self._random_date()
+
+                # 40% sessions from logged-in customers
+                customer_id = ""
+                if self.rng.random() < 0.40 and self._customer_ids:
+                    customer_id = self.rng.choice(self._customer_ids)
+
+                # 25% sessions attributed to a campaign
+                campaign_id = ""
+                if self.rng.random() < 0.25 and self._campaign_ids:
+                    # Pick a campaign that was active on this date
+                    candidates = [
+                        c for c in self._campaign_ids
+                        if self._campaign_ranges[c][0] <= d <= self._campaign_ranges[c][1]
+                    ]
+                    if candidates:
+                        campaign_id = self.rng.choice(candidates)
+
+                is_bounce = self.rng.random() < 0.35
+                if is_bounce:
+                    duration = self.rng.randint(5, 30)
+                    pages = 1
+                else:
+                    duration = self.rng.randint(30, 900)
+                    pages = self.rng.randint(2, 15)
+
+                yield {
+                    "session_id": sid,
+                    "visitor_id": f"V-{self.rng.randint(1, n // 3):08d}",
+                    "customer_id": customer_id,
+                    "campaign_id": campaign_id,
+                    "started_at": self._random_datetime(d),
+                    "duration_seconds": duration,
+                    "pages_viewed": pages,
+                    "device_type": self._weighted_choice(DEVICES),
+                    "browser": self._weighted_choice(BROWSERS),
+                    "country": self._weighted_choice(country_vals),
+                    "landing_page": self.rng.choice(LANDING_PAGES),
+                    "is_bounce": int(is_bounce),
+                }
+
+        self._write_table("web_sessions", fields, gen_rows())
+
+    def _generate_web_leads(self) -> None:
+        """Write the web_leads table with optional customer and campaign links."""
+        n = self.cfg["web_leads"]
+        logger.info(f"  Generating {n:,} web leads...")
+        fields = [
+            "lead_id", "customer_id", "email", "source", "campaign_id",
+            "created_at", "status", "converted_at",
+        ]
+        lead_statuses = [
+            ("new", 0.35), ("contacted", 0.20), ("qualified", 0.15),
+            ("converted", 0.18), ("lost", 0.12),
+        ]
+
+        rows = []
+        for i in range(n):
+            lid = f"L-{i + 1:06d}"
+            d = self._random_date()
+            status = self._weighted_choice(lead_statuses)
+
+            # 55% are linked to an existing customer; the email is always newly faked
+            customer_id = ""
+            email = self.fake.email()
+            if self.rng.random() < 0.55 and self._customer_ids:
+                customer_id = self.rng.choice(self._customer_ids)
+            # 40% get a campaign, picked without checking its date range against d
+            campaign_id = ""
+            if self.rng.random() < 0.40 and self._campaign_ids:
+                campaign_id = self.rng.choice(self._campaign_ids)
+            # Converted leads get a timestamp on a date from _random_date_after(d, max_days=14)
+            converted_at = ""
+            if status == "converted":
+                converted_at = self._random_datetime(
+                    self._random_date_after(d, max_days=14)
+                )
+
+            rows.append({
+                "lead_id": lid,
+                "customer_id": customer_id,
+                "email": email,
+                "source": self._weighted_choice(LEAD_SOURCES),
+                "campaign_id": campaign_id,
+                "created_at": self._random_datetime(d),
+                "status": status,
+                "converted_at": converted_at,
+            })
+
+        self._write_table("web_leads", fields, rows)
+
+    def _generate_orders_and_items(self) -> None:
+        n_orders = self.cfg["orders"]
+        logger.info(f"  Generating {n_orders:,} orders + order items...")
+
+        # Customer activity weights (Pareto-like distribution)
+        activity = [self.rng.paretovariate(1.2) for _ in self._customer_ids]
+
+        order_fields = [
+            "order_id", "customer_id", "created_at", "status",
+            "items_total", "discount_amount", "shipping_amount",
+            "total_amount", "channel",
+        ]
+        item_fields = [
+            "order_item_id", "order_id", "product_id", "quantity",
+            "unit_price", "discount_percent", "line_total",
+        ]
+
+        order_rows = []
+        item_rows = []
+        item_seq = 0
+
+        for i in range(n_orders):
+            oid = f"ORD-{i + 1:07d}"
+            cust_id = self.rng.choices(self._customer_ids, weights=activity, k=1)[0]
+            reg_date = self._customer_reg_dates[cust_id]
+            segment = self._customer_segments[cust_id]
+
+            # Order date: after customer registration
+            order_date = self._random_date_after(reg_date,
+                                                 max_days=(self.end_date - reg_date).days)
+            status = self._weighted_choice(ORDER_STATUSES)
+
+            # B2B orders tend to have more items
+            max_items = 8 if segment.startswith("b2b") else 5
+            item_weights = list(range(max_items, 0, -1))  # favor fewer items
+            n_items = self.rng.choices(range(1, max_items + 1), weights=item_weights, k=1)[0]
+
+            items_total = 0.0
+            for _j in range(n_items):
+                item_seq += 1
+                pid = self.rng.choice(self._product_ids)
+                qty = self.rng.choices([1, 2, 3, 4, 5],
+                                       weights=[60, 20, 10, 5, 5], k=1)[0]
+                if segment == "b2b_enterprise":
+                    qty *= self.rng.randint(1, 5)
+                unit_price = self._product_prices[pid]
+                disc_pct = self.rng.choices(
+                    [0, 5, 10, 15, 20],
+                    weights=[50, 20, 15, 10, 5], k=1
+                )[0]
+                line_total = round(unit_price * qty * (1 - disc_pct / 100), 2)
+                items_total += line_total
+
+                item_rows.append({
+                    "order_item_id": f"OI-{item_seq:08d}",
+                    "order_id": oid,
+                    "product_id": pid,
+                    "quantity": qty,
+                    "unit_price": unit_price,
+                    "discount_percent": disc_pct,
+                    "line_total": line_total,
+                })
+
+            discount_amount = round(items_total * self.rng.uniform(0, 0.05), 2)
+            shipping = round(self.rng.uniform(0, 15.99), 2) if items_total < 100 else 0.0
+            total = round(items_total - discount_amount + shipping, 2)
+
+            order_rows.append({
+                "order_id": oid,
+                "customer_id": cust_id,
+                "created_at": self._random_datetime(order_date),
+                "status": status,
+                "items_total": round(items_total, 2),
+                "discount_amount": discount_amount,
+                "shipping_amount": shipping,
+                "total_amount": total,
+                "channel": self._weighted_choice(ORDER_CHANNELS),
+            })
+            self._order_ids.append(oid)
+            self._order_customers[oid] = cust_id
+            self._order_dates[oid] = order_date
+            self._order_statuses[oid] = status
+            self._order_totals[oid] = total
+
+        self._write_table("orders", order_fields, order_rows)
+        self._write_table("order_items", item_fields, item_rows)
+
+    def _generate_payments(self) -> None:
+        """Write payments: one final record per order, plus occasional failed attempts."""
+        logger.info(f"  Generating payments for {len(self._order_ids):,} orders...")
+        fields = [
+            "payment_id", "order_id", "customer_id", "amount", "currency",
+            "method", "status", "created_at", "completed_at",
+        ]
+
+        rows = []
+        seq = 0
+        for oid in self._order_ids:
+            cust_id = self._order_customers[oid]
+            segment = self._customer_segments[cust_id]
+            order_date = self._order_dates[oid]
+            order_status = self._order_statuses[oid]
+            amount = self._order_totals[oid]
+
+            # B2B more likely to use invoice/bank_transfer
+            if segment.startswith("b2b") and self.rng.random() < 0.40:
+                method = self.rng.choice(["bank_transfer", "invoice"])
+            else:
+                method = self._weighted_choice(PAYMENT_METHODS)
+
+            # 5% chance of a failed payment attempt first (same method and amount)
+            if self.rng.random() < 0.05:
+                seq += 1
+                rows.append({
+                    "payment_id": f"PAY-{seq:07d}",
+                    "order_id": oid,
+                    "customer_id": cust_id,
+                    "amount": amount,
+                    "currency": "EUR",
+                    "method": method,
+                    "status": "failed",
+                    "created_at": self._random_datetime(order_date),
+                    "completed_at": "",
+                })
+            # Final payment record; its status mirrors the order status
+            seq += 1
+            if order_status == "cancelled":
+                pay_status = "cancelled"
+                completed = ""
+            elif order_status == "returned":
+                pay_status = "refunded"
+                completed = self._random_datetime(
+                    self._random_date_after(order_date, max_days=14)
+                )
+            else:
+                pay_status = "completed"
+                completed = self._random_datetime(
+                    self._random_date_after(order_date, max_days=3)
+                )
+
+            rows.append({
+                "payment_id": f"PAY-{seq:07d}",
+                "order_id": oid,
+                "customer_id": cust_id,
+                "amount": amount,
+                "currency": "EUR",
+                "method": method,
+                "status": pay_status,
+                "created_at": self._random_datetime(order_date),  # drawn independently of any failed attempt
+                "completed_at": completed,
+            })
+
+        self._write_table("payments", fields, rows)
+
+    def _generate_support_tickets(self) -> None:
+        n = self.cfg["support_tickets"]
+        logger.info(f"  Generating {n:,} support tickets...")
+
+        fields = [
+            "ticket_id", "customer_id", "order_id", "category", "priority",
+            "status", "channel", "subject", "created_at", "first_response_at",
+            "resolved_at", "satisfaction_score",
+        ]
+
+        rows = []
+        for i in range(n):
+            tid = f"TKT-{i + 1:06d}"
+            cust_id = self.rng.choice(self._customer_ids)
+            category = self._weighted_choice(TICKET_CATEGORIES)
+            priority = self._weighted_choice(TICKET_PRIORITIES)
+            subject = self.rng.choice(TICKET_SUBJECTS[category])
+            d = self._random_date()
+
+            # 60% linked to an order
+            order_id = ""
+            if self.rng.random() < 0.60 and self._order_ids:
+                # Pick an order from this customer if possible
+                cust_orders = [
+                    o for o in self._order_ids
+                    if self._order_customers[o] == cust_id
+                ]
+                if cust_orders:
+                    order_id = self.rng.choice(cust_orders)
+                else:
+                    order_id = self.rng.choice(self._order_ids)
+
+            # Status progression
+            is_resolved = self.rng.random() < 0.75
+            if is_resolved:
+                status = self.rng.choice(["resolved", "closed"])
+            else:
+                status = self.rng.choice(["open", "in_progress", "waiting_customer"])
+
+            # Response and resolution times based on priority
+            response_hours = {
+                "critical": (0.5, 4), "high": (1, 12),
+                "medium": (4, 48), "low": (8, 96),
+            }
+            rh = response_hours[priority]
+            first_response = ""
+            resolved_at = ""
+            satisfaction = ""
+
+            if status not in ("open",):
+                resp_delta = timedelta(hours=self.rng.uniform(*rh))
+                first_response = self._random_datetime(
+                    min(d + timedelta(days=int(resp_delta.total_seconds() // 86400)),
+                        self.end_date)
+                )
+
+            if is_resolved:
+                resolve_days = self.rng.randint(1, 14)
+                resolved_at = self._random_datetime(
+                    self._random_date_after(d, max_days=resolve_days)
+                )
+                # Satisfaction: skewed toward 4-5 for resolved
+                satisfaction = self.rng.choices(
+                    [1, 2, 3, 4, 5],
+                    weights=[5, 8, 15, 35, 37], k=1
+                )[0]
+
+            rows.append({
+                "ticket_id": tid,
+                "customer_id": cust_id,
+                "order_id": order_id,
+                "category": category,
+                "priority": priority,
+                "status": status,
+                "channel": self._weighted_choice(TICKET_CHANNELS),
+                "subject": subject,
+                "created_at": self._random_datetime(d),
+                "first_response_at": first_response,
+                "resolved_at": resolved_at,
+                "satisfaction_score": satisfaction,
+            })
+
+        self._write_table("support_tickets", fields, rows)
+
+    # ── Orchestration ──────────────────────────────────────────
+
+    def run(self) -> dict[str, Any]:
+        """Generate all tables and return manifest data."""
+        self.output_dir.mkdir(parents=True, exist_ok=True)
+        t0 = time.time()
+
+        logger.info(f"Generating sample data (size: {self.size_name})")
+        logger.info(f"  Period: {self.start_date} to {self.end_date} "
+                     f"({self.cfg['months']} months)")
+        logger.info(f"  Output: {self.output_dir}/")
+
+        self._generate_customers()
+        self._generate_products()
+        self._generate_campaigns()
+        self._generate_web_sessions()
+        self._generate_web_leads()
+        self._generate_orders_and_items()
+        self._generate_payments()
+        self._generate_support_tickets()
+
+        elapsed = time.time() - t0
+        total_rows = sum(self.row_counts.values())
+
+        manifest = {
+            "generator": "generate_sample_data.py",
+            "size": self.size_name,
+            "seed": self.rng.getstate()[1][0],
+            "date_range": {
+                "start": str(self.start_date),
+                "end": str(self.end_date),
+            },
+            "tables": self.row_counts,
+            "total_rows": total_rows,
+            "elapsed_seconds": round(elapsed, 1),
+        }
+        manifest_path = self.output_dir / "_manifest.json"
+        with open(manifest_path, "w", encoding="utf-8") as f:
+            json.dump(manifest, f, indent=2)
+
+        logger.info("")
+        logger.info(f"Done! {len(self.row_counts)} tables, "
+                     f"{total_rows:,} total rows in {elapsed:.1f}s")
+        logger.info(f"Manifest: {manifest_path}")
+        return manifest
+
+
+# ── CLI ────────────────────────────────────────────────────────────────
+
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        description="Generate synthetic e-commerce sample data as CSV files."
+    )
+    parser.add_argument(
+        "--size", choices=SIZE_CONFIGS.keys(), default="s",
+        help="Data size preset (default: s)",
+    )
+    parser.add_argument(
+        "--output", type=Path, default=Path("data/sample"),
+        help="Output directory for CSV files (default: data/sample)",
+    )
+    parser.add_argument(
+        "--seed", type=int, default=42,
+        help="Random seed for reproducibility (default: 42)",
+    )
+    parser.add_argument(
+        "--list-sizes", action="store_true",
+        help="Show available size presets and exit",
+    )
+    args = parser.parse_args()
+
+    logging.basicConfig(level=logging.INFO, format="%(message)s")
+
+    if args.list_sizes:
+        print("\nAvailable size presets:\n")
+        print(f"  {'Size':<6} {'Label':<24} {'Customers':>10} {'Products':>10} "
+              f"{'Sessions':>10} {'Orders':>10} {'~CSV MB':>8}")
+        print(f"  {'─' * 6} {'─' * 24} {'─' * 10} {'─' * 10} "
+              f"{'─' * 10} {'─' * 10} {'─' * 8}")
+        for key, cfg in SIZE_CONFIGS.items():
+            print(f"  {key:<6} {cfg['label']:<24} {cfg['customers']:>10,} "
+                  f"{cfg['products']:>10,} {cfg['web_sessions']:>10,} "
+                  f"{cfg['orders']:>10,} {cfg['estimated_csv_mb']:>7,}")
+        print()
+        return
+
+    gen = SampleDataGenerator(size=args.size, seed=args.seed, output_dir=args.output)
+    gen.run()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/test_generate_sample_data.py b/tests/test_generate_sample_data.py
new file mode 100644
index 0000000..45597de
--- /dev/null
+++ b/tests/test_generate_sample_data.py
@@ -0,0 +1,173 @@
+"""Tests for the sample data generator."""
+
+import csv
+import json
+import pytest
+from pathlib import Path
+
+from scripts.generate_sample_data import SampleDataGenerator, SIZE_CONFIGS
+
+
+@pytest.fixture
+def output_dir(tmp_path: Path) -> Path:
+    """Temporary output directory for generated CSV files."""
+    return tmp_path / "sample_data"
+
+
+class TestSizeConfigs:
+    """Verify size configuration integrity."""
+
+    def test_all_sizes_have_required_keys(self):
+        required = {
+            "customers", "products", "campaigns", "web_sessions",
+            "web_leads", "orders", "support_tickets", "months",
+        }
+        for size, cfg in SIZE_CONFIGS.items():
+            missing = required - set(cfg.keys())
+            assert not missing, f"Size '{size}' missing keys: {missing}"
+
+    def test_sizes_scale_monotonically(self):
+        """Each size should be strictly larger than the previous one."""
+        sizes = list(SIZE_CONFIGS.keys())
+        for key in ["customers", "products", "orders", "web_sessions"]:
+            values = [SIZE_CONFIGS[s][key] for s in sizes]
+            assert values == sorted(values), (
+                f"{key} does not scale monotonically across sizes"
+            )
+
+
+class TestXSGeneration:
+    """Full generation test with xs size (fast)."""
+
+    @pytest.fixture(autouse=True)
+    def generate(self, output_dir: Path):
+        self.output_dir = output_dir
+        gen = SampleDataGenerator(size="xs", seed=42, output_dir=output_dir)
+        self.manifest = gen.run()
+
+    def test_all_csv_files_created(self):
+        expected = {
+            "customers", "products", "campaigns", "web_sessions",
+            "web_leads", "orders", "order_items", "payments",
+            "support_tickets",
+        }
+        csv_files = {p.stem for p in self.output_dir.glob("*.csv")}
+        assert expected == csv_files
+
+    def test_manifest_created(self):
+        manifest_path = self.output_dir / "_manifest.json"
+        assert manifest_path.exists()
+        data = json.loads(manifest_path.read_text())
+        assert data["size"] == "xs"
+        assert "tables" in data
+        assert data["total_rows"] > 0
+
+    def test_row_counts_match_config(self):
+        """Row counts for directly specified tables should match config."""
+        cfg = SIZE_CONFIGS["xs"]
+        for table in ["customers", "products", "campaigns", "web_sessions",
+                       "web_leads", "orders", "support_tickets"]:
+            assert self.manifest["tables"][table] == cfg[table], (
+                f"{table}: expected {cfg[table]}, got {self.manifest['tables'][table]}"
+            )
+
+    def test_order_items_derived(self):
+        """Order items should be > orders (most orders have multiple items)."""
+        assert self.manifest["tables"]["order_items"] > self.manifest["tables"]["orders"]
+
+    def test_payments_at_least_one_per_order(self):
+        """Payments should be >= orders (some have failed retries)."""
+        assert self.manifest["tables"]["payments"] >= self.manifest["tables"]["orders"]
+
+    def test_csv_headers_not_empty(self):
+        """Every CSV should have a header and at least one data row."""
+        for csv_path in self.output_dir.glob("*.csv"):
+            with open(csv_path) as f:
+                reader = csv.reader(f)
+                header = next(reader)
+                assert len(header) > 0, f"{csv_path.name}: empty header"
+                first_row = next(reader, None)
+                assert first_row is not None, f"{csv_path.name}: no data rows"
+
+
+class TestReferentialIntegrity:
+    """Verify foreign key relationships across tables."""
+
+    @pytest.fixture(autouse=True)
+    def generate(self, output_dir: Path):
+        self.output_dir = output_dir
+        gen = SampleDataGenerator(size="xs", seed=123, output_dir=output_dir)
+        gen.run()
+        self.tables = {}
+        for csv_path in output_dir.glob("*.csv"):
+            with open(csv_path) as f:
+                self.tables[csv_path.stem] = list(csv.DictReader(f))
+
+    def _get_ids(self, table: str, column: str) -> set[str]:
+        return {row[column] for row in self.tables[table]}
+
+    def _get_fk_values(self, table: str, column: str) -> set[str]:
+        return {row[column] for row in self.tables[table] if row[column]}
+
+    def test_orders_reference_valid_customers(self):
+        customer_ids = self._get_ids("customers", "customer_id")
+        order_customer_ids = self._get_fk_values("orders", "customer_id")
+        orphans = order_customer_ids - customer_ids
+        assert not orphans, f"Orders reference non-existent customers: {orphans}"
+
+    def test_order_items_reference_valid_orders(self):
+        order_ids = self._get_ids("orders", "order_id")
+        item_order_ids = self._get_fk_values("order_items", "order_id")
+        orphans = item_order_ids - order_ids
+        assert not orphans, f"Order items reference non-existent orders: {orphans}"
+
+    def test_order_items_reference_valid_products(self):
+        product_ids = self._get_ids("products", "product_id")
+        item_product_ids = self._get_fk_values("order_items", "product_id")
+        orphans = item_product_ids - product_ids
+        assert not orphans, f"Order items reference non-existent products: {orphans}"
+
+    def test_payments_reference_valid_orders(self):
+        order_ids = self._get_ids("orders", "order_id")
+        payment_order_ids = self._get_fk_values("payments", "order_id")
+        orphans = payment_order_ids - order_ids
+        assert not orphans, f"Payments reference non-existent orders: {orphans}"
+
+    def test_support_tickets_reference_valid_customers(self):
+        customer_ids = self._get_ids("customers", "customer_id")
+        ticket_customer_ids = self._get_fk_values("support_tickets", "customer_id")
+        orphans = ticket_customer_ids - customer_ids
+        assert not orphans, f"Tickets reference non-existent customers: {orphans}"
+
+
+class TestDeterminism:
+    """Verify reproducibility with same seed."""
+
+    def test_same_seed_produces_same_output(self, tmp_path: Path):
+        dir1 = tmp_path / "run1"
+        dir2 = tmp_path / "run2"
+
+        gen1 = SampleDataGenerator(size="xs", seed=99, output_dir=dir1)
+        gen1.run()
+
+        gen2 = SampleDataGenerator(size="xs", seed=99, output_dir=dir2)
+        gen2.run()
+
+        for csv_path in dir1.glob("*.csv"):
+            content1 = csv_path.read_text()
+            content2 = (dir2 / csv_path.name).read_text()
+            assert content1 == content2, f"{csv_path.name} differs between runs"
+
+    def test_different_seed_produces_different_output(self, tmp_path: Path):
+        dir1 = tmp_path / "seed1"
+        dir2 = tmp_path / "seed2"
+
+        gen1 = SampleDataGenerator(size="xs", seed=1, output_dir=dir1)
+        gen1.run()
+
+        gen2 = SampleDataGenerator(size="xs", seed=2, output_dir=dir2)
+        gen2.run()
+
+        content1 = (dir1 / "customers.csv").read_text()
+        content2 = (dir2 / "customers.csv").read_text()
+        assert content1 != content2