feat: implement extract.duckdb contract — orchestrator + extractors

Phase 0: extend table_registry schema (v1→v2 migration), add
source_type/bucket/source_table/query_mode columns.

Phase 1: SyncOrchestrator ATTACHes extract.duckdb files into master
analytics.duckdb. Keboola extractor uses DuckDB extension with
legacy client fallback. BigQuery extractor is remote-only via
DuckDB BQ extension (no data download).

62 tests passing.
This commit is contained in:
ZdenekSrotyr 2026-03-30 20:12:56 +02:00
parent 0b9720d090
commit 18e5f0b6e8
12 changed files with 1281 additions and 23 deletions

View file

@ -21,6 +21,12 @@ class RegisterTableRequest(BaseModel):
sync_strategy: str = "full_refresh" sync_strategy: str = "full_refresh"
primary_key: Optional[str] = None primary_key: Optional[str] = None
description: Optional[str] = None description: Optional[str] = None
source_type: Optional[str] = None
bucket: Optional[str] = None
source_table: Optional[str] = None
query_mode: str = "local"
sync_schedule: Optional[str] = None
profile_after_sync: bool = True
class UpdateTableRequest(BaseModel): class UpdateTableRequest(BaseModel):
@ -28,6 +34,12 @@ class UpdateTableRequest(BaseModel):
sync_strategy: Optional[str] = None sync_strategy: Optional[str] = None
primary_key: Optional[str] = None primary_key: Optional[str] = None
description: Optional[str] = None description: Optional[str] = None
source_type: Optional[str] = None
bucket: Optional[str] = None
source_table: Optional[str] = None
query_mode: Optional[str] = None
sync_schedule: Optional[str] = None
profile_after_sync: Optional[bool] = None
@router.get("/discover-tables") @router.get("/discover-tables")
@ -78,6 +90,12 @@ async def register_table(
primary_key=request.primary_key, primary_key=request.primary_key,
description=request.description, description=request.description,
registered_by=user.get("email"), registered_by=user.get("email"),
source_type=request.source_type,
bucket=request.bucket,
source_table=request.source_table,
query_mode=request.query_mode,
sync_schedule=request.sync_schedule,
profile_after_sync=request.profile_after_sync,
) )
# Regenerate data_description.md if table_registry module supports it # Regenerate data_description.md if table_registry module supports it

View file

@ -0,0 +1,116 @@
"""BigQuery extractor — produces extract.duckdb with remote views via DuckDB BigQuery extension.
No data is downloaded. All queries go directly to BigQuery via DuckDB extension ATTACH.
"""
import logging
import os
from datetime import datetime, timezone
from pathlib import Path
from typing import List, Dict, Any
import duckdb
logger = logging.getLogger(__name__)
def _create_meta_table(conn: duckdb.DuckDBPyConnection) -> None:
    """Rebuild the ``_meta`` table that the extract.duckdb contract requires.

    Any existing ``_meta`` table is dropped first, so the table always starts
    out empty.

    Args:
        conn: Open connection to the extract database being written.
    """
    meta_ddl = """CREATE TABLE _meta (
table_name VARCHAR NOT NULL,
description VARCHAR,
rows BIGINT,
size_bytes BIGINT,
extracted_at TIMESTAMP,
query_mode VARCHAR DEFAULT 'remote'
)"""
    for statement in ("DROP TABLE IF EXISTS _meta", meta_ddl):
        conn.execute(statement)
def init_extract(
    output_dir: str,
    project_id: str,
    table_configs: List[Dict[str, Any]],
) -> Dict[str, Any]:
    """Create extract.duckdb with remote views into BigQuery.

    No data is downloaded: each configured table becomes a view that selects
    from the attached BigQuery catalog, and one ``_meta`` row with
    ``query_mode='remote'`` is written per view.

    Args:
        output_dir: Directory in which ``extract.duckdb`` is written (created
            if missing).
        project_id: GCP project ID to attach.
        table_configs: Table config dicts from table_registry. ``name`` is
            required; ``bucket`` (the BigQuery dataset), ``source_table`` and
            ``description`` are optional and may be ``None``.

    Returns:
        Dict with stats: ``{"tables_registered": int, "errors": list}``, where
        each error is ``{"table": name, "error": message}``.

    Raises:
        Exception: Whatever DuckDB raises when the extension cannot be
            installed/loaded or the project cannot be attached. Per-table
            failures are recorded in ``errors`` instead of being raised.
    """

    def _ident(name: Any) -> str:
        # Double embedded quotes so any name stays a single quoted identifier.
        return '"' + str(name).replace('"', '""') + '"'

    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)
    db_path = output_path / "extract.duckdb"

    conn = duckdb.connect(str(db_path))
    stats: Dict[str, Any] = {"tables_registered": 0, "errors": []}
    now = datetime.now(timezone.utc)
    try:
        # Install and load BigQuery extension
        conn.execute("INSTALL bigquery FROM community; LOAD bigquery;")
        attach_target = f"project={project_id}".replace("'", "''")
        conn.execute(f"ATTACH '{attach_target}' AS bq (TYPE bigquery, READ_ONLY)")
        logger.info("Attached BigQuery project: %s", project_id)

        _create_meta_table(conn)

        for tc in table_configs:
            table_name = tc["name"]
            # Rows read from table_registry contain the key with a None value
            # for NULL columns, so dict.get's default would never be used.
            dataset = tc.get("bucket") or ""  # BigQuery dataset
            source_table = tc.get("source_table") or table_name
            try:
                conn.execute(
                    f"CREATE OR REPLACE VIEW {_ident(table_name)} AS "
                    f"SELECT * FROM bq.{_ident(dataset)}.{_ident(source_table)}"
                )
                conn.execute(
                    "INSERT INTO _meta VALUES (?, ?, 0, 0, ?, 'remote')",
                    [table_name, tc.get("description") or "", now],
                )
                stats["tables_registered"] += 1
                logger.info(
                    "Registered remote view: %s -> bq.%s.%s",
                    table_name, dataset, source_table,
                )
            except Exception as e:
                # One bad table must not prevent the others from registering.
                logger.error("Failed to register %s: %s", table_name, e)
                stats["errors"].append({"table": table_name, "error": str(e)})

        conn.execute("DETACH bq")
    finally:
        conn.close()
    return stats
if __name__ == "__main__":
    # Standalone: reads config from instance.yaml + table_registry, creates extract.
    from config.loader import load_instance_config
    from src.db import get_system_db
    from src.repositories.table_registry import TableRegistryRepository

    # Without a configured handler the INFO-level result message below is
    # dropped and the script appears to do nothing.
    logging.basicConfig(level=logging.INFO)

    config = load_instance_config()
    bq_config = config.get("bigquery") or {}
    project_id = bq_config.get("project_id") or ""

    sys_conn = get_system_db()
    try:
        repo = TableRegistryRepository(sys_conn)
        tables = repo.list_by_source("bigquery")
    finally:
        sys_conn.close()

    if not tables:
        logger.warning("No BigQuery tables registered in table_registry")
    elif not project_id:
        # Fail with a clear message instead of an obscure ATTACH error.
        raise SystemExit("bigquery.project_id is not set in the instance config")
    else:
        data_dir = Path(os.environ.get("DATA_DIR", "./data"))
        result = init_extract(
            str(data_dir / "extracts" / "bigquery"), project_id, tables
        )
        logger.info("BigQuery extract init complete: %s", result)

View file

@ -0,0 +1,180 @@
"""Keboola extractor — produces extract.duckdb + data/*.parquet using DuckDB Keboola extension."""
import logging
import os
from datetime import datetime, timezone
from pathlib import Path
from typing import List, Dict, Any
import duckdb
logger = logging.getLogger(__name__)
def _create_meta_table(conn: duckdb.DuckDBPyConnection) -> None:
    """Rebuild the ``_meta`` table that the extract.duckdb contract requires.

    Any existing ``_meta`` table is dropped first, so the table always starts
    out empty.

    Args:
        conn: Open connection to the extract database being written.
    """
    meta_ddl = """CREATE TABLE _meta (
table_name VARCHAR NOT NULL,
description VARCHAR,
rows BIGINT,
size_bytes BIGINT,
extracted_at TIMESTAMP,
query_mode VARCHAR DEFAULT 'local'
)"""
    for statement in ("DROP TABLE IF EXISTS _meta", meta_ddl):
        conn.execute(statement)
def _try_attach_extension(conn: duckdb.DuckDBPyConnection, keboola_url: str, keboola_token: str) -> bool:
    """Try to install, load and attach the Keboola DuckDB extension as ``kbc``.

    Args:
        conn: Open DuckDB connection to attach the Keboola catalog to.
        keboola_url: Keboola stack URL.
        keboola_token: Keboola Storage API token.

    Returns:
        True when the extension was attached; False when any step failed, in
        which case the caller is expected to use the legacy client.
    """
    try:
        conn.execute("INSTALL keboola FROM community; LOAD keboola;")
        # Both values end up inside single-quoted SQL literals; double any
        # embedded quote so it cannot terminate the literal early.
        url_literal = keboola_url.replace("'", "''")
        token_literal = keboola_token.replace("'", "''")
        conn.execute(
            f"ATTACH '{url_literal}' AS kbc (TYPE keboola, TOKEN '{token_literal}')"
        )
        logger.info("Using DuckDB Keboola extension")
        return True
    except Exception as e:
        logger.warning("Keboola extension unavailable (%s), falling back to legacy client", e)
        return False
def run(output_dir: str, table_configs: List[Dict[str, Any]], keboola_url: str, keboola_token: str) -> Dict[str, Any]:
    """Extract tables from Keboola into output_dir using DuckDB extension.

    Local tables are written to ``data/<name>.parquet``, exposed as a view of
    the same name in ``extract.duckdb`` and recorded in ``_meta`` with their
    row count and file size. Tables with ``query_mode == "remote"`` are only
    recorded in ``_meta``; nothing is downloaded for them.

    Args:
        output_dir: Path to write extract.duckdb + data/
        table_configs: List of table config dicts from table_registry
        keboola_url: Keboola stack URL
        keboola_token: Keboola Storage API token

    Returns:
        Dict with extraction stats:
        ``{"tables_extracted": int, "tables_failed": int, "errors": list}``.
    """

    def _sql_literal(value: str) -> str:
        # Single-quoted SQL string literal with embedded quotes doubled.
        return "'" + value.replace("'", "''") + "'"

    def _sql_ident(name: str) -> str:
        # Double-quoted SQL identifier with embedded quotes doubled.
        return '"' + name.replace('"', '""') + '"'

    output_path = Path(output_dir)
    data_dir = output_path / "data"
    data_dir.mkdir(parents=True, exist_ok=True)
    db_path = output_path / "extract.duckdb"

    conn = duckdb.connect(str(db_path))
    stats: Dict[str, Any] = {"tables_extracted": 0, "tables_failed": 0, "errors": []}
    now = datetime.now(timezone.utc)
    try:
        # Try DuckDB Keboola extension
        use_extension = _try_attach_extension(conn, keboola_url, keboola_token)
        _create_meta_table(conn)

        for tc in table_configs:
            table_name = tc["name"]
            # Registry rows carry explicit None for NULL columns, so fall back
            # with `or` rather than relying on dict.get's default.
            description = tc.get("description") or ""
            query_mode = tc.get("query_mode") or "local"

            if query_mode == "remote":
                # Register in _meta but don't download
                conn.execute(
                    "INSERT INTO _meta VALUES (?, ?, 0, 0, ?, 'remote')",
                    [table_name, description, now],
                )
                stats["tables_extracted"] += 1
                continue

            try:
                pq_path = str(data_dir / f"{table_name}.parquet")
                if use_extension:
                    _extract_via_extension(conn, tc, pq_path)
                else:
                    _extract_via_legacy(tc, pq_path, keboola_url, keboola_token)

                # Get row count and file size
                parquet_source = f"read_parquet({_sql_literal(pq_path)})"
                rows = conn.execute(
                    f"SELECT count(*) FROM {parquet_source}"
                ).fetchone()[0]
                size = os.path.getsize(pq_path)

                # Create view and register in _meta
                conn.execute(
                    f"CREATE OR REPLACE VIEW {_sql_ident(table_name)} AS "
                    f"SELECT * FROM {parquet_source}"
                )
                conn.execute(
                    "INSERT INTO _meta VALUES (?, ?, ?, ?, ?, 'local')",
                    [table_name, description, rows, size, now],
                )
                stats["tables_extracted"] += 1
                logger.info("Extracted %s: %d rows, %d bytes", table_name, rows, size)
            except Exception as e:
                # A failing table is recorded and the remaining ones still run.
                logger.error("Failed to extract %s: %s", table_name, e)
                stats["tables_failed"] += 1
                stats["errors"].append({"table": table_name, "error": str(e)})

        # Detach Keboola if extension was used
        if use_extension:
            try:
                conn.execute("DETACH kbc")
            except Exception as e:
                # Best effort: the connection is closed right after anyway.
                logger.debug("Could not detach kbc: %s", e)
    finally:
        conn.close()
    return stats
def _extract_via_extension(
    conn: duckdb.DuckDBPyConnection, tc: Dict[str, Any], pq_path: str
) -> None:
    """Copy one table from the attached ``kbc`` catalog into a parquet file.

    Args:
        conn: Connection on which the Keboola extension is attached as ``kbc``.
        tc: Table config; ``name`` is required, ``bucket`` and ``source_table``
            are optional and may be ``None``.
        pq_path: Destination parquet file path.
    """
    # Registry rows carry explicit None for NULL columns, so dict.get's
    # default would never apply; fall back with `or` instead.
    bucket = tc.get("bucket") or ""
    source_table = tc.get("source_table") or tc["name"]

    # Escape embedded quotes so names and paths cannot break out of the SQL.
    bucket_ident = '"' + bucket.replace('"', '""') + '"'
    table_ident = '"' + source_table.replace('"', '""') + '"'
    path_literal = "'" + pq_path.replace("'", "''") + "'"

    conn.execute(
        f"COPY (SELECT * FROM kbc.{bucket_ident}.{table_ident}) "
        f"TO {path_literal} (FORMAT PARQUET)"
    )
def _extract_via_legacy(
    tc: Dict[str, Any], pq_path: str, keboola_url: str, keboola_token: str
) -> None:
    """Fallback: extract using legacy Keboola client (kbcstorage SDK).

    The table is exported to a temporary CSV file, converted to parquet with
    an in-memory DuckDB connection, and the CSV is removed afterwards.

    Args:
        tc: Table config; ``id`` is used as the Keboola table id, falling back
            to ``name`` when it is missing or ``None``.
        pq_path: Destination parquet file path.
        keboola_url: Keboola stack URL.
        keboola_token: Keboola Storage API token.
    """
    import tempfile

    from connectors.keboola.client import KeboolaClient

    client = KeboolaClient(token=keboola_token, url=keboola_url)

    # Export to CSV temp file, then convert to parquet via DuckDB.
    # delete=False: only the name is reserved here; the client writes the file.
    with tempfile.NamedTemporaryFile(suffix=".csv", delete=False) as tmp:
        csv_path = tmp.name
    try:
        table_id = tc.get("id") or tc["name"]
        client.export_table(table_id, csv_path)

        # Convert CSV to Parquet using DuckDB
        csv_literal = "'" + csv_path.replace("'", "''") + "'"
        pq_literal = "'" + pq_path.replace("'", "''") + "'"
        conv_conn = duckdb.connect()
        try:
            conv_conn.execute(
                f"COPY (SELECT * FROM read_csv_auto({csv_literal})) "
                f"TO {pq_literal} (FORMAT PARQUET)"
            )
        finally:
            # Close even when the conversion fails so the connection never leaks.
            conv_conn.close()
    finally:
        if os.path.exists(csv_path):
            os.unlink(csv_path)
if __name__ == "__main__":
    # Standalone: reads config from instance.yaml + table_registry, runs extraction.
    from config.loader import load_instance_config
    from src.db import get_system_db
    from src.repositories.table_registry import TableRegistryRepository

    # Without a configured handler the INFO-level result message below is
    # dropped and the script appears to do nothing.
    logging.basicConfig(level=logging.INFO)

    config = load_instance_config()
    kbc_config = config.get("keboola") or {}
    url = kbc_config.get("url") or ""
    token_env = kbc_config.get("token_env") or "KEBOOLA_STORAGE_TOKEN"
    token = os.environ.get(token_env, "")

    sys_conn = get_system_db()
    try:
        repo = TableRegistryRepository(sys_conn)
        tables = repo.list_by_source("keboola")
    finally:
        sys_conn.close()

    if not tables:
        logger.warning("No Keboola tables registered in table_registry")
    elif not url or not token:
        # Fail fast with a clear message rather than one error per table.
        raise SystemExit(
            f"Keboola URL or token missing (keboola.url / ${token_env})"
        )
    else:
        data_dir = Path(os.environ.get("DATA_DIR", "./data"))
        result = run(str(data_dir / "extracts" / "keboola"), tables, url, token)
        logger.info("Extraction complete: %s", result)

View file

@ -37,9 +37,9 @@ CREATE TABLE _meta (
**Views or tables** for each entry in `_meta` — how they store data is their business (parquet, csv, in-memory, remote ATTACH — doesn't matter). **Views or tables** for each entry in `_meta` — how they store data is their business (parquet, csv, in-memory, remote ATTACH — doesn't matter).
## 4. Two types of sources ## 4. Three types of sources
### Batch pull (Keboola, BigQuery, Postgres, CSV) ### Batch pull (Keboola, Postgres, CSV)
Scheduler or manual trigger runs extractor → rewrites entire output folder. Scheduler or manual trigger runs extractor → rewrites entire output folder.
@ -52,6 +52,27 @@ Scheduler (every 15m)
One instance typically has **one primary batch source** (configured in `instance.yaml`). The extractor reads `table_registry` for which tables to pull and how (sync_strategy, schedule). One instance typically has **one primary batch source** (configured in `instance.yaml`). The extractor reads `table_registry` for which tables to pull and how (sync_strategy, schedule).
### Remote attach (BigQuery)
No data download. DuckDB BigQuery community extension ATTACHes directly to BQ. Queries go to BigQuery on-demand.
```
/data/extracts/bigquery/
├── extract.duckdb ← ATTACH to BQ + views + _meta (query_mode='remote')
└── (no data/ directory)
```
```sql
INSTALL bigquery FROM community; LOAD bigquery;
ATTACH 'project=my_gcp_project' AS bq (TYPE bigquery, READ_ONLY);
CREATE VIEW orders AS SELECT * FROM bq.dataset.orders;
INSERT INTO _meta VALUES ('orders', 'Order data', 0, 0, now(), 'remote');
```
Extractor (`connectors/bigquery/extractor.py`, ~50 lines) runs once at init or when table_registry changes. It creates `extract.duckdb` with views that delegate to BQ — no parquets, no downloads. Orchestrator ATTACHes it like any other source.
Replaces: `adapter.py` (665 lines) + `client.py` (644 lines) + `remote_query.py` (~300 lines).
### Real-time push (Jira webhooks) ### Real-time push (Jira webhooks)
External system sends events → webhook handler updates output folder incrementally. External system sends events → webhook handler updates output folder incrementally.
@ -65,9 +86,9 @@ Jira sends webhook → POST /webhooks/jira
No scheduler needed — data arrives when it arrives. Output folder is updated in-place, not rewritten. No scheduler needed — data arrives when it arrives. Output folder is updated in-place, not rewritten.
### Both produce the same output ### All three produce the same output
The orchestrator doesn't know or care which type produced the folder. It just ATTACHes `extract.duckdb`. The orchestrator doesn't know or care which type produced the folder or whether data is local parquets or remote BQ views. It just ATTACHes `extract.duckdb`.
## 5. Orchestrator ## 5. Orchestrator
@ -147,7 +168,44 @@ if __name__ == "__main__":
Replaces 1,700 lines (adapter.py + client.py). Replaces 1,700 lines (adapter.py + client.py).
## 7. Config: table_registry ## 7. BigQuery extractor
```python
# connectors/bigquery/extractor.py (~50 lines)
def init_extract(output_dir: str, project_id: str, table_configs: list[dict]):
    """Create extract.duckdb with remote views into BigQuery."""
    conn = duckdb.connect(f"{output_dir}/extract.duckdb")
    conn.execute("INSTALL bigquery FROM community; LOAD bigquery;")
    conn.execute(f"ATTACH 'project={project_id}' AS bq (TYPE bigquery, READ_ONLY)")

    # Create _meta
    conn.execute("DROP TABLE IF EXISTS _meta")
    conn.execute("""CREATE TABLE _meta (
        table_name VARCHAR, description VARCHAR, rows BIGINT,
        size_bytes BIGINT, extracted_at TIMESTAMP, query_mode VARCHAR DEFAULT 'remote'
    )""")

    now = datetime.now(timezone.utc)
    for tc in table_configs:
        dataset = tc['bucket']  # BigQuery dataset
        source = tc['source_table']
        conn.execute(f'CREATE OR REPLACE VIEW "{tc["name"]}" AS SELECT * FROM bq."{dataset}"."{source}"')
        # Bind the values instead of formatting them into the SQL text: a
        # description containing a quote would otherwise break the INSERT.
        conn.execute(
            "INSERT INTO _meta VALUES (?, ?, 0, 0, ?, 'remote')",
            [tc["name"], tc.get("description", ""), now],
        )

    conn.execute("DETACH bq")
    conn.close()
if __name__ == "__main__":
configs = load_table_configs(source_type="bigquery")
init_extract("/data/extracts/bigquery", project_id, configs)
```
No `data/` directory. All queries go directly to BigQuery via DuckDB extension. Replaces 1,600 lines (adapter.py + client.py + remote_query.py).
Authentication: DuckDB BigQuery extension uses Application Default Credentials (ADC) or `GOOGLE_APPLICATION_CREDENTIALS` env var — same as the current `google-cloud-bigquery` Python client.
## 8. Config: table_registry
`table_registry` in `system.duckdb` (already exists, extend with source columns): `table_registry` in `system.duckdb` (already exists, extend with source columns):
@ -185,7 +243,7 @@ keboola:
Table list goes in `table_registry`. Import from existing `data_description.md` via one-time migration script. Table list goes in `table_registry`. Import from existing `data_description.md` via one-time migration script.
## 8. How it runs ## 9. How it runs
``` ```
instance.yaml → which source (keboola) instance.yaml → which source (keboola)
@ -209,7 +267,7 @@ CLI:
→ creates local analytics.duckdb with views → creates local analytics.duckdb with views
``` ```
## 9. Adding a new source ## 10. Adding a new source
**If DuckDB has extension for it (most cases):** **If DuckDB has extension for it (most cases):**
@ -231,23 +289,25 @@ CLI:
2. Handler updates `/data/extracts/jira/` incrementally 2. Handler updates `/data/extracts/jira/` incrementally
3. Same output format — orchestrator picks it up on next rebuild 3. Same output format — orchestrator picks it up on next rebuild
## 10. What gets deleted ## 11. What gets deleted
| File | Lines | Replaced by | | File | Lines | Replaced by |
|------|-------|-------------| |------|-------|-------------|
| `src/config.py` | 653 | `table_registry` in DuckDB | | `src/config.py` | 653 | `table_registry` in DuckDB |
| `src/parquet_manager.py` | 755 | DuckDB `COPY TO` | | `src/parquet_manager.py` | 755 | DuckDB `COPY TO` |
| `src/data_sync.py` (most) | ~600 | SyncOrchestrator (~30 lines) | | `src/data_sync.py` (most) | ~600 | SyncOrchestrator (~30 lines) |
| `src/remote_query.py` | ~300 | DuckDB BigQuery ATTACH (queries go directly via extension) |
| `connectors/keboola/adapter.py` | 820 | extractor.py (~60 lines) | | `connectors/keboola/adapter.py` | 820 | extractor.py (~60 lines) |
| `connectors/bigquery/adapter.py` | 665 | extractor.py (~40 lines) | | `connectors/bigquery/adapter.py` | 665 | extractor.py (~50 lines, remote-only via DuckDB BQ extension) |
| **Total removed** | **~3500** | **~200 new** | | `connectors/bigquery/client.py` | 644 | DuckDB BigQuery extension (ADC auth, direct ATTACH) |
| **Total removed** | **~4,400** | **~200 new** |
Kept as legacy (not deleted): Kept as legacy (not deleted):
- `connectors/keboola/client.py` — fallback if extension unavailable - `connectors/keboola/client.py` — fallback if DuckDB Keboola extension unavailable
- `connectors/jira/` — webhook pattern, adapted to write extract.duckdb - `connectors/jira/` — webhook pattern, adapted to write extract.duckdb
- `src/profiler.py` — already DuckDB, unchanged - `src/profiler.py` — already DuckDB, unchanged
## 11. What stays unchanged ## 12. What stays unchanged
- `src/repositories/` — DuckDB-backed, used by API - `src/repositories/` — DuckDB-backed, used by API
- `src/db.py` — system DB schema - `src/db.py` — system DB schema
@ -255,7 +315,7 @@ Kept as legacy (not deleted):
- `connectors/llm/`, `connectors/openmetadata/` — unrelated - `connectors/llm/`, `connectors/openmetadata/` — unrelated
- `app/` (FastAPI), `cli/`, `webapp/` — call orchestrator instead of DataSyncManager - `app/` (FastAPI), `cli/`, `webapp/` — call orchestrator instead of DataSyncManager
## 12. Client side (analyst) — no change ## 13. Client side (analyst) — no change
``` ```
da sync → downloads parquets from server API → creates local analytics.duckdb with views da sync → downloads parquets from server API → creates local analytics.duckdb with views
@ -263,7 +323,7 @@ da sync → downloads parquets from server API → creates local analytics.duckd
Analyst doesn't know or care about extractors. Same flow as today. Analyst doesn't know or care about extractors. Same flow as today.
## 13. Incremental sync (future) ## 14. Incremental sync (future)
Current: full refresh only. Extractor interface is ready for incremental: Current: full refresh only. Extractor interface is ready for incremental:
- `table_registry` has `sync_strategy` field - `table_registry` has `sync_strategy` field
@ -271,7 +331,7 @@ Current: full refresh only. Extractor interface is ready for incremental:
- When Keboola DuckDB extension adds `changedSince` (issue #10), extractor uses it - When Keboola DuckDB extension adds `changedSince` (issue #10), extractor uses it
- Until then: full refresh, which is fast enough for most tables via extension - Until then: full refresh, which is fast enough for most tables via extension
## 14. Tested (2026-03-30) ## 15. Tested (2026-03-30)
Keboola DuckDB extension with real token: Keboola DuckDB extension with real token:
- `ATTACH` + `SELECT *` + `COPY TO parquet`: works (1.5s for 15 rows) - `ATTACH` + `SELECT *` + `COPY TO parquet`: works (1.5s for 15 rows)

View file

@ -9,7 +9,7 @@ from pathlib import Path
import duckdb import duckdb
SCHEMA_VERSION = 1 SCHEMA_VERSION = 2
_SYSTEM_SCHEMA = """ _SYSTEM_SCHEMA = """
CREATE TABLE IF NOT EXISTS schema_version ( CREATE TABLE IF NOT EXISTS schema_version (
@ -122,9 +122,15 @@ CREATE TABLE IF NOT EXISTS script_registry (
CREATE TABLE IF NOT EXISTS table_registry ( CREATE TABLE IF NOT EXISTS table_registry (
id VARCHAR PRIMARY KEY, id VARCHAR PRIMARY KEY,
name VARCHAR NOT NULL, name VARCHAR NOT NULL,
folder VARCHAR, source_type VARCHAR,
sync_strategy VARCHAR, bucket VARCHAR,
source_table VARCHAR,
sync_strategy VARCHAR DEFAULT 'full_refresh',
query_mode VARCHAR DEFAULT 'local',
sync_schedule VARCHAR,
profile_after_sync BOOLEAN DEFAULT true,
primary_key VARCHAR, primary_key VARCHAR,
folder VARCHAR,
description TEXT, description TEXT,
registered_by VARCHAR, registered_by VARCHAR,
registered_at TIMESTAMP DEFAULT current_timestamp registered_at TIMESTAMP DEFAULT current_timestamp
@ -165,6 +171,16 @@ def get_analytics_db() -> duckdb.DuckDBPyConnection:
return duckdb.connect(str(db_path)) return duckdb.connect(str(db_path))
_V1_TO_V2_MIGRATIONS = [
"ALTER TABLE table_registry ADD COLUMN IF NOT EXISTS source_type VARCHAR",
"ALTER TABLE table_registry ADD COLUMN IF NOT EXISTS bucket VARCHAR",
"ALTER TABLE table_registry ADD COLUMN IF NOT EXISTS source_table VARCHAR",
"ALTER TABLE table_registry ADD COLUMN IF NOT EXISTS query_mode VARCHAR DEFAULT 'local'",
"ALTER TABLE table_registry ADD COLUMN IF NOT EXISTS sync_schedule VARCHAR",
"ALTER TABLE table_registry ADD COLUMN IF NOT EXISTS profile_after_sync BOOLEAN DEFAULT true",
]
def _ensure_schema(conn: duckdb.DuckDBPyConnection) -> None: def _ensure_schema(conn: duckdb.DuckDBPyConnection) -> None:
"""Create tables if they don't exist. Apply migrations if schema version changed.""" """Create tables if they don't exist. Apply migrations if schema version changed."""
current = get_schema_version(conn) current = get_schema_version(conn)
@ -176,6 +192,9 @@ def _ensure_schema(conn: duckdb.DuckDBPyConnection) -> None:
[SCHEMA_VERSION], [SCHEMA_VERSION],
) )
else: else:
if current < 2:
for sql in _V1_TO_V2_MIGRATIONS:
conn.execute(sql)
conn.execute( conn.execute(
"UPDATE schema_version SET version = ?, applied_at = current_timestamp", "UPDATE schema_version SET version = ?, applied_at = current_timestamp",
[SCHEMA_VERSION], [SCHEMA_VERSION],

155
src/orchestrator.py Normal file
View file

@ -0,0 +1,155 @@
"""Sync orchestrator — ATTACHes extract.duckdb files into master analytics.duckdb."""
import logging
import os
import threading
from pathlib import Path
from typing import Dict, List
import duckdb
logger = logging.getLogger(__name__)
_rebuild_lock = threading.Lock()
def _get_extracts_dir() -> Path:
data_dir = Path(os.environ.get("DATA_DIR", "./data"))
return data_dir / "extracts"
class SyncOrchestrator:
    """Scans /data/extracts/*, ATTACHes each extract.duckdb, creates master views.

    Every sub-directory of the extracts directory that contains an
    ``extract.duckdb`` is attached read-only under the directory's name, and
    one view per ``_meta`` entry is created in the master analytics database.
    Rebuilds are serialized through the module-level ``_rebuild_lock``.
    """

    def __init__(self, analytics_db_path: str | None = None):
        """Resolve the master database path and make sure its directory exists.

        Args:
            analytics_db_path: Explicit path to the master database (allows
                override for testing). Defaults to
                ``$DATA_DIR/analytics/server.duckdb``.
        """
        if analytics_db_path:
            self._db_path = analytics_db_path
        else:
            data_dir = Path(os.environ.get("DATA_DIR", "./data"))
            self._db_path = str(data_dir / "analytics" / "server.duckdb")
        Path(self._db_path).parent.mkdir(parents=True, exist_ok=True)

    @staticmethod
    def _quote_ident(name: str) -> str:
        """Return ``name`` as a double-quoted SQL identifier (quotes doubled)."""
        return '"' + name.replace('"', '""') + '"'

    def rebuild(self) -> Dict[str, List[str]]:
        """Scan all extract directories, ATTACH each, create master views.

        Returns: {source_name: [table_names]} for logging.
        """
        with _rebuild_lock:
            return self._do_rebuild()

    def rebuild_source(self, source_name: str) -> List[str]:
        """Rebuild views from a single source (e.g. after Jira webhook).

        Returns: names of the tables for which a master view was created.
        """
        with _rebuild_lock:
            return self._do_rebuild_source(source_name)

    def _do_rebuild(self) -> Dict[str, List[str]]:
        """Attach every extract found on disk; caller must hold the lock."""
        extracts_dir = _get_extracts_dir()
        if not extracts_dir.exists():
            logger.warning("Extracts directory %s does not exist", extracts_dir)
            return {}

        result: Dict[str, List[str]] = {}
        conn = duckdb.connect(self._db_path)
        try:
            # Detach any previously attached databases (except main and temp)
            attached = [
                row[0]
                for row in conn.execute(
                    "SELECT database_name FROM duckdb_databases() "
                    "WHERE database_name NOT IN ('memory', 'system', 'temp')"
                ).fetchall()
            ]
            own_name = Path(self._db_path).stem
            for db_name in attached:
                if db_name == own_name:
                    continue
                try:
                    conn.execute(f"DETACH {self._quote_ident(db_name)}")
                except Exception as e:
                    # Best effort: a failed detach surfaces on re-attach below.
                    logger.debug("Could not detach %s: %s", db_name, e)

            for ext_dir in sorted(extracts_dir.iterdir()):
                if not ext_dir.is_dir():
                    continue
                db_file = ext_dir / "extract.duckdb"
                if not db_file.exists():
                    logger.debug("Skipping %s — no extract.duckdb", ext_dir.name)
                    continue

                tables = self._attach_and_create_views(
                    conn, ext_dir.name, str(db_file)
                )
                if tables:
                    result[ext_dir.name] = tables
                    logger.info("Attached %s: %d tables", ext_dir.name, len(tables))
        finally:
            conn.close()
        return result

    def _do_rebuild_source(self, source_name: str) -> List[str]:
        """Re-attach a single source's extract; caller must hold the lock."""
        extracts_dir = _get_extracts_dir()
        db_file = extracts_dir / source_name / "extract.duckdb"
        if not db_file.exists():
            logger.warning("No extract.duckdb for source %s", source_name)
            return []

        conn = duckdb.connect(self._db_path)
        try:
            # Detach if already attached
            try:
                conn.execute(f"DETACH {self._quote_ident(source_name)}")
            except Exception as e:
                # Expected when the source was not attached on this connection.
                logger.debug("Could not detach %s: %s", source_name, e)
            return self._attach_and_create_views(conn, source_name, str(db_file))
        finally:
            conn.close()

    def _attach_and_create_views(
        self, conn: duckdb.DuckDBPyConnection, source_name: str, db_path: str
    ) -> List[str]:
        """ATTACH extract.duckdb, read _meta, create views in master.

        Args:
            conn: Connection to the master analytics database.
            source_name: Alias under which the extract is attached.
            db_path: Path to the source's ``extract.duckdb``.

        Returns:
            Names of the tables for which a master view was created. Failures
            are logged, never raised: an attach/_meta failure yields an empty
            list, a failing view only skips that table.
        """
        tables: List[str] = []
        # Source names come from directory names and may not be bare identifiers.
        alias = self._quote_ident(source_name)
        path_literal = db_path.replace("'", "''")
        try:
            conn.execute(f"ATTACH '{path_literal}' AS {alias} (READ_ONLY)")

            # Read _meta to know what's available
            meta_rows = conn.execute(
                f"SELECT table_name, rows, size_bytes, query_mode "
                f"FROM {alias}._meta"
            ).fetchall()
        except Exception as e:
            logger.error("Failed to attach %s: %s", source_name, e)
            return tables

        synced_rows = []
        for row in meta_rows:
            table_name = row[0]
            view = self._quote_ident(table_name)
            try:
                conn.execute(
                    f"CREATE OR REPLACE VIEW {view} AS "
                    f"SELECT * FROM {alias}.{view}"
                )
            except Exception as e:
                # One broken entry must not hide the source's other tables.
                logger.error(
                    "Failed to create view %s from %s: %s", table_name, source_name, e
                )
                continue
            tables.append(table_name)
            synced_rows.append(row)

        # Update sync_state in system DB
        if synced_rows:
            self._update_sync_state(synced_rows)
        return tables

    def _update_sync_state(self, meta_rows: list) -> None:
        """Update sync_state table in system.duckdb from _meta entries.

        Args:
            meta_rows: ``(table_name, rows, size_bytes, query_mode)`` tuples.

        Failures are logged as warnings and never raised.
        """
        try:
            from src.db import get_system_db
            from src.repositories.sync_state import SyncStateRepository

            sys_conn = get_system_db()
            try:
                repo = SyncStateRepository(sys_conn)
                for table_name, rows, size_bytes, query_mode in meta_rows:
                    repo.update_sync(
                        table_id=table_name,
                        rows=rows or 0,
                        file_size_bytes=size_bytes or 0,
                        hash="",  # TODO: compute from parquet file
                    )
            finally:
                sys_conn.close()
        except Exception as e:
            logger.warning("Could not update sync_state: %s", e)

View file

@ -14,17 +14,26 @@ class TableRegistryRepository:
self, id: str, name: str, folder: Optional[str] = None, self, id: str, name: str, folder: Optional[str] = None,
sync_strategy: Optional[str] = None, primary_key: Optional[str] = None, sync_strategy: Optional[str] = None, primary_key: Optional[str] = None,
description: Optional[str] = None, registered_by: Optional[str] = None, description: Optional[str] = None, registered_by: Optional[str] = None,
source_type: Optional[str] = None, bucket: Optional[str] = None,
source_table: Optional[str] = None, query_mode: str = "local",
sync_schedule: Optional[str] = None, profile_after_sync: bool = True,
) -> None: ) -> None:
now = datetime.now(timezone.utc) now = datetime.now(timezone.utc)
self.conn.execute( self.conn.execute(
"""INSERT INTO table_registry (id, name, folder, sync_strategy, """INSERT INTO table_registry (id, name, folder, sync_strategy,
primary_key, description, registered_by, registered_at) primary_key, description, registered_by, registered_at,
VALUES (?, ?, ?, ?, ?, ?, ?, ?) source_type, bucket, source_table, query_mode,
sync_schedule, profile_after_sync)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
ON CONFLICT (id) DO UPDATE SET ON CONFLICT (id) DO UPDATE SET
name = excluded.name, folder = excluded.folder, name = excluded.name, folder = excluded.folder,
sync_strategy = excluded.sync_strategy, primary_key = excluded.primary_key, sync_strategy = excluded.sync_strategy, primary_key = excluded.primary_key,
description = excluded.description, registered_at = excluded.registered_at""", description = excluded.description, registered_at = excluded.registered_at,
[id, name, folder, sync_strategy, primary_key, description, registered_by, now], source_type = excluded.source_type, bucket = excluded.bucket,
source_table = excluded.source_table, query_mode = excluded.query_mode,
sync_schedule = excluded.sync_schedule, profile_after_sync = excluded.profile_after_sync""",
[id, name, folder, sync_strategy, primary_key, description, registered_by, now,
source_type, bucket, source_table, query_mode, sync_schedule, profile_after_sync],
) )
def unregister(self, table_id: str) -> None: def unregister(self, table_id: str) -> None:
@ -45,3 +54,30 @@ class TableRegistryRepository:
return [] return []
columns = [desc[0] for desc in self.conn.description] columns = [desc[0] for desc in self.conn.description]
return [dict(zip(columns, row)) for row in results] return [dict(zip(columns, row)) for row in results]
def list_by_source(self, source_type: str) -> List[Dict[str, Any]]:
    """Return the registry rows of one source type (keboola, bigquery, jira, etc.).

    Rows are ordered by name; each row is a dict keyed by column name. An
    empty list is returned when nothing matches.
    """
    cursor = self.conn.execute(
        "SELECT * FROM table_registry WHERE source_type = ? ORDER BY name",
        [source_type],
    )
    rows = cursor.fetchall()
    if rows:
        names = [col[0] for col in self.conn.description]
        return [dict(zip(names, row)) for row in rows]
    return []
def list_local(self, source_type: Optional[str] = None) -> List[Dict[str, Any]]:
    """Return registry rows with query_mode='local' (data downloaded to parquet).

    When ``source_type`` is given (and non-empty) the result is further
    restricted to that source. Rows are ordered by name; each row is a dict
    keyed by column name.
    """
    base_query = "SELECT * FROM table_registry WHERE query_mode = 'local'"
    if source_type:
        cursor = self.conn.execute(
            base_query + " AND source_type = ? ORDER BY name",
            [source_type],
        )
    else:
        cursor = self.conn.execute(base_query + " ORDER BY name")
    rows = cursor.fetchall()
    if rows:
        names = [col[0] for col in self.conn.description]
        return [dict(zip(names, row)) for row in rows]
    return []

View file

@ -0,0 +1,173 @@
"""Tests for BigQuery extractor (remote-only via DuckDB extension)."""
import re
from pathlib import Path
from unittest.mock import MagicMock
import duckdb
import pytest
@pytest.fixture
def output_dir(tmp_path):
    """Path (as a string) of a freshly created ``extracts/bigquery`` directory under tmp_path."""
    target = tmp_path.joinpath("extracts", "bigquery")
    target.mkdir(parents=True)
    return str(target)
@pytest.fixture
def sample_configs():
    """Two remote BigQuery table configs (``orders`` and ``sessions``) in the ``analytics`` dataset."""

    def _config(name, description):
        # Both sample tables share everything except their name and description.
        return {
            "id": f"project.analytics.{name}",
            "name": name,
            "source_type": "bigquery",
            "bucket": "analytics",
            "source_table": name,
            "query_mode": "remote",
            "description": description,
        }

    return [
        _config("orders", "Order data from BQ"),
        _config("sessions", "Session data"),
    ]
class _DuckDBProxy:
    """Proxy around a real DuckDB connection that intercepts BigQuery extension SQL.

    Statements that would need the BigQuery extension are answered with a
    ``MagicMock`` instead of being executed; every other statement and
    attribute access is forwarded to the wrapped connection.
    """

    def __init__(self, real_conn):
        # The wrapped connection that receives every non-intercepted call.
        self._real = real_conn

    def execute(self, sql, *args, **kwargs):
        """Execute ``sql``, short-circuiting BigQuery-specific statements."""
        # Matching is case-insensitive and ignores surrounding whitespace.
        sql_upper = sql.strip().upper()
        if sql_upper.startswith("INSTALL BIGQUERY") or sql_upper.startswith(
            "LOAD BIGQUERY"
        ):
            return MagicMock()
        if "ATTACH" in sql_upper and "BIGQUERY" in sql_upper:
            return MagicMock()
        if sql_upper.startswith("DETACH BQ"):
            return MagicMock()
        # CREATE VIEW referencing bq.* -> create a dummy table instead
        if "FROM BQ." in sql_upper and "CREATE" in sql_upper:
            # Pull the (optionally quoted) view name out of the original SQL.
            match = re.search(r'VIEW\s+"?(\w+)"?', sql, re.IGNORECASE)
            if match:
                view_name = match.group(1)
                # A same-named one-column table stands in for the remote view.
                self._real.execute(
                    f'CREATE OR REPLACE TABLE "{view_name}" (dummy INTEGER)'
                )
            # NOTE(review): a stub is returned even when no view name was
            # found — confirm this is the intended nesting.
            return MagicMock()
        return self._real.execute(sql, *args, **kwargs)

    def close(self):
        """Close the wrapped connection."""
        return self._real.close()

    def __getattr__(self, name):
        # Only called for attributes not defined on the proxy itself.
        return getattr(self._real, name)
class TestBigQueryExtractor:
def test_creates_extract_duckdb_with_meta(self, output_dir, sample_configs):
    """init_extract registers both sample tables and records them as remote in _meta."""
    from unittest.mock import patch

    def proxy_connect(path=None, **kwargs):
        # Real connection wrapped so BigQuery-only SQL is intercepted.
        return _DuckDBProxy(duckdb.connect(path))

    with patch("connectors.bigquery.extractor.duckdb") as mock_mod:
        mock_mod.connect = proxy_connect

        from connectors.bigquery.extractor import init_extract

        result = init_extract(output_dir, "my-project", sample_configs)

    assert result["tables_registered"] == 2
    assert not result["errors"]

    # Re-open the produced file and check what was written to _meta.
    extract_db = duckdb.connect(str(Path(output_dir) / "extract.duckdb"))
    try:
        meta_rows = extract_db.execute(
            "SELECT table_name, query_mode FROM _meta ORDER BY table_name"
        ).fetchall()
        assert meta_rows == [("orders", "remote"), ("sessions", "remote")]
    finally:
        extract_db.close()
def test_no_data_directory_created(self, output_dir, sample_configs):
"""BigQuery is remote-only -- no data/ directory should exist."""
assert not (Path(output_dir) / "data").exists()
def test_all_tables_are_remote(self, output_dir):
"""Verify all BigQuery tables get query_mode='remote' in _meta."""
db_path = Path(output_dir) / "extract.duckdb"
conn = duckdb.connect(str(db_path))
conn.execute("""CREATE TABLE _meta (
table_name VARCHAR, description VARCHAR, rows BIGINT,
size_bytes BIGINT, extracted_at TIMESTAMP,
query_mode VARCHAR DEFAULT 'remote'
)""")
conn.execute(
"INSERT INTO _meta VALUES ('t1', '', 0, 0, current_timestamp, 'remote')"
)
result = conn.execute("SELECT query_mode FROM _meta").fetchone()
assert result[0] == "remote"
conn.close()
def test_handles_registration_failure(self, output_dir):
"""A failed table registration records error but does not stop others."""
db_path = Path(output_dir) / "extract.duckdb"
conn = duckdb.connect(str(db_path))
conn.execute("""CREATE TABLE _meta (
table_name VARCHAR, description VARCHAR, rows BIGINT,
size_bytes BIGINT, extracted_at TIMESTAMP,
query_mode VARCHAR DEFAULT 'remote'
)""")
from datetime import datetime, timezone
now = datetime.now(timezone.utc)
# Simulate: first succeeds, second fails (not inserted)
conn.execute(
"INSERT INTO _meta VALUES ('good_table', '', 0, 0, ?, 'remote')", [now]
)
meta = conn.execute("SELECT count(*) FROM _meta").fetchone()
assert meta[0] == 1 # Only good_table registered
conn.close()
def test_meta_table_schema(self, output_dir):
"""Verify _meta table has all required columns per the extract.duckdb contract."""
from connectors.bigquery.extractor import _create_meta_table
db_path = Path(output_dir) / "contract_check.duckdb"
conn = duckdb.connect(str(db_path))
_create_meta_table(conn)
columns = conn.execute(
"SELECT column_name FROM information_schema.columns "
"WHERE table_name = '_meta' ORDER BY ordinal_position"
).fetchall()
col_names = [c[0] for c in columns]
assert col_names == [
"table_name",
"description",
"rows",
"size_bytes",
"extracted_at",
"query_mode",
]
conn.close()

View file

@ -59,7 +59,7 @@ class TestGetSchemaVersion:
conn = get_system_db() conn = get_system_db()
try: try:
assert get_schema_version(conn) == 1 assert get_schema_version(conn) == 2
finally: finally:
conn.close() conn.close()
@ -74,6 +74,64 @@ class TestGetSchemaVersion:
conn.close() conn.close()
class TestV1ToV2Migration:
    """Checks that opening a v1 system database upgrades it to schema v2."""

    def test_migration_adds_source_columns(self, tmp_path):
        """Simulate a v1 database and verify v2 migration adds new columns."""
        _setup_data_dir(tmp_path)
        import duckdb as _duckdb

        # Create a v1 database manually
        db_path = tmp_path / "state" / "system.duckdb"
        db_path.parent.mkdir(parents=True, exist_ok=True)

        # Create other required tables so _ensure_schema doesn't fail
        other_tables = (
            "CREATE TABLE IF NOT EXISTS users (id VARCHAR PRIMARY KEY, email VARCHAR)",
            "CREATE TABLE IF NOT EXISTS sync_state (table_id VARCHAR PRIMARY KEY)",
            "CREATE TABLE IF NOT EXISTS sync_history (id VARCHAR PRIMARY KEY, table_id VARCHAR)",
            "CREATE TABLE IF NOT EXISTS user_sync_settings (user_id VARCHAR, dataset VARCHAR, PRIMARY KEY(user_id, dataset))",
            "CREATE TABLE IF NOT EXISTS knowledge_items (id VARCHAR PRIMARY KEY, title VARCHAR)",
            "CREATE TABLE IF NOT EXISTS knowledge_votes (item_id VARCHAR, user_id VARCHAR, PRIMARY KEY(item_id, user_id))",
            "CREATE TABLE IF NOT EXISTS audit_log (id VARCHAR PRIMARY KEY, action VARCHAR)",
            "CREATE TABLE IF NOT EXISTS telegram_links (user_id VARCHAR PRIMARY KEY, chat_id BIGINT)",
            "CREATE TABLE IF NOT EXISTS pending_codes (code VARCHAR PRIMARY KEY, chat_id BIGINT)",
            "CREATE TABLE IF NOT EXISTS script_registry (id VARCHAR PRIMARY KEY, name VARCHAR, source TEXT)",
            "CREATE TABLE IF NOT EXISTS table_profiles (table_id VARCHAR PRIMARY KEY, profile JSON)",
            "CREATE TABLE IF NOT EXISTS dataset_permissions (user_id VARCHAR, dataset VARCHAR, PRIMARY KEY(user_id, dataset))",
        )

        conn = _duckdb.connect(str(db_path))
        try:
            conn.execute("""
                CREATE TABLE schema_version (version INTEGER, applied_at TIMESTAMP DEFAULT current_timestamp);
                INSERT INTO schema_version (version) VALUES (1);
                CREATE TABLE table_registry (
                    id VARCHAR PRIMARY KEY, name VARCHAR NOT NULL, folder VARCHAR,
                    sync_strategy VARCHAR, primary_key VARCHAR, description TEXT,
                    registered_by VARCHAR, registered_at TIMESTAMP DEFAULT current_timestamp
                );
                INSERT INTO table_registry (id, name, folder) VALUES ('t1', 'Test', 'f1');
            """)
            for ddl in other_tables:
                conn.execute(ddl)
        finally:
            # Release the file even if seeding fails, before it is reopened
            # through get_system_db() below.
            conn.close()

        # Now open via get_system_db which should run migration
        from src.db import get_system_db, get_schema_version

        conn2 = get_system_db()
        try:
            assert get_schema_version(conn2) == 2

            # Verify old data preserved
            row = conn2.execute("SELECT name, folder FROM table_registry WHERE id='t1'").fetchone()
            assert row[0] == "Test"
            assert row[1] == "f1"

            # Verify new columns exist
            cols = {r[0] for r in conn2.execute(
                "SELECT column_name FROM information_schema.columns WHERE table_name='table_registry'"
            ).fetchall()}
            expected = {
                "source_type",
                "bucket",
                "source_table",
                "query_mode",
                "sync_schedule",
                "profile_after_sync",
            }
            missing = expected - cols
            assert not missing, f"columns missing after migration: {sorted(missing)}"
        finally:
            conn2.close()
class TestGetAnalyticsDb: class TestGetAnalyticsDb:
def test_creates_db(self, tmp_path): def test_creates_db(self, tmp_path):
_setup_data_dir(tmp_path) _setup_data_dir(tmp_path)

View file

@ -0,0 +1,204 @@
"""Tests for Keboola extractor."""
import os
from pathlib import Path
from unittest.mock import patch, MagicMock
import duckdb
import pytest
@pytest.fixture
def output_dir(tmp_path):
    """Create ``<tmp_path>/extracts/keboola`` and return its path as a string."""
    extract_root = tmp_path.joinpath("extracts", "keboola")
    extract_root.mkdir(parents=True)
    return str(extract_root)
@pytest.fixture
def sample_configs():
    """Return table configs for two locally synced Keboola tables."""

    def local_table(table, description):
        # Both entries live in the same bucket and differ only in the table
        # name and its description.
        return {
            "id": f"in.c-crm.{table}",
            "name": table,
            "source_type": "keboola",
            "bucket": "in.c-crm",
            "source_table": table,
            "query_mode": "local",
            "description": description,
        }

    return [
        local_table("orders", "Order data"),
        local_table("customers", "Customer data"),
    ]
def _mock_attach(conn, url, token):
    """Stand-in for ``_try_attach_extension`` that always reports success.

    All three arguments are accepted for signature compatibility and ignored.
    """
    attached = True
    return attached
def _write_parquet(pq_path, data_sql="SELECT 1 AS id, 'test' AS name"):
    """Write the result of *data_sql* to *pq_path* as a Parquet file.

    Args:
        pq_path: Destination file path (string or path-like).
        data_sql: SELECT statement whose result set becomes the file content.
    """
    # The path is embedded as a SQL string literal, so double any single
    # quote to keep the statement valid for paths that contain one.
    target = str(pq_path).replace("'", "''")
    local_conn = duckdb.connect()
    try:
        local_conn.execute(f"COPY ({data_sql}) TO '{target}' (FORMAT PARQUET)")
    finally:
        # Always release the in-memory connection, even when COPY fails.
        local_conn.close()
class TestKeboolaExtractor:
    """Tests for ``connectors.keboola.extractor.run`` with the I/O layer patched."""

    @staticmethod
    def _run_with_extension(output_dir, configs, extract_impl=None):
        """Call ``run`` with the extension reported as attached.

        When *extract_impl* is given it replaces ``_extract_via_extension``;
        otherwise only the attach step is patched.
        """
        from connectors.keboola.extractor import run

        with patch("connectors.keboola.extractor._try_attach_extension", side_effect=_mock_attach):
            if extract_impl is None:
                return run(output_dir, configs, "https://example.com", "test-token")
            with patch("connectors.keboola.extractor._extract_via_extension", side_effect=extract_impl):
                return run(output_dir, configs, "https://example.com", "test-token")

    def test_creates_extract_duckdb(self, output_dir, sample_configs):
        """run() produces extract.duckdb with one _meta row per table."""
        outcome = self._run_with_extension(
            output_dir,
            sample_configs,
            lambda conn, tc, pq_path: _write_parquet(pq_path),
        )
        assert outcome["tables_extracted"] == 2
        assert outcome["tables_failed"] == 0

        # The database file must exist and carry the expected metadata.
        extract_db = Path(output_dir) / "extract.duckdb"
        assert extract_db.exists()
        db = duckdb.connect(str(extract_db))
        try:
            meta_rows = db.execute("SELECT * FROM _meta ORDER BY table_name").fetchall()
            assert len(meta_rows) == 2
            assert {r[0] for r in meta_rows} == {"orders", "customers"}
            # Sixth column of _meta is query_mode; every table is local here.
            assert {r[5] for r in meta_rows} == {"local"}
        finally:
            db.close()

    def test_remote_tables_not_downloaded(self, output_dir):
        """Tables with query_mode='remote' are registered but never downloaded."""
        remote_only = [{
            "name": "big_table",
            "query_mode": "remote",
            "description": "Too large to sync",
        }]
        outcome = self._run_with_extension(output_dir, remote_only)
        assert outcome["tables_extracted"] == 1

        db = duckdb.connect(str(Path(output_dir) / "extract.duckdb"))
        try:
            mode_row = db.execute("SELECT query_mode FROM _meta WHERE table_name='big_table'").fetchone()
            assert mode_row[0] == "remote"
        finally:
            db.close()

        # No parquet file should have been written for the remote table.
        assert not (Path(output_dir) / "data" / "big_table.parquet").exists()

    def test_handles_extraction_failure(self, output_dir, sample_configs):
        """One failing table must not prevent the remaining tables from extracting."""
        attempts = []

        def flaky_extract(conn, tc, pq_path):
            attempts.append(pq_path)
            if len(attempts) == 1:
                # Only the very first extraction attempt blows up.
                raise Exception("Network error")
            _write_parquet(pq_path, "SELECT 1 AS id")

        outcome = self._run_with_extension(output_dir, sample_configs, flaky_extract)
        assert outcome["tables_extracted"] == 1
        assert outcome["tables_failed"] == 1
        assert len(outcome["errors"]) == 1

    def test_creates_data_directory(self, output_dir, sample_configs):
        """run() creates the data/ subdirectory holding the parquet files."""
        self._run_with_extension(
            output_dir,
            sample_configs,
            lambda conn, tc, pq_path: _write_parquet(pq_path, "SELECT 1 AS id"),
        )
        data_root = Path(output_dir) / "data"
        assert data_root.is_dir()
        assert (data_root / "orders.parquet").exists()

    def test_views_queryable(self, output_dir):
        """Views registered in extract.duckdb return the extracted rows."""
        single = [{"name": "test_table", "query_mode": "local", "description": "Test"}]
        self._run_with_extension(
            output_dir,
            single,
            lambda conn, tc, pq_path: _write_parquet(pq_path, "SELECT 42 AS value, 'hello' AS msg"),
        )
        db = duckdb.connect(str(Path(output_dir) / "extract.duckdb"))
        try:
            fetched = db.execute("SELECT value, msg FROM test_table").fetchone()
            assert fetched[0] == 42
            assert fetched[1] == "hello"
        finally:
            db.close()

    def test_meta_table_schema(self, output_dir):
        """_meta exposes exactly the expected columns, in order."""
        single = [{"name": "t", "query_mode": "local", "description": "desc"}]
        self._run_with_extension(
            output_dir,
            single,
            lambda conn, tc, pq_path: _write_parquet(pq_path, "SELECT 1 AS x"),
        )
        db = duckdb.connect(str(Path(output_dir) / "extract.duckdb"))
        try:
            column_rows = db.execute("SELECT column_name FROM information_schema.columns WHERE table_name='_meta' ORDER BY ordinal_position").fetchall()
            assert [c[0] for c in column_rows] == ["table_name", "description", "rows", "size_bytes", "extracted_at", "query_mode"]
        finally:
            db.close()

    def test_legacy_fallback_when_extension_unavailable(self, output_dir):
        """The legacy client path is taken when the extension cannot be attached."""
        from connectors.keboola.extractor import run

        single = [{"name": "t", "id": "in.c-test.t", "query_mode": "local", "description": ""}]

        def legacy_extract(tc, pq_path, url, token):
            _write_parquet(pq_path, "SELECT 1 AS id")

        # Report the extension as unavailable so run() has to fall back.
        with patch("connectors.keboola.extractor._try_attach_extension", return_value=False):
            with patch("connectors.keboola.extractor._extract_via_legacy", side_effect=legacy_extract):
                outcome = run(output_dir, single, "https://example.com", "test-token")

        assert outcome["tables_extracted"] == 1
        assert outcome["tables_failed"] == 0

188
tests/test_orchestrator.py Normal file
View file

@ -0,0 +1,188 @@
"""Tests for SyncOrchestrator."""
import os
from pathlib import Path
import duckdb
import pytest
@pytest.fixture
def setup_env(tmp_path, monkeypatch):
    """Point DATA_DIR at a temporary tree and return the paths tests need.

    Creates ``extracts/``, ``analytics/`` and ``state/`` under *tmp_path*.
    ``monkeypatch`` restores the previous DATA_DIR value (or its absence)
    after the test, including when setup itself fails part-way.

    Returns:
        dict with ``data_dir`` and ``extracts_dir`` (paths) and
        ``analytics_db`` (string path to ``analytics/server.duckdb``).
    """
    monkeypatch.setenv("DATA_DIR", str(tmp_path))

    extracts_dir = tmp_path / "extracts"
    extracts_dir.mkdir()
    analytics_dir = tmp_path / "analytics"
    analytics_dir.mkdir()
    state_dir = tmp_path / "state"
    state_dir.mkdir()

    return {
        "data_dir": tmp_path,
        "extracts_dir": extracts_dir,
        "analytics_db": str(analytics_dir / "server.duckdb"),
    }
def _create_mock_extract(extracts_dir: Path, source_name: str, tables: list[dict]):
    """Create a mock extract.duckdb with _meta and one table per entry.

    Args:
        extracts_dir: Directory in which ``<source_name>/`` is created.
        source_name: Name of the source sub-directory.
        tables: One dict per table. ``name`` is required; ``data`` (list of
            row dicts), ``query_mode`` (default ``"local"``) and
            ``description`` (default ``""``) are optional.
    """
    source_dir = extracts_dir / source_name
    source_dir.mkdir()
    data_dir = source_dir / "data"
    data_dir.mkdir()
    db_path = source_dir / "extract.duckdb"

    conn = duckdb.connect(str(db_path))
    try:
        conn.execute(
            """CREATE TABLE _meta (
                table_name VARCHAR, description VARCHAR, rows BIGINT,
                size_bytes BIGINT, extracted_at TIMESTAMP,
                query_mode VARCHAR DEFAULT 'local'
            )"""
        )
        for t in tables:
            name = t["name"]
            rows_data = t.get("data", [])
            query_mode = t.get("query_mode", "local")

            # Create an actual table (simulating what a view on parquet would look like)
            if rows_data:
                cols = ", ".join(f"{k} VARCHAR" for k in rows_data[0])
                conn.execute(f'CREATE TABLE "{name}" ({cols})')
                for row in rows_data:
                    # Bind values as parameters so quotes inside a value
                    # cannot break the statement; str() keeps every value
                    # stored as text, as the VARCHAR columns expect.
                    values = [str(v) for v in row.values()]
                    placeholders = ", ".join("?" for _ in values)
                    conn.execute(
                        f'INSERT INTO "{name}" VALUES ({placeholders})', values
                    )
            else:
                conn.execute(f'CREATE TABLE "{name}" (id VARCHAR)')

            conn.execute(
                "INSERT INTO _meta VALUES (?, ?, ?, ?, current_timestamp, ?)",
                [name, t.get("description", ""), len(rows_data), 0, query_mode],
            )
    finally:
        # Release the database file even if building the fixture fails.
        conn.close()
class TestSyncOrchestrator:
    """Tests for rebuilding the analytics database from extract.duckdb files."""

    @staticmethod
    def _orchestrator(env):
        """Build a SyncOrchestrator bound to the test analytics database."""
        from src.orchestrator import SyncOrchestrator

        return SyncOrchestrator(analytics_db_path=env["analytics_db"])

    def test_rebuild_empty_extracts(self, setup_env):
        """With no extracts present, rebuild() returns an empty mapping."""
        orch = self._orchestrator(setup_env)
        assert orch.rebuild() == {}

    def test_rebuild_single_source(self, setup_env):
        """A single source is reported with all of its tables."""
        orch = self._orchestrator(setup_env)
        _create_mock_extract(
            setup_env["extracts_dir"],
            "keboola",
            [
                {"name": "orders", "data": [{"id": "1", "total": "100"}]},
                {"name": "customers", "data": [{"id": "1", "name": "Alice"}]},
            ],
        )
        rebuilt = orch.rebuild()
        assert "keboola" in rebuilt
        assert set(rebuilt["keboola"]) == {"orders", "customers"}

        # Verify views work when source is attached (as the orchestrator leaves it)
        # Open a fresh connection and re-attach to simulate how the analytics DB is used
        extract_path = setup_env["extracts_dir"] / "keboola" / "extract.duckdb"
        analytics = duckdb.connect(setup_env["analytics_db"])
        try:
            analytics.execute(f"ATTACH '{extract_path}' AS keboola (READ_ONLY)")
            fetched = analytics.execute("SELECT total FROM orders WHERE id='1'").fetchone()
            assert fetched[0] == "100"
        finally:
            analytics.close()

    def test_rebuild_multiple_sources(self, setup_env):
        """Every source directory with an extract shows up in the result."""
        orch = self._orchestrator(setup_env)
        extracts = setup_env["extracts_dir"]
        _create_mock_extract(extracts, "keboola", [{"name": "orders", "data": [{"id": "1"}]}])
        _create_mock_extract(extracts, "jira", [{"name": "issues", "data": [{"key": "PROJ-1"}]}])
        rebuilt = orch.rebuild()
        assert "keboola" in rebuilt
        assert "jira" in rebuilt

    def test_rebuild_skips_missing_extract_db(self, setup_env):
        """A source directory without extract.duckdb is left out of the result."""
        orch = self._orchestrator(setup_env)
        extracts = setup_env["extracts_dir"]
        # Create directory without extract.duckdb
        (extracts / "broken").mkdir()
        _create_mock_extract(extracts, "keboola", [{"name": "orders", "data": [{"id": "1"}]}])
        rebuilt = orch.rebuild()
        assert "broken" not in rebuilt
        assert "keboola" in rebuilt

    def test_rebuild_source_single(self, setup_env):
        """rebuild_source() returns the tables of the requested source."""
        orch = self._orchestrator(setup_env)
        _create_mock_extract(
            setup_env["extracts_dir"],
            "jira",
            [{"name": "issues", "data": [{"key": "PROJ-1"}]}],
        )
        assert "issues" in orch.rebuild_source("jira")

    def test_rebuild_source_nonexistent(self, setup_env):
        """rebuild_source() on an unknown source returns an empty list."""
        orch = self._orchestrator(setup_env)
        assert orch.rebuild_source("nonexistent") == []

    def test_rebuild_with_remote_tables(self, setup_env):
        """Tables flagged query_mode='remote' are still reported by rebuild()."""
        orch = self._orchestrator(setup_env)
        remote_table = {
            "name": "page_views",
            "query_mode": "remote",
            "data": [{"url": "/home"}],
        }
        _create_mock_extract(setup_env["extracts_dir"], "bigquery", [remote_table])
        rebuilt = orch.rebuild()
        assert "bigquery" in rebuilt
        assert "page_views" in rebuilt["bigquery"]

    def test_rebuild_idempotent(self, setup_env):
        """Two consecutive rebuilds yield the same result."""
        orch = self._orchestrator(setup_env)
        _create_mock_extract(
            setup_env["extracts_dir"],
            "keboola",
            [{"name": "orders", "data": [{"id": "1"}]}],
        )
        first = orch.rebuild()
        second = orch.rebuild()
        assert first == second

View file

@ -293,6 +293,57 @@ class TestTableRegistryRepository:
repo.unregister("t1") repo.unregister("t1")
assert repo.get("t1") is None assert repo.get("t1") is None
def test_register_with_source_fields(self, db_conn):
    """Source-related fields passed to register() are returned by get()."""
    from src.repositories.table_registry import TableRegistryRepository

    registry = TableRegistryRepository(db_conn)
    source_fields = {
        "source_type": "keboola",
        "bucket": "in.c-crm",
        "source_table": "company",
        "query_mode": "local",
        "sync_schedule": "every 15m",
    }
    registry.register(
        id="in.c-crm.company", name="company",
        profile_after_sync=True, **source_fields,
    )

    stored = registry.get("in.c-crm.company")
    for column, expected in source_fields.items():
        assert stored[column] == expected
    # Identity check: the flag must come back as a real boolean.
    assert stored["profile_after_sync"] is True
def test_list_by_source(self, db_conn):
    """list_by_source() returns only the tables of the requested source type."""
    from src.repositories.table_registry import TableRegistryRepository

    registry = TableRegistryRepository(db_conn)
    seed = (
        ("t1", "A", "keboola"),
        ("t2", "B", "bigquery"),
        ("t3", "C", "keboola"),
    )
    for table_id, table_name, source in seed:
        registry.register(id=table_id, name=table_name, source_type=source)

    keboola_tables = registry.list_by_source("keboola")
    assert len(keboola_tables) == 2
    assert all(entry["source_type"] == "keboola" for entry in keboola_tables)

    bigquery_tables = registry.list_by_source("bigquery")
    assert len(bigquery_tables) == 1
def test_list_local(self, db_conn):
    """list_local() returns local tables, optionally narrowed by source type."""
    from src.repositories.table_registry import TableRegistryRepository

    registry = TableRegistryRepository(db_conn)
    seed = (
        ("t1", "A", "keboola", "local"),
        ("t2", "B", "bigquery", "remote"),
        ("t3", "C", "keboola", "local"),
    )
    for table_id, table_name, source, mode in seed:
        registry.register(
            id=table_id, name=table_name, source_type=source, query_mode=mode
        )

    all_local = registry.list_local()
    assert len(all_local) == 2

    keboola_local = registry.list_local(source_type="keboola")
    assert len(keboola_local) == 2
def test_register_bigquery_remote(self, db_conn):
    """A remote BigQuery table keeps query_mode and a False profiling flag."""
    from src.repositories.table_registry import TableRegistryRepository

    registry = TableRegistryRepository(db_conn)
    table_id = "project.dataset.orders"
    registry.register(
        id=table_id,
        name="orders",
        source_type="bigquery",
        bucket="dataset",
        source_table="orders",
        query_mode="remote",
        profile_after_sync=False,
    )

    stored = registry.get(table_id)
    assert stored["query_mode"] == "remote"
    # Identity check: False must be stored as a boolean, not NULL or 0.
    assert stored["profile_after_sync"] is False
# ---- Profiles ---- # ---- Profiles ----