"""Keboola extractor — produces extract.duckdb + data/*.parquet using DuckDB Keboola extension.""" import logging import os from datetime import datetime, timezone from pathlib import Path from typing import List, Dict, Any, Optional import duckdb from src.identifier_validation import ( is_safe_quoted_identifier, validate_identifier, validate_quoted_identifier, ) logger = logging.getLogger(__name__) def materialize_query( table_id: str, *, bucket: str, source_table: str, source_query: Optional[str] = None, storage_client=None, # KeboolaStorageClient (avoid circular import) keboola_url: Optional[str] = None, keboola_token: Optional[str] = None, output_dir: Path, ) -> dict: """Materialize a Keboola Storage table to a local parquet via Storage API. Replaces the previous DuckDB-extension path. The extension's QueryService scan is unreliable on linked-bucket projects (keboola/duckdb-extension#17; fix shipped upstream as v0.1.6 but not yet in the community CDN, and on flag-restricted projects the pre-fix workspace role wouldn't have GRANTs on the bucket schema anyway). The Storage API export-async path always works regardless of project flags. Parallel of `connectors/bigquery/extractor.py:materialize_query` in surface — same return shape, same atomic write, same MD5 contract — but the inputs differ because Keboola's structured filter spec replaces BQ's free-form SQL. Args: table_id: parquet filename + sync_state key (must be a safe ident). bucket: Keboola bucket id, e.g. ``in.c-crm``. source_table: table id within the bucket, e.g. ``orders``. source_query: optional JSON string with a Storage API filter spec (see `storage_api.ExportFilter`). Empty / NULL = full table. storage_client: pre-built `KeboolaStorageClient` (preferred — lets sync.py share one across rows). When omitted, ``keboola_url`` and ``keboola_token`` are used to construct a one-shot client. keboola_url, keboola_token: alternative to ``storage_client`` for single-call usage (tests, ad-hoc). output_dir: directory to write `.parquet`. Returns: ``{"table_id", "path", "rows", "bytes", "md5"}`` — same shape the BQ branch returns, so ``app/api/sync.py:_run_materialized_pass`` downstream code stays uniform. """ import re import hashlib import json import duckdb if not re.fullmatch(r"[A-Za-z_][A-Za-z0-9_]*", table_id): raise ValueError(f"unsafe table_id for materialize: {table_id!r}") # Lazy import to avoid pulling `requests` at module import time when only # the sync trigger imports `extractor` for `run()`. from connectors.keboola.storage_api import ( FILE_TYPE_CSV, FILE_TYPE_PARQUET, ExportFilter, KeboolaStorageClient, ) if storage_client is None: if not (keboola_url and keboola_token): raise ValueError( "materialize_query requires either storage_client or " "(keboola_url + keboola_token)" ) storage_client = KeboolaStorageClient(url=keboola_url, token=keboola_token) # Filter spec is optional. Admin can register a row with no # source_query at all (= full-table export), or with a JSON object # describing whereFilters / columns / changedSince / file_type. payload: dict = {} if source_query: try: payload = json.loads(source_query) except json.JSONDecodeError as e: raise ValueError( f"source_query for {table_id} is not valid JSON: {e}" ) from e export_filter = ExportFilter.from_dict(payload) # Default the materialized path to parquet — Storage API serves it # via native Snowflake UNLOAD, the extractor renames it into place, # no CSV intermediate, no DuckDB COPY, no peak-memory load. Admin # can pin `{"file_type":"csv"}` in source_query to fall back (legacy # debugging, or projects whose backend can't UNLOAD parquet — none # known today, but the escape hatch costs nothing). Only override # when the admin spec didn't *explicitly* set a file_type. if "file_type" not in payload and "fileType" not in payload: export_filter.file_type = FILE_TYPE_PARQUET output_dir = Path(output_dir) output_dir.mkdir(parents=True, exist_ok=True) parquet_path = output_dir / f"{table_id}.parquet" tmp_parquet = output_dir / f"{table_id}.parquet.tmp" if tmp_parquet.exists(): tmp_parquet.unlink() # Per-call temp dir for the intermediate file (CSV or parquet) — # separates concurrent exports cleanly without the os.chdir() race # the kbcstorage SDK has. ``ignore_cleanup_errors=True`` keeps # disk-full / permission errors from masking the original # exception, and prevents a half-cleaned dir from sitting around # forever (a 12 GiB stale slice tree was seen after a worker died # mid-write on a saturated boot disk). ``dir=get_temp_root()`` # routes to ``AGNES_TEMP_DIR`` when the operator has steered # tempfiles off the overlayfs (e.g. onto the data disk) — see # storage_api.get_temp_root for the rationale. import tempfile from connectors.keboola.storage_api import get_temp_root with tempfile.TemporaryDirectory( prefix=f"kbc-export-{table_id}-", dir=get_temp_root(), ignore_cleanup_errors=True, ) as tmpdir: full_table_id = f"{bucket}.{source_table}" if export_filter.file_type == FILE_TYPE_PARQUET: # Native parquet path. Storage API serves Snowflake UNLOAD # output directly. Two shapes to handle: # # 1. **Single file** (small exports): file_info.url points at # one signed URL; download to tmp_parquet and we're done. # 2. **Sliced** (large exports — Snowflake UNLOAD respects # MAX_FILE_SIZE, default 16 MiB, so anything past that # arrives as a manifest of N parquet slices). Each slice # is itself a complete parquet file with its own footer; # naively concatenating them like CSV would be invalid. # We download all slices into the per-call tempdir, then # DuckDB-COPY across `read_parquet([slice1, slice2, ...])` # into one consolidated tmp_parquet. DuckDB streams row # groups during this consolidation — peak memory is one # row group (~1 MiB), not the full table. stats = storage_client.prepare_export( full_table_id, export_filter=export_filter, ) file_info = stats["file_info"] if file_info.get("isSliced"): slice_dir = Path(tmpdir) / "slices" slice_paths = storage_client.download_file_slices( file_info, slice_dir ) if not slice_paths: raise RuntimeError( f"sliced parquet export for {full_table_id} " f"yielded no slices" ) quoted = ", ".join( "'" + str(p).replace("'", "''") + "'" for p in slice_paths ) safe_tmp = str(tmp_parquet).replace("'", "''") conv = duckdb.connect() try: conv.execute( f"COPY (SELECT * FROM read_parquet([{quoted}])) " f"TO '{safe_tmp}' (FORMAT PARQUET)" ) finally: conv.close() else: storage_client.download_file(file_info, tmp_parquet) stats["bytes"] = ( tmp_parquet.stat().st_size if tmp_parquet.exists() else 0 ) if not tmp_parquet.exists() or tmp_parquet.stat().st_size == 0: logger.warning( "Storage API parquet export for %s returned no data " "(filter may be too restrictive)", full_table_id, ) # Empty placeholder parquet so the orchestrator doesn't # choke on a missing file. duckdb.connect().execute( f"COPY (SELECT 1 AS _empty WHERE FALSE) TO '{tmp_parquet}' (FORMAT PARQUET)" ).close() else: # Legacy CSV path. Kept for the explicit `{"file_type":"csv"}` # opt-in. Slower (CSV parse + parquet rewrite) and # memory-heavier (DuckDB pulls the CSV into a buffer with # max_line_size headroom), but doesn't depend on Storage # API parquet support if a future project backend lacks it. csv_path = Path(tmpdir) / f"{table_id}.csv" stats = storage_client.export_table( full_table_id, csv_path, export_filter=export_filter, ) if not csv_path.exists() or csv_path.stat().st_size == 0: logger.warning( "Storage API CSV export for %s returned no data " "(filter may be too restrictive)", full_table_id, ) duckdb.connect().execute( f"COPY (SELECT 1 AS _empty WHERE FALSE) TO '{tmp_parquet}' (FORMAT PARQUET)" ).close() else: # CSV → parquet via DuckDB. `all_varchar=True` matches the # legacy client's behavior — preserves the source's exact # character data without DuckDB's type inference rewriting # numeric-looking strings (e.g. "Non-Manager") as NULL. # # `max_line_size=64MB` overrides DuckDB's default 2 MB cap # on any single CSV line. Keboola tables that store # embedded JSON / SQL transformation bodies routinely # have multi-MB cells (e.g. `kbc_component_configuration` # rows ship full Snowflake transformation SQL inline as # a JSON column value); the default 2 MB ceiling rejects # them with `Maximum line size of 2000000 bytes # exceeded`. 64 MB is generous enough to absorb any # reasonable embedded blob; DuckDB allocates a single # buffer of this size per worker thread. safe_csv = str(csv_path).replace("'", "''") safe_tmp = str(tmp_parquet).replace("'", "''") try: conv = duckdb.connect() conv.execute( f"COPY (SELECT * FROM read_csv('{safe_csv}', " f"all_varchar=true, max_line_size=67108864)) " f"TO '{safe_tmp}' (FORMAT PARQUET)" ) conv.close() except Exception: if tmp_parquet.exists(): tmp_parquet.unlink() raise # Row count from the parquet, not from `stats["rows"]` — Storage API # sometimes omits totalRowsCount on small results, and the parquet is # the authoritative count we'll be serving downstream anyway. safe_tmp = str(tmp_parquet).replace("'", "''") cnt_conn = duckdb.connect() try: row_count = cnt_conn.execute( f"SELECT COUNT(*) FROM read_parquet('{safe_tmp}')" ).fetchone()[0] finally: cnt_conn.close() # Streaming MD5 — bounded memory regardless of parquet size. h = hashlib.md5() with open(tmp_parquet, "rb") as f: for chunk in iter(lambda: f.read(8192), b""): h.update(chunk) md5 = h.hexdigest() size = tmp_parquet.stat().st_size os.replace(tmp_parquet, parquet_path) if row_count == 0: logger.warning( "Materialized Keboola export for %s wrote 0 rows — verify the " "filter and that the source bucket has data.", table_id, ) return { "table_id": table_id, "path": str(parquet_path), "rows": row_count, "bytes": size, "md5": md5, } def _create_meta_table(conn: duckdb.DuckDBPyConnection) -> None: """Create the _meta table required by the extract.duckdb contract.""" conn.execute("DROP TABLE IF EXISTS _meta") conn.execute("""CREATE TABLE _meta ( table_name VARCHAR NOT NULL, description VARCHAR, rows BIGINT, size_bytes BIGINT, extracted_at TIMESTAMP, query_mode VARCHAR DEFAULT 'local' )""") def _create_remote_attach_table(conn: duckdb.DuckDBPyConnection, keboola_url: str) -> None: """Write _remote_attach so orchestrator can re-ATTACH the Keboola extension.""" conn.execute("DROP TABLE IF EXISTS _remote_attach") conn.execute("""CREATE TABLE _remote_attach ( alias VARCHAR, extension VARCHAR, url VARCHAR, token_env VARCHAR )""") conn.execute( "INSERT INTO _remote_attach VALUES (?, ?, ?, ?)", ["kbc", "keboola", keboola_url, "KEBOOLA_STORAGE_TOKEN"], ) def _try_attach_extension(conn: duckdb.DuckDBPyConnection, keboola_url: str, keboola_token: str) -> bool: """Try to install and attach the Keboola DuckDB extension. Returns True on success.""" try: conn.execute("INSTALL keboola FROM community; LOAD keboola;") escaped_token = keboola_token.replace("'", "''") # Strip trailing slash — the Keboola DuckDB extension's ATTACH fails # with a network error when the URL ends in `/` (e.g. the canonical # `https://connection.us-east4.gcp.keboola.com/` form). Bare host # works. attach_url = keboola_url.rstrip("/") conn.execute(f"ATTACH '{attach_url}' AS kbc (TYPE keboola, TOKEN '{escaped_token}')") logger.info("Using DuckDB Keboola extension") return True except Exception as e: logger.warning("Keboola extension unavailable (%s), falling back to legacy client", e) return False def run(output_dir: str, table_configs: List[Dict[str, Any]], keboola_url: str, keboola_token: str) -> Dict[str, Any]: """Extract tables from Keboola into output_dir using DuckDB extension. Args: output_dir: Path to write extract.duckdb + data/ table_configs: List of table config dicts from table_registry keboola_url: Keboola stack URL keboola_token: Keboola Storage API token Returns: Dict with extraction stats: {tables_extracted: int, tables_failed: int, errors: list} """ output_path = Path(output_dir) data_dir = output_path / "data" data_dir.mkdir(parents=True, exist_ok=True) # Write to temp file then rename — avoids lock conflict with orchestrator # which may hold a read lock on the existing extract.duckdb db_path = output_path / "extract.duckdb" tmp_db_path = output_path / "extract.duckdb.tmp" if tmp_db_path.exists(): tmp_db_path.unlink() conn = duckdb.connect(str(tmp_db_path)) stats = {"tables_extracted": 0, "tables_failed": 0, "errors": []} now = datetime.now(timezone.utc) # Per-table workitems whose extension scan failed and need the legacy # Storage-API fallback. Drained in a parallel pool below the per-table # serial loop. Items are `(tc, pq_path)` tuples. legacy_queue: List[tuple] = [] try: # Try DuckDB Keboola extension use_extension = _try_attach_extension(conn, keboola_url, keboola_token) _create_meta_table(conn) has_remote = any(tc.get("query_mode") == "remote" for tc in table_configs) if has_remote and use_extension: _create_remote_attach_table(conn, keboola_url) for tc in table_configs: table_name = tc["name"] query_mode = tc.get("query_mode", "local") # Materialized rows are written by the sync trigger pass via # `materialize_query()` — they live as parquets in # /data/extracts/keboola/data/, picked up by the orchestrator's # standard local-parquet discovery. Don't extract here (would # double-write data via the source bucket reference and confuse # sync_state bookkeeping). Mirror of the BQ extractor's skip at # connectors/bigquery/extractor.py:190. if query_mode == "materialized": logger.info( "Skipping legacy extract for %s — query_mode='materialized', " "handled by _run_materialized_pass instead", tc.get("id") or tc.get("name"), ) continue # #81 Group D — refuse rows whose identifiers don't pass the # whitelist. The registry is admin-controlled but anyone with # write access can otherwise inject SQL via the CREATE VIEW / # COPY / SELECT interpolation below. Skip-and-continue rather # than crashing the whole extraction; valid rows still process. # # `table_name` is the DuckDB view name in the master # analytics DB. The orchestrator uses the STRICT validator # (`^[a-zA-Z_][a-zA-Z0-9_]{0,63}$`) when re-creating views, # so any name with `-` or `.` would pass extraction here # but be silently dropped at orchestrator-rebuild time. # Use the strict validator here too so the failure is # caught early and visible in tables_failed. if not validate_identifier(table_name, "Keboola table_name"): stats["tables_failed"] += 1 stats["errors"].append({"table": table_name, "error": "unsafe identifier"}) continue if query_mode == "remote": # Create view pointing to kbc extension (requires re-ATTACH at query time) bucket = tc.get("bucket", "") source_table = tc.get("source_table", table_name) if not ( validate_quoted_identifier(bucket, "Keboola bucket") and validate_quoted_identifier(source_table, "Keboola source_table") ): stats["tables_failed"] += 1 stats["errors"].append({"table": table_name, "error": "unsafe bucket/source_table"}) continue if use_extension and bucket: conn.execute( f'CREATE OR REPLACE VIEW "{table_name}" AS SELECT * FROM kbc."{bucket}"."{source_table}"' ) conn.execute( "INSERT INTO _meta VALUES (?, ?, 0, 0, ?, 'remote')", [table_name, tc.get("description", ""), now], ) stats["tables_extracted"] += 1 continue try: pq_path = str(data_dir / f"{table_name}.parquet") if use_extension: try: _extract_via_extension(conn, tc, pq_path) except Exception as ext_err: # ATTACH succeeded but the per-table COPY failed — # most commonly a Keboola QueryService permission error # (`Schema '..."in.c-..."' does not exist or not # authorized`, see keboola/duckdb-extension#17). The # legacy Storage-API client doesn't go through # QueryService at all, so queue for the parallel # legacy fallback below. logger.warning( "Keboola extension scan failed for %s (%s); queued for legacy Storage-API fallback", table_name, ext_err, ) legacy_queue.append((tc, pq_path)) continue else: legacy_queue.append((tc, pq_path)) continue # Extension path succeeded — register _meta synchronously. _register_local_meta(conn, tc, pq_path, now) stats["tables_extracted"] += 1 rows_log = conn.execute( f"SELECT count(*) FROM read_parquet('{pq_path.replace(chr(39), chr(39)*2)}')" ).fetchone()[0] logger.info("Extracted %s via extension: %d rows", table_name, rows_log) except Exception as e: logger.error("Failed to extract %s: %s", table_name, e) stats["tables_failed"] += 1 stats["errors"].append({"table": table_name, "error": str(e)}) # Detach Keboola if extension was used if use_extension: try: conn.execute("DETACH kbc") except Exception: pass # Phase 2: legacy fallback in parallel. Keboola Storage API export # jobs are independent per table — a worker pool of N workers fans # out the per-table HTTP roundtrips (export job submit + poll + # CSV download) instead of stacking them sequentially. Project-level # concurrency is bounded by the storage.jobsParallelism limit # (typically 10); default to 4 to leave headroom for other clients. # Override via AGNES_KEBOOLA_PARALLELISM env var. # # Workers are PROCESSES, not threads — `connectors/keboola/client.py: # export_table` does `os.chdir(temp_dir)` to redirect kbcstorage's # slice-file downloads into a per-call temp directory, and `os.chdir` # is process-global. With threads, two parallel exports race on CWD # and slice files end up in the wrong directory; the merge step then # fails with `[Errno 2] No such file or directory: # '.csv_X_Y_Z.csv'`. ProcessPoolExecutor gives each worker # its own process and therefore its own CWD. if legacy_queue: parallelism = max(1, int(os.environ.get("AGNES_KEBOOLA_PARALLELISM", "8"))) workers = min(parallelism, len(legacy_queue)) logger.info( "Running legacy Storage-API fallback for %d tables across %d worker processes", len(legacy_queue), workers, ) if workers == 1: legacy_results = [_legacy_worker(item, keboola_url, keboola_token) for item in legacy_queue] else: from concurrent.futures import ProcessPoolExecutor with ProcessPoolExecutor(max_workers=workers) as ex: futures = [ex.submit(_legacy_worker, item, keboola_url, keboola_token) for item in legacy_queue] legacy_results = [f.result() for f in futures] # Phase 3: serial _meta insert for legacy results. DuckDB conn # isn't thread-safe, so we collect parallel work and only touch # `conn` (and `stats`) here on the main thread. for tc_, pq_, err in legacy_results: tn = tc_["name"] if err is not None: logger.error("Failed to extract %s via legacy: %s", tn, err) stats["tables_failed"] += 1 stats["errors"].append({"table": tn, "error": err}) continue try: _register_local_meta(conn, tc_, pq_, now) stats["tables_extracted"] += 1 rows_log = conn.execute( f"SELECT count(*) FROM read_parquet('{pq_.replace(chr(39), chr(39)*2)}')" ).fetchone()[0] logger.info("Extracted %s via legacy: %d rows", tn, rows_log) except Exception as e: logger.error("Failed to register _meta for %s: %s", tn, e) stats["tables_failed"] += 1 stats["errors"].append({"table": tn, "error": str(e)}) finally: conn.execute("CHECKPOINT") conn.close() # Atomic replace: swap temp DB into place, cleaning up any WAL files import shutil old_wal = Path(str(db_path) + ".wal") if old_wal.exists(): old_wal.unlink() if tmp_db_path.exists(): shutil.move(str(tmp_db_path), str(db_path)) tmp_wal = Path(str(tmp_db_path) + ".wal") if tmp_wal.exists(): tmp_wal.unlink() return stats def _register_local_meta( conn: duckdb.DuckDBPyConnection, tc: Dict[str, Any], pq_path: str, extracted_at: datetime, ) -> None: """After a parquet has been written for a local-mode table, create the DuckDB view and register the row in `_meta`. Hoisted out of the run() body so both the serial extension-success path and the parallel legacy-result path share one implementation.""" table_name = tc["name"] safe_pq_lit = pq_path.replace("'", "''") rows = conn.execute(f"SELECT count(*) FROM read_parquet('{safe_pq_lit}')").fetchone()[0] size = os.path.getsize(pq_path) conn.execute( f'CREATE OR REPLACE VIEW "{table_name}" AS SELECT * FROM read_parquet(\'{safe_pq_lit}\')' ) conn.execute( "INSERT INTO _meta VALUES (?, ?, ?, ?, ?, 'local')", [table_name, tc.get("description", ""), rows, size, extracted_at], ) def _extract_via_extension(conn: duckdb.DuckDBPyConnection, tc: Dict[str, Any], pq_path: str) -> None: """Extract a table using the DuckDB Keboola extension.""" bucket = tc.get("bucket", "") source_table = tc.get("source_table", tc["name"]) # #81 Group D — defense-in-depth. The caller already validates these; # refuse here too in case a future caller forgets. Use the relaxed # quoted-identifier check that accepts Keboola's `in.c-foo` form. if not (is_safe_quoted_identifier(bucket) and is_safe_quoted_identifier(source_table)): raise ValueError(f"unsafe bucket/source_table: {bucket!r}/{source_table!r}") safe_pq_lit = pq_path.replace("'", "''") conn.execute(f'COPY (SELECT * FROM kbc."{bucket}"."{source_table}") TO \'{safe_pq_lit}\' (FORMAT PARQUET)') def _legacy_worker(tc_pq, keboola_url: str, keboola_token: str): """Module-level wrapper for ProcessPoolExecutor — must be picklable. Returns `(tc, pq_path, error_str_or_None)` so the main process can aggregate results and update _meta serially on its DuckDB connection. """ tc_, pq_ = tc_pq try: _extract_via_legacy(tc_, pq_, keboola_url, keboola_token) return (tc_, pq_, None) except Exception as exc: return (tc_, pq_, str(exc)) def _extract_via_legacy(tc: Dict[str, Any], pq_path: str, keboola_url: str, keboola_token: str) -> None: """Per-table extract via the Storage API export-async path. Despite the name (kept for caller compatibility with `_legacy_worker`), this no longer goes through the `kbcstorage` SDK — it talks to the Storage API directly via `connectors/keboola/storage_api.py`. The old SDK path had a thread-unsafe `os.chdir(temp_dir)` that broke parallel execution; the direct path uses per-call temp directories and signed-URL downloads, so threads / processes don't trip on each other. Same surface as before — `(tc, pq_path, url, token) → writes parquet at pq_path` — so callers (including the parallel `_legacy_worker`) don't need to change. """ import tempfile from connectors.keboola.storage_api import KeboolaStorageClient, get_temp_root bucket = tc.get("bucket", "") source_table = tc.get("source_table", tc["name"]) table_id = f"{bucket}.{source_table}" if bucket else tc.get("id", tc["name"]) with tempfile.TemporaryDirectory( prefix=f"kbc-export-{tc['name']}-", dir=get_temp_root(), ignore_cleanup_errors=True, ) as tmpdir: csv_path = Path(tmpdir) / f"{tc['name']}.csv" client = KeboolaStorageClient(url=keboola_url, token=keboola_token) client.export_table_to_csv(table_id, csv_path) if not csv_path.exists() or csv_path.stat().st_size == 0: # Storage API succeeded but produced no rows. Emit an empty # parquet rather than crashing — same defensive behavior as # `materialize_query`. duckdb.connect().execute( f"COPY (SELECT 1 AS _empty WHERE FALSE) TO '{pq_path}' (FORMAT PARQUET)" ).close() return # all_varchar=true preserves the source's exact character data — # matches what the kbcstorage path used to do, prevents DuckDB # type inference from rewriting numeric-looking strings as NULL. # max_line_size=64MB overrides DuckDB's 2MB default; matches the # materialize_query path. See comment there for rationale. safe_csv = str(csv_path).replace("'", "''") safe_pq = pq_path.replace("'", "''") conv = duckdb.connect() try: conv.execute( f"COPY (SELECT * FROM read_csv('{safe_csv}', " f"all_varchar=true, max_line_size=67108864)) " f"TO '{safe_pq}' (FORMAT PARQUET)" ) finally: conv.close() def compute_exit_code(stats: Dict[str, Any], total: int) -> int: """Map an extraction `stats` dict to a process exit code. Issue #81 Group B: distinguish full success from partial failure so the sync API and CLI consumers can alert on partial vs. full failure rather than treating any non-zero as one bucket. - ``0`` — every table succeeded (or no tables registered). - ``1`` — every table failed (full failure). - ``2`` — at least one succeeded and at least one failed (partial). `total` is the count of tables the extractor was asked to process. `stats["tables_failed"]` is the count it actually failed. """ failed = stats.get("tables_failed", 0) if total == 0: return 0 if failed == 0: return 0 if failed >= total: return 1 return 2 if __name__ == "__main__": """Standalone: reads config from env + table_registry, runs extraction. Used by sync trigger subprocess. Reads KEBOOLA_STORAGE_TOKEN and KEBOOLA_STACK_URL from environment, table list from DuckDB registry. """ from app.logging_config import setup_logging setup_logging(__name__) # Read Keboola credentials — env first, then instance.yaml fallback url = os.environ.get("KEBOOLA_STACK_URL", "") token = os.environ.get("KEBOOLA_STORAGE_TOKEN", "") if not url or not token: try: from config.loader import load_instance_config config = load_instance_config() kbc_config = config.get("keboola", {}) url = url or kbc_config.get("url", "") token_env = kbc_config.get("token_env", "KEBOOLA_STORAGE_TOKEN") token = token or os.environ.get(token_env, "") except Exception: pass if not url or not token: logger.error("Missing KEBOOLA_STACK_URL or KEBOOLA_STORAGE_TOKEN") exit(1) # Read table list from registry from src.db import get_system_db from src.repositories.table_registry import TableRegistryRepository sys_conn = get_system_db() try: repo = TableRegistryRepository(sys_conn) tables = repo.list_by_source("keboola") finally: sys_conn.close() if not tables: logger.warning("No Keboola tables registered in table_registry") exit(0) logger.info("Extracting %d tables from %s", len(tables), url) data_dir = Path(os.environ.get("DATA_DIR", "./data")) result = run(str(data_dir / "extracts" / "keboola"), tables, url, token) logger.info("Extraction complete: %s", result) code = compute_exit_code(result, len(tables)) if code == 2: logger.error("Partial failure: %d of %d tables failed", result.get("tables_failed", 0), len(tables)) elif code == 1: logger.error("All %d tables failed", len(tables)) exit(code)