"""Sync orchestrator — ATTACHes extract.duckdb files into master analytics.duckdb. Remote table support -------------------- Extractors that create views referencing external DuckDB extensions (e.g. Keboola, BigQuery) must include a ``_remote_attach`` table in their extract.duckdb: CREATE TABLE _remote_attach ( alias VARCHAR, -- DuckDB alias used in views, e.g. 'kbc' extension VARCHAR, -- Extension name, e.g. 'keboola' url VARCHAR, -- Connection URL token_env VARCHAR -- Env-var name holding the auth token (NOT the token itself). -- Empty string for BigQuery — orchestrator detects -- extension='bigquery' and refreshes the token from the -- GCE metadata server on its own. ); At rebuild time the orchestrator reads ``_remote_attach``, installs/loads the extension, then either: (a) for BigQuery, fetches a fresh access token from the GCE metadata server and creates a session-scoped DuckDB SECRET before ATTACH; (b) for sources with a non-empty ``token_env``, reads that env var and passes the token inline; (c) ATTACHes without auth. Views referencing ``bq."dataset"."table"`` or ``kbc."bucket"."table"`` then resolve correctly. Note: BQ secrets are session-scoped, so ``src.db._reattach_remote_extensions`` re-fetches the metadata token and re-creates the secret each time a read-only analytics connection is opened. """ import hashlib import logging import os import threading from pathlib import Path from typing import Dict, List, Optional import duckdb from connectors.bigquery.auth import get_metadata_token, BQMetadataAuthError from src.orchestrator_security import ( escape_sql_string_literal, is_builtin_extension, is_extension_allowed, is_token_env_allowed, ) logger = logging.getLogger(__name__) _rebuild_lock = threading.Lock() # Identifier validation lives in src/identifier_validation.py so the # orchestrator and the extractors share the same regex (#81 Group D). # The local names are kept as aliases so existing call sites need no # rename — they import from a single source of truth now. from src.identifier_validation import ( # noqa: E402 _SAFE_IDENTIFIER, # noqa: F401 (re-exported for any historical caller) validate_identifier as _validate_identifier, ) def _atomic_swap_db(tmp_path: str, target_path: str) -> None: """Atomically replace target DuckDB file, cleaning up WAL files.""" import shutil target = Path(target_path) tmp = Path(tmp_path) # Remove old WAL file if it exists old_wal = Path(str(target) + ".wal") if old_wal.exists(): old_wal.unlink() # Move temp DB into place if tmp.exists(): shutil.move(str(tmp), str(target)) # Clean up temp WAL tmp_wal = Path(str(tmp) + ".wal") if tmp_wal.exists(): tmp_wal.unlink() def _get_extracts_dir() -> Path: data_dir = Path(os.environ.get("DATA_DIR", "./data")) return data_dir / "extracts" class SyncOrchestrator: """Scans /data/extracts/*, ATTACHes each extract.duckdb, creates master views.""" def __init__(self, analytics_db_path: str | None = None): # analytics_db_path allows override for testing if analytics_db_path: self._db_path = analytics_db_path else: data_dir = Path(os.environ.get("DATA_DIR", "./data")) self._db_path = str(data_dir / "analytics" / "server.duckdb") Path(self._db_path).parent.mkdir(parents=True, exist_ok=True) def rebuild(self) -> Dict[str, List[str]]: """Scan all extract directories, ATTACH each, create master views. Returns: {source_name: [table_names]} for logging. """ with _rebuild_lock: return self._do_rebuild() def rebuild_source(self, source_name: str) -> List[str]: """Rebuild views from a single source (e.g. after Jira webhook).""" with _rebuild_lock: return self._do_rebuild_source(source_name) def _scan_meta_pairs(self, extracts_dir: Path) -> tuple: """Read every connector's `_meta` and return (pairs, clean) where: - ``pairs`` — list of (source_name, table_name) tuples successfully gathered from `_meta`. - ``clean`` — True iff every source's pre-scan succeeded. False if any source's `_meta` couldn't be read (transient I/O, mid-write, missing/corrupt extract.duckdb). Used by view_ownership.reconcile to release stale claims before the main rebuild loop tries to claim new names. The ``clean`` flag guards against a correctness bug: if source B's pre-scan fails and we naively reconcile against an incomplete `pairs` list, B's prior ownership is dropped, and another source could claim B's name in the same rebuild — a silent overwrite, exactly what Group C is meant to prevent. Callers MUST skip reconcile when ``clean`` is False; per-row claim-time collision detection still catches actual collisions. """ pairs: List[tuple] = [] clean = True for ext_dir in sorted(extracts_dir.iterdir()): if not ext_dir.is_dir(): continue db_file = ext_dir / "extract.duckdb" if not db_file.exists(): continue if not _validate_identifier(ext_dir.name, "source_name"): continue try: ro_conn = duckdb.connect(str(db_file), read_only=True) try: rows = ro_conn.execute( "SELECT table_name FROM _meta" ).fetchall() for (table_name,) in rows: if _validate_identifier(table_name, "table_name"): pairs.append((ext_dir.name, table_name)) finally: ro_conn.close() except Exception as e: logger.warning( "scan_meta_pairs: failed to read %s (%s) — " "skipping reconcile this rebuild to avoid releasing " "ownerships prematurely", ext_dir.name, e, ) clean = False return pairs, clean def _do_rebuild(self) -> Dict[str, List[str]]: extracts_dir = _get_extracts_dir() if not extracts_dir.exists(): logger.warning("Extracts directory %s does not exist", extracts_dir) return {} # Issue #81 Group C — load view ownership map from system DB so we # can detect cross-connector view-name collisions during this # rebuild and refuse to silently overwrite a previously-claimed # name. The map is kept in system.duckdb (analytics.duckdb is # rebuilt fresh each time and would not survive). from src.db import get_system_db from src.repositories.view_ownership import ViewOwnershipRepository sys_conn_for_views = get_system_db() view_repo = None try: view_repo = ViewOwnershipRepository(sys_conn_for_views) # Pre-scan every connector's _meta so we can run the reconcile # pass BEFORE claims are evaluated. This makes "owner stopped # publishing → name freed → another source can claim" work in # the SAME rebuild rather than requiring two consecutive runs. # # Correctness: only reconcile when EVERY source's pre-scan # succeeded. Otherwise a transient I/O failure on source B # would drop B's prior ownership and let another source steal # B's name — silent overwrite, exactly the bug Group C # prevents. Per-row claim-time collision detection still # catches actual collisions even without reconcile this run. current_pairs, pre_scan_clean = self._scan_meta_pairs(extracts_dir) if pre_scan_clean: view_repo.reconcile(current_pairs) else: logger.warning( "view_ownership: skipping reconcile this rebuild — " "pre-scan was incomplete; renamed tables will release " "their names on the next clean rebuild instead" ) existing_owners = view_repo.get_all() except Exception as e: logger.warning( "view_ownership pre-scan failed: %s — proceeding without " "collision detection", e, ) existing_owners = {} view_repo = None try: sys_conn_for_views.close() except Exception: pass sys_conn_for_views = None # Track every (source, view) pair this rebuild successfully claims. claimed_pairs: List[tuple] = [] result = {} # Write to temp file then rename — avoids lock conflict with query endpoint tmp_path = self._db_path + ".tmp" if Path(tmp_path).exists(): Path(tmp_path).unlink() conn = duckdb.connect(tmp_path) try: # Detach any previously attached databases (except main and temp) attached = [ row[0] for row in conn.execute( "SELECT database_name FROM duckdb_databases() " "WHERE database_name NOT IN ('memory', 'system', 'temp')" ).fetchall() ] for db_name in attached: if db_name != Path(self._db_path).stem: try: conn.execute(f"DETACH {db_name}") except Exception: pass for ext_dir in sorted(extracts_dir.iterdir()): if not ext_dir.is_dir(): continue db_file = ext_dir / "extract.duckdb" if not db_file.exists(): logger.debug("Skipping %s — no extract.duckdb", ext_dir.name) continue if not _validate_identifier(ext_dir.name, "source_name"): continue tables = self._attach_and_create_views( conn, ext_dir.name, str(db_file), existing_owners=existing_owners, claimed_pairs=claimed_pairs, view_repo=view_repo if sys_conn_for_views else None, ) if tables: result[ext_dir.name] = tables logger.info("Attached %s: %d tables", ext_dir.name, len(tables)) # No end-of-rebuild reconcile: the pre-scan reconcile above # already released stale ownerships using a complete view of # every source's `_meta`. Reconciling again here against # `claimed_pairs` (which excludes refused collisions and any # source that failed to attach) would incorrectly drop the # legitimate prior owner of a name when its DB happens to be # transiently unreadable. See test # `test_pre_scan_failure_does_not_release_ownership` for the # contract. finally: conn.execute("CHECKPOINT") conn.close() if sys_conn_for_views is not None: try: sys_conn_for_views.close() except Exception: pass # Atomic swap: replace analytics.duckdb with new version _atomic_swap_db(tmp_path, self._db_path) return result def _do_rebuild_source(self, source_name: str) -> List[str]: """Rebuild views for a single source by doing a full rebuild. A full rebuild is necessary because the analytics DB is created fresh each time (temp file + atomic swap). Rebuilding only one source would destroy views from all other sources. """ extracts_dir = _get_extracts_dir() db_file = extracts_dir / source_name / "extract.duckdb" if not db_file.exists(): logger.warning("No extract.duckdb for source %s", source_name) return [] result = self._do_rebuild() return result.get(source_name, []) def _attach_and_create_views( self, conn: duckdb.DuckDBPyConnection, source_name: str, db_path: str, existing_owners: Optional[Dict[str, str]] = None, claimed_pairs: Optional[List[tuple]] = None, view_repo=None, ) -> List[str]: """ATTACH extract.duckdb, read _meta, create views in master. Issue #81 Group C — when ``existing_owners`` and ``view_repo`` are provided, the orchestrator checks for cross-connector view-name collisions and refuses to overwrite a name owned by another source. ``claimed_pairs`` accumulates the (source, view) tuples this rebuild successfully claims; the caller uses it for end-of-rebuild reconcile. """ if existing_owners is None: existing_owners = {} tables = [] try: conn.execute(f"ATTACH '{db_path}' AS {source_name} (READ_ONLY)") # Re-ATTACH external extensions needed by remote views self._attach_remote_extensions(conn, source_name) # Read _meta to know what's available meta_rows = conn.execute( f"SELECT table_name, rows, size_bytes, query_mode " f"FROM {source_name}._meta" ).fetchall() # Pre-fetch the set of names that actually exist as views/tables in # the attached extract.duckdb. Most connectors emit a `_meta` row # alongside an inner view per registered name; the keboola # extractor with `use_extension=False` (and other connectors) # may insert `_meta` rows whose inner view doesn't exist yet — # skip those to avoid creating a master view that would resolve # to nothing. inner_objects = { row[0] for row in conn.execute( "SELECT table_name FROM information_schema.tables " f"WHERE table_catalog='{source_name}'" ).fetchall() } for table_name, rows, size_bytes, query_mode in meta_rows: if not _validate_identifier(table_name, "table_name"): continue if table_name not in inner_objects: # `_meta` row without an inner object. Post-#160 the # BigQuery extractor no longer emits these for unsupported # entity types (it skips both the view AND the _meta row), # so this branch fires for the keboola use_extension=False # path and any future connector that splits writes across # commits. Skip master-view creation; subsequent rows # continue normally. logger.info( "Skipping master view for %s.%s — no inner object", source_name, table_name, ) continue # Issue #81 Group C — refuse cross-connector collisions. # First-come-first-served: the source already in # view_ownership keeps the name; any other source that # tries to claim it gets logged + skipped until the # operator renames one side. Re-claim by the same source # is fine (idempotent rebuild). if view_repo is not None: if not view_repo.claim(table_name, source_name): prior_owner = ( view_repo.get_owner(table_name) or existing_owners.get(table_name, "") ) logger.error( "view_ownership collision: %s already owns view %r; " "%s.%s will NOT be exposed. Rename `name` in the " "table_registry on one side to resolve.", prior_owner, table_name, source_name, table_name, ) continue if claimed_pairs is not None: claimed_pairs.append((source_name, table_name)) try: conn.execute( f"CREATE OR REPLACE VIEW \"{table_name}\" AS " f"SELECT * FROM {source_name}.\"{table_name}\"" ) tables.append(table_name) except Exception as e: # Per-row catch so one bad row doesn't drop the rest of # the source's master views from the rebuild. logger.error( "Failed to create master view for %s.%s: %s", source_name, table_name, e, ) # Filesystem-fallback master views (0.41.0). The 0.40.0 fix in # `materialize_query` tries to register the parquet in # `extract.duckdb`'s `_meta` + inner view, but the open-as- # second-write-handle from the same uvicorn process collides # with the existing read-only ATTACH that `rebuild()` itself # holds (`Unique file handle conflict: Cannot attach "extract" # — already attached by database ""`). The 0.40.0 # helper logs a WARNING and falls through, parquet is # canonical, but the master view never appears via the meta # path. This second pass scans `/data/*.parquet` # directly and creates a master view via `read_parquet()` for # any parquet that didn't already get one through the meta # path. Decoupled from materialize_query's open-handle race; # robust against any registration drift between materialize # and rebuild. try: extracts_dir = _get_extracts_dir() except Exception: extracts_dir = None if extracts_dir is not None: data_dir = extracts_dir / source_name / "data" if data_dir.exists(): # Resolve the set of registry-known table_ids for this # source. The fallback is a master-view recovery path # for parquets that materialize_query wrote but # couldn't register in `_meta`; an **orphan** parquet # (registry row deleted by `DELETE /api/admin/registry` # but parquet not yet cleaned up) must NOT get a # master view — that would resurrect a deleted table. # Pre-existing test `test_orchestrator_skips_orphan_ # parquet_in_extracts` pins this contract. registered_ids: Optional[set] = None try: from src.db import get_system_db from src.repositories.table_registry import ( TableRegistryRepository, ) sys_conn = get_system_db() try: rows = TableRegistryRepository(sys_conn).list_all() # Match parquet stems against registry rows for # THIS source where query_mode='materialized'. # The parquet filename is keyed by registry # `name` (per `_run_materialized_pass` / # `materialize_query` convention). registered_ids = { str(r.get("name")) for r in rows if (r.get("source_type") or "") == source_name and (r.get("query_mode") or "") == "materialized" and r.get("name") } finally: try: sys_conn.close() except Exception: pass except Exception as e: # No registry access (test fixture, transient DB # error) — skip the fallback rather than risk # exposing orphan parquets. logger.warning( "filesystem-fallback: registry read failed (%s); " "skipping fallback scan for %s — orphan parquets " "from a prior DELETE could otherwise be exposed.", e, source_name, ) registered_ids = None if registered_ids is not None: already_created = set(tables) for parquet_path in sorted(data_dir.glob("*.parquet")): table_id = parquet_path.stem if not _validate_identifier(table_id, "fs_fallback table_id"): continue if table_id in already_created: continue # Only register parquets that have a live # materialized registry row. Orphans skip. if table_id not in registered_ids: logger.debug( "filesystem-fallback: skipping orphan " "parquet %s/%s (no registry row)", source_name, table_id, ) continue # view_repo claim — same first-come-first-served # rule as the meta-path branch above. if view_repo is not None: if not view_repo.claim(table_id, source_name): prior_owner = ( view_repo.get_owner(table_id) or existing_owners.get(table_id, "") ) logger.error( "view_ownership collision: %s already owns view %r; " "%s.%s (filesystem-fallback) will NOT be exposed.", prior_owner, table_id, source_name, table_id, ) continue if claimed_pairs is not None: claimed_pairs.append((source_name, table_id)) try: safe_path = str(parquet_path).replace("'", "''") conn.execute( f"CREATE OR REPLACE VIEW \"{table_id}\" AS " f"SELECT * FROM read_parquet('{safe_path}')" ) tables.append(table_id) logger.info( "filesystem-fallback master view created: " "%s/%s (parquet at %s) — meta row was missing", source_name, table_id, parquet_path, ) except Exception as e: logger.error( "filesystem-fallback master view failed for %s/%s: %s", source_name, table_id, e, ) # Update sync_state in system DB self._update_sync_state(meta_rows, source_name) except Exception as e: logger.error("Failed to attach %s: %s", source_name, e) return tables def _attach_remote_extensions( self, conn: duckdb.DuckDBPyConnection, source_name: str ) -> None: """Read _remote_attach from extract.duckdb and ATTACH external sources.""" try: # DuckDB attached-DB layout: ATTACH 'extract.duckdb' AS # exposes information_schema.tables with table_catalog= # and table_schema='main'. The earlier draft used # table_schema= here, which never matched and made # _attach_remote_extensions a silent no-op for every # connector — defeating the entire Group A hardening in # production. db.py:_reattach_remote_extensions already used # the correct column; this aligns the rebuild path. tables = conn.execute( f"SELECT table_name FROM information_schema.tables " f"WHERE table_catalog='{source_name}' AND table_name='_remote_attach'" ).fetchall() if not tables: return except Exception: return rows = conn.execute( f"SELECT alias, extension, url, token_env FROM {source_name}._remote_attach" ).fetchall() for alias, extension, url, token_env in rows: # Identifier sanity (defense against weird input). The hard # security boundary is the allowlist a few lines down. if not _validate_identifier(alias, "remote_attach alias"): continue if not _validate_identifier(extension, "remote_attach extension"): continue # #81 Group A.1 — extension allowlist. The connector does NOT # get to pick what extensions the orchestrator loads. if not is_extension_allowed(extension): logger.error( "Remote attach %s: extension %r is not in the allowlist; refusing. " "Override via AGNES_REMOTE_ATTACH_EXTENSIONS if intended.", alias, extension, ) continue # #81 Group A.2 — token-env hard allowlist. Refuses well-known # runtime secrets (JWT_SECRET_KEY, OPENAI_API_KEY, …) that a # malicious connector might ask us to send to its server. if token_env and not is_token_env_allowed(token_env): logger.error( "Remote attach %s: token_env %r is not in the allowlist; refusing. " "Override via AGNES_REMOTE_ATTACH_TOKEN_ENVS if intended.", alias, token_env, ) continue token = os.environ.get(token_env, "") if token_env else "" if token_env and not token: logger.warning( "Remote attach %s: env var %s not set, skipping", alias, token_env ) continue try: # Skip if already attached (e.g. multiple sources share same extension) attached = { r[0] for r in conn.execute( "SELECT database_name FROM duckdb_databases()" ).fetchall() } if alias in attached: logger.debug("Remote source %s already attached", alias) continue # #81 Group A.1 — built-ins LOAD only; community needs INSTALL+LOAD. if is_builtin_extension(extension): conn.execute(f"LOAD {extension};") else: conn.execute(f"INSTALL {extension} FROM community; LOAD {extension};") # #81 Group A.3 — escape URL single-quotes (mirrors src/db.py). safe_url = escape_sql_string_literal(url) # BQ-specific: refresh token from GCE metadata, create session-scoped # secret before ATTACH. Empty token_env (set by the BQ extractor) is # the contract that signals "use built-in metadata path". if extension == "bigquery": try: bq_token = get_metadata_token() except BQMetadataAuthError as e: logger.error( "Failed to fetch BQ metadata token for %s: %s — skipping ATTACH", alias, e, ) continue escaped = escape_sql_string_literal(bq_token) secret_name = f"bq_secret_{alias}" conn.execute( f"CREATE OR REPLACE SECRET {secret_name} " f"(TYPE bigquery, ACCESS_TOKEN '{escaped}')" ) from connectors.bigquery.access import apply_bq_session_settings apply_bq_session_settings(conn) conn.execute( f"ATTACH '{safe_url}' AS {alias} (TYPE {extension}, READ_ONLY)" ) elif token: escaped_token = escape_sql_string_literal(token) conn.execute( f"ATTACH '{safe_url}' AS {alias} (TYPE {extension}, TOKEN '{escaped_token}')" ) # Apply BQ session settings on every BQ-extension attach, # not only the metadata-token branch above. The token-based # branch previously fell through without calling # apply_bq_session_settings, leaving the 90 s extension # default for bq_query_timeout_ms in place. if extension == "bigquery": from connectors.bigquery.access import apply_bq_session_settings apply_bq_session_settings(conn) else: # No auth required (or extension handles it via env automatically) conn.execute( f"ATTACH '{safe_url}' AS {alias} (TYPE {extension}, READ_ONLY)" ) if extension == "bigquery": from connectors.bigquery.access import apply_bq_session_settings apply_bq_session_settings(conn) logger.info("Attached remote source %s via %s extension", alias, extension) except Exception as e: logger.error("Failed to attach remote source %s: %s", alias, e) def _update_sync_state(self, meta_rows: list, source_name: str) -> None: """Update sync_state table in system.duckdb from _meta entries. The hash stored here MUST match what `agnes pull` computes client-side via `cli/commands/sync.py:_md5_file` and what the materialized SQL path stores via `app/api/sync.py:_file_hash` — otherwise the CLI's post-download integrity check fails for every local-mode table with `hash mismatch: expected … got …`. That's a full content MD5 (`hashlib.md5(bytes).hexdigest()`), no truncation. Pre-fix this method computed `md5(f"{mtime_ns}:{size}")[:12]` — a fingerprint, not a content hash, and 12-char truncated to boot — which the CLI's full-32-char content MD5 could never match. Symptom: `agnes pull` failed with hash mismatch on every Keboola local-mode table because their sync_state hashes came from this path while their on-disk content was unrelated. """ try: from src.db import get_system_db from src.repositories.sync_state import SyncStateRepository extracts_dir = _get_extracts_dir() sys_conn = get_system_db() try: repo = SyncStateRepository(sys_conn) for table_name, rows, size_bytes, query_mode in meta_rows: pq_path = extracts_dir / source_name / "data" / f"{table_name}.parquet" file_hash = "" if pq_path.exists(): h = hashlib.md5() with open(pq_path, "rb") as f: for chunk in iter(lambda: f.read(8192), b""): h.update(chunk) file_hash = h.hexdigest() repo.update_sync( table_id=table_name, rows=rows or 0, file_size_bytes=size_bytes or 0, hash=file_hash, ) finally: sys_conn.close() except Exception as e: logger.warning("Could not update sync_state: %s", e)