feat: adapt Jira connector to extract.duckdb format

- New extract_init.py: creates extract.duckdb with a _meta table and views for the 6 entity types
- Update default paths to /data/extracts/jira/data/ and /data/extracts/jira/raw/
- After parquet writes, update the _meta table in extract.duckdb
- Trigger SyncOrchestrator.rebuild_source("jira") after a successful transform
2026-03-30 20:19:27 +02:00 · 2026-03-30 20:19:27 +02:00 · e058c71777
commit e058c71777
parent 1bf97c725c
3 changed files with 138 additions and 2 deletions
--- a/connectors/jira/extract_init.py
+++ b/connectors/jira/extract_init.py
@ -0,0 +1,121 @@
+"""Initialize Jira extract.duckdb with _meta table and views for all entity types.
+
+Called once on first webhook or manually via CLI. Creates the extract.duckdb
+contract structure for the Jira connector.
+"""
+
+import logging
+import os
+from datetime import datetime, timezone
+from pathlib import Path
+
+import duckdb
+
+logger = logging.getLogger(__name__)
+
+# Jira entity types handled by this module: init_extract gives each one a
+# data/<name>/ directory, a view of the same name, and a row in _meta.
+JIRA_TABLES = ["issues", "comments", "attachments", "changelog", "issuelinks", "remote_links"]
+
+
+def init_extract(output_dir: str | Path) -> None:
+    """Create ``<output_dir>/extract.duckdb`` with a ``_meta`` table and one view per Jira table.
+
+    Each view reads the parquet files in ``<output_dir>/data/<table>/*.parquet``.
+    Safe to call multiple times: ``_meta`` is dropped and rebuilt, and views are
+    replaced in place.
+
+    A table whose view cannot be created, or whose rows cannot be counted, is
+    logged as a warning and still gets its ``_meta`` row (with zero rows/bytes
+    when counting failed), so one problematic table does not abort the rest.
+
+    Args:
+        output_dir: Jira extract root. ``data/`` and the per-table
+            subdirectories are created beneath it if missing.
+    """
+    output_path = Path(output_dir)
+    data_dir = output_path / "data"
+    data_dir.mkdir(parents=True, exist_ok=True)
+
+    db_path = output_path / "extract.duckdb"
+    conn = duckdb.connect(str(db_path))
+
+    try:
+        # Rebuild _meta from scratch so rows from a previous init never linger.
+        conn.execute("DROP TABLE IF EXISTS _meta")
+        conn.execute("""CREATE TABLE _meta (
+            table_name VARCHAR NOT NULL,
+            description VARCHAR,
+            rows BIGINT,
+            size_bytes BIGINT,
+            extracted_at TIMESTAMP,
+            query_mode VARCHAR DEFAULT 'local'
+        )""")
+
+        now = datetime.now(timezone.utc)
+        for table_name in JIRA_TABLES:
+            table_dir = data_dir / table_name
+            table_dir.mkdir(exist_ok=True)
+            parquets = list(table_dir.glob("*.parquet"))
+
+            # The path is spliced into a SQL string literal, so double any
+            # single quotes it contains.
+            glob_sql = str(table_dir / "*.parquet").replace("'", "''")
+            try:
+                conn.execute(
+                    f'CREATE OR REPLACE VIEW "{table_name}" AS '
+                    f"SELECT * FROM read_parquet('{glob_sql}', union_by_name=true, hive_partitioning=false)"
+                )
+            except duckdb.Error as exc:
+                # NOTE(review): DuckDB presumably resolves the glob when the view is
+                # defined, so an empty table directory can fail here - confirm.
+                # Keep going either way so every table still gets its _meta row.
+                logger.warning("Could not create view for %s: %s", table_name, exc)
+
+            # Count existing rows if any parquets exist
+            rows = 0
+            size_bytes = 0
+            if parquets:
+                try:
+                    rows = conn.execute(f'SELECT count(*) FROM "{table_name}"').fetchone()[0]
+                    size_bytes = sum(f.stat().st_size for f in parquets)
+                except Exception as exc:
+                    # Best effort: record what we have rather than failing init,
+                    # but leave a trace of why the numbers may be zero.
+                    logger.warning("Could not count rows for %s: %s", table_name, exc)
+
+            conn.execute(
+                "INSERT INTO _meta VALUES (?, ?, ?, ?, ?, 'local')",
+                [table_name, f"Jira {table_name}", rows, size_bytes, now],
+            )
+
+        logger.info("Initialized Jira extract.duckdb at %s with %d tables", db_path, len(JIRA_TABLES))
+    finally:
+        conn.close()
+
+
+def update_meta(output_dir: str | Path, table_name: str) -> None:
+    """Refresh the ``_meta`` row for ``table_name`` after a parquet write.
+
+    Called after incremental_transform writes/updates a parquet file.
+
+    If ``extract.duckdb`` does not exist yet, a full ``init_extract`` is run
+    instead. Otherwise the row count and total file size are recomputed from
+    ``<output_dir>/data/<table_name>/*.parquet`` and stored in ``_meta``; a row
+    is inserted when none exists for the table, so the update is never silently
+    lost. When parquet files are present, the table's view is also created if
+    it is missing.
+
+    Args:
+        output_dir: Jira extract root (the directory holding ``extract.duckdb``).
+        table_name: Name of the table directory under ``data/``.
+    """
+    output_path = Path(output_dir)
+    db_path = output_path / "extract.duckdb"
+
+    if not db_path.exists():
+        # First write: a full init creates the database and counts every table.
+        init_extract(output_dir)
+        return
+
+    table_dir = output_path / "data" / table_name
+    parquets = list(table_dir.glob("*.parquet"))
+
+    # The path is spliced into a SQL string literal, so double any single quotes.
+    # Reader options mirror the view definition used by init_extract.
+    glob_sql = str(table_dir / "*.parquet").replace("'", "''")
+    reader = f"read_parquet('{glob_sql}', union_by_name=true, hive_partitioning=false)"
+
+    conn = duckdb.connect(str(db_path))
+    try:
+        rows = 0
+        size_bytes = 0
+        if parquets:
+            try:
+                rows = conn.execute(f"SELECT count(*) FROM {reader}").fetchone()[0]
+                size_bytes = sum(f.stat().st_size for f in parquets)
+            except Exception as e:
+                logger.warning("Could not count rows for %s: %s", table_name, e)
+
+            # The view may be missing if the directory was empty (or init failed)
+            # when the database was created; add it now that files exist.
+            view_ident = table_name.replace('"', '""')
+            try:
+                conn.execute(f'CREATE VIEW IF NOT EXISTS "{view_ident}" AS SELECT * FROM {reader}')
+            except duckdb.Error as e:
+                logger.warning("Could not create view for %s: %s", table_name, e)
+
+        now = datetime.now(timezone.utc)
+        existing = conn.execute(
+            "SELECT 1 FROM _meta WHERE table_name = ? LIMIT 1",
+            [table_name],
+        ).fetchone()
+        if existing:
+            conn.execute(
+                "UPDATE _meta SET rows = ?, size_bytes = ?, extracted_at = ? WHERE table_name = ?",
+                [rows, size_bytes, now, table_name],
+            )
+        else:
+            # No row yet: insert one using the same shape init_extract writes,
+            # instead of letting the UPDATE match nothing.
+            conn.execute(
+                "INSERT INTO _meta VALUES (?, ?, ?, ?, ?, 'local')",
+                [table_name, f"Jira {table_name}", rows, size_bytes, now],
+            )
+    finally:
+        conn.close()
+
+
+def get_default_output_dir() -> Path:
+    """Return ``<DATA_DIR>/extracts/jira``, where ``DATA_DIR`` defaults to ``/data``."""
+    root = os.environ.get("DATA_DIR", "/data")
+    return Path(root, "extracts", "jira")
+
+
+if __name__ == "__main__":
+    # Manual CLI entry point: build the extract at the default location.
+    logging.basicConfig(level=logging.INFO)
+    default_dir = get_default_output_dir()
+    init_extract(default_dir)
--- a/connectors/jira/incremental_transform.py
+++ b/connectors/jira/incremental_transform.py
@ -38,8 +38,8 @@ logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)

 # Default paths (can be overridden via environment)
-DEFAULT_RAW_DIR = Path("/data/src_data/raw/jira")
-DEFAULT_OUTPUT_DIR = Path("/data/src_data/parquet/jira")
+DEFAULT_RAW_DIR = Path(os.environ.get("DATA_DIR", "/data")) / "extracts" / "jira" / "raw"
+DEFAULT_OUTPUT_DIR = Path(os.environ.get("DATA_DIR", "/data")) / "extracts" / "jira" / "data"


 def upsert_dataframe(
@ -214,6 +214,15 @@ def transform_single_issue(
            path = save_parquet_month(updated_remote_links, REMOTE_LINKS_SCHEMA, output_dir / "remote_links", month_key)
            updated_paths.append(path)

+        # Update extract.duckdb _meta for all affected tables
+        try:
+            from .extract_init import update_meta
+            extract_dir = output_dir.parent  # output_dir is .../data, parent is .../jira
+            for table_name in ["issues", "comments", "attachments", "changelog", "issuelinks", "remote_links"]:
+                update_meta(extract_dir, table_name)
+        except Exception as meta_err:
+            logger.warning(f"Could not update extract.duckdb _meta: {meta_err}")
+
        logger.info(f"Successfully updated {issue_key} in Parquet files")
        return True

--- a/connectors/jira/service.py
+++ b/connectors/jira/service.py
@ -47,6 +47,12 @@ def trigger_incremental_transform(issue_key: str, deleted: bool = False) -> bool

        if success:
            logger.info(f"Incremental transform completed for {issue_key}")
+            # Rebuild Jira views in master analytics.duckdb
+            try:
+                from src.orchestrator import SyncOrchestrator
+                SyncOrchestrator().rebuild_source("jira")
+            except Exception as orch_err:
+                logger.warning(f"Orchestrator rebuild failed: {orch_err}")
        else:
            logger.warning(f"Incremental transform failed for {issue_key}")