feat: adapt Jira connector to extract.duckdb format
- New extract_init.py: creates extract.duckdb with _meta + views for 6 entity types
- Update default paths to /data/extracts/jira/data/ and /data/extracts/jira/raw/
- After parquet writes, update _meta table in extract.duckdb
- Trigger SyncOrchestrator.rebuild_source("jira") after successful transform
This commit is contained in:
parent
1bf97c725c
commit
e058c71777
3 changed files with 138 additions and 2 deletions
121
connectors/jira/extract_init.py
Normal file
121
connectors/jira/extract_init.py
Normal file
|
|
@ -0,0 +1,121 @@
|
||||||
|
"""Initialize Jira extract.duckdb with _meta table and views for all entity types.
|
||||||
|
|
||||||
|
Called once on first webhook or manually via CLI. Creates the extract.duckdb
|
||||||
|
contract structure for the Jira connector.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import duckdb
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
JIRA_TABLES = ["issues", "comments", "attachments", "changelog", "issuelinks", "remote_links"]
|
||||||
|
|
||||||
|
|
||||||
|
def init_extract(output_dir: str | Path) -> None:
|
||||||
|
"""Create /data/extracts/jira/extract.duckdb with _meta and views.
|
||||||
|
|
||||||
|
Views point to monthly parquet partitions in data/{table}/*.parquet.
|
||||||
|
Safe to call multiple times — recreates _meta and views.
|
||||||
|
"""
|
||||||
|
output_path = Path(output_dir)
|
||||||
|
data_dir = output_path / "data"
|
||||||
|
data_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
db_path = output_path / "extract.duckdb"
|
||||||
|
conn = duckdb.connect(str(db_path))
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Create _meta table
|
||||||
|
conn.execute("DROP TABLE IF EXISTS _meta")
|
||||||
|
conn.execute("""CREATE TABLE _meta (
|
||||||
|
table_name VARCHAR NOT NULL,
|
||||||
|
description VARCHAR,
|
||||||
|
rows BIGINT,
|
||||||
|
size_bytes BIGINT,
|
||||||
|
extracted_at TIMESTAMP,
|
||||||
|
query_mode VARCHAR DEFAULT 'local'
|
||||||
|
)""")
|
||||||
|
|
||||||
|
now = datetime.now(timezone.utc)
|
||||||
|
for table_name in JIRA_TABLES:
|
||||||
|
table_dir = data_dir / table_name
|
||||||
|
table_dir.mkdir(exist_ok=True)
|
||||||
|
|
||||||
|
# Create view that reads all parquet files in the table directory
|
||||||
|
glob_path = str(table_dir / "*.parquet")
|
||||||
|
conn.execute(
|
||||||
|
f'CREATE OR REPLACE VIEW "{table_name}" AS '
|
||||||
|
f"SELECT * FROM read_parquet('{glob_path}', union_by_name=true, hive_partitioning=false)"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Count existing rows if any parquets exist
|
||||||
|
rows = 0
|
||||||
|
size_bytes = 0
|
||||||
|
parquets = list(table_dir.glob("*.parquet"))
|
||||||
|
if parquets:
|
||||||
|
try:
|
||||||
|
rows = conn.execute(f'SELECT count(*) FROM "{table_name}"').fetchone()[0]
|
||||||
|
size_bytes = sum(f.stat().st_size for f in parquets)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
conn.execute(
|
||||||
|
"INSERT INTO _meta VALUES (?, ?, ?, ?, ?, 'local')",
|
||||||
|
[table_name, f"Jira {table_name}", rows, size_bytes, now],
|
||||||
|
)
|
||||||
|
|
||||||
|
logger.info("Initialized Jira extract.duckdb at %s with %d tables", db_path, len(JIRA_TABLES))
|
||||||
|
finally:
|
||||||
|
conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
def update_meta(output_dir: str | Path, table_name: str) -> None:
|
||||||
|
"""Update _meta entry for a table after parquet write.
|
||||||
|
|
||||||
|
Called after incremental_transform writes/updates a parquet file.
|
||||||
|
"""
|
||||||
|
output_path = Path(output_dir)
|
||||||
|
db_path = output_path / "extract.duckdb"
|
||||||
|
|
||||||
|
if not db_path.exists():
|
||||||
|
init_extract(output_dir)
|
||||||
|
return
|
||||||
|
|
||||||
|
conn = duckdb.connect(str(db_path))
|
||||||
|
try:
|
||||||
|
table_dir = output_path / "data" / table_name
|
||||||
|
parquets = list(table_dir.glob("*.parquet"))
|
||||||
|
|
||||||
|
rows = 0
|
||||||
|
size_bytes = 0
|
||||||
|
if parquets:
|
||||||
|
try:
|
||||||
|
glob_path = str(table_dir / "*.parquet")
|
||||||
|
rows = conn.execute(f"SELECT count(*) FROM read_parquet('{glob_path}', union_by_name=true)").fetchone()[0]
|
||||||
|
size_bytes = sum(f.stat().st_size for f in parquets)
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning("Could not count rows for %s: %s", table_name, e)
|
||||||
|
|
||||||
|
now = datetime.now(timezone.utc)
|
||||||
|
conn.execute(
|
||||||
|
"UPDATE _meta SET rows = ?, size_bytes = ?, extracted_at = ? WHERE table_name = ?",
|
||||||
|
[rows, size_bytes, now, table_name],
|
||||||
|
)
|
||||||
|
finally:
|
||||||
|
conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
def get_default_output_dir() -> Path:
|
||||||
|
"""Get the default Jira extract output directory."""
|
||||||
|
data_dir = Path(os.environ.get("DATA_DIR", "/data"))
|
||||||
|
return data_dir / "extracts" / "jira"
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
logging.basicConfig(level=logging.INFO)
|
||||||
|
init_extract(get_default_output_dir())
|
||||||
|
|
@ -38,8 +38,8 @@ logging.basicConfig(level=logging.INFO)
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
# Default paths (can be overridden via environment)
|
# Default paths (can be overridden via environment)
|
||||||
DEFAULT_RAW_DIR = Path("/data/src_data/raw/jira")
|
DEFAULT_RAW_DIR = Path(os.environ.get("DATA_DIR", "/data")) / "extracts" / "jira" / "raw"
|
||||||
DEFAULT_OUTPUT_DIR = Path("/data/src_data/parquet/jira")
|
DEFAULT_OUTPUT_DIR = Path(os.environ.get("DATA_DIR", "/data")) / "extracts" / "jira" / "data"
|
||||||
|
|
||||||
|
|
||||||
def upsert_dataframe(
|
def upsert_dataframe(
|
||||||
|
|
@ -214,6 +214,15 @@ def transform_single_issue(
|
||||||
path = save_parquet_month(updated_remote_links, REMOTE_LINKS_SCHEMA, output_dir / "remote_links", month_key)
|
path = save_parquet_month(updated_remote_links, REMOTE_LINKS_SCHEMA, output_dir / "remote_links", month_key)
|
||||||
updated_paths.append(path)
|
updated_paths.append(path)
|
||||||
|
|
||||||
|
# Update extract.duckdb _meta for all affected tables
|
||||||
|
try:
|
||||||
|
from .extract_init import update_meta
|
||||||
|
extract_dir = output_dir.parent # output_dir is .../data, parent is .../jira
|
||||||
|
for table_name in ["issues", "comments", "attachments", "changelog", "issuelinks", "remote_links"]:
|
||||||
|
update_meta(extract_dir, table_name)
|
||||||
|
except Exception as meta_err:
|
||||||
|
logger.warning(f"Could not update extract.duckdb _meta: {meta_err}")
|
||||||
|
|
||||||
logger.info(f"Successfully updated {issue_key} in Parquet files")
|
logger.info(f"Successfully updated {issue_key} in Parquet files")
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -47,6 +47,12 @@ def trigger_incremental_transform(issue_key: str, deleted: bool = False) -> bool
|
||||||
|
|
||||||
if success:
|
if success:
|
||||||
logger.info(f"Incremental transform completed for {issue_key}")
|
logger.info(f"Incremental transform completed for {issue_key}")
|
||||||
|
# Rebuild Jira views in master analytics.duckdb
|
||||||
|
try:
|
||||||
|
from src.orchestrator import SyncOrchestrator
|
||||||
|
SyncOrchestrator().rebuild_source("jira")
|
||||||
|
except Exception as orch_err:
|
||||||
|
logger.warning(f"Orchestrator rebuild failed: {orch_err}")
|
||||||
else:
|
else:
|
||||||
logger.warning(f"Incremental transform failed for {issue_key}")
|
logger.warning(f"Incremental transform failed for {issue_key}")
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue