#!/usr/bin/env python3 """ Jira Remote Links Backfill - Add _remote_links to existing issue JSONs. One-time migration script that fetches remote links from Jira API and embeds them into existing issue JSON files. This enables the Parquet transform to extract remote_links table data. Usage: # On server (loads .env from /.env or the current directory): python -m connectors.jira.scripts.backfill_remote_links # With parallel workers: python -m connectors.jira.scripts.backfill_remote_links --parallel 4 # Dry run: python -m connectors.jira.scripts.backfill_remote_links --dry-run Environment variables (loaded from .env): JIRA_DOMAIN - Jira Cloud domain JIRA_EMAIL - Email for API authentication JIRA_API_TOKEN - API token from Atlassian JIRA_DATA_DIR - Directory for storing data (default: /data/src_data/raw/jira) """ import argparse import json import logging import os import sys import tempfile import time from concurrent.futures import ThreadPoolExecutor, as_completed from pathlib import Path import httpx from dotenv import load_dotenv from app.logging_config import setup_logging from connectors.jira.service import JiraFetchError setup_logging(__name__) logger = logging.getLogger(__name__) def load_config() -> dict: """Load configuration from environment variables.""" # Customer-specific install paths (e.g. /opt//.env) can be # injected via the AGNES_ENV_FILE env var without editing this list. env_paths = [ Path(os.environ["AGNES_ENV_FILE"]) if os.environ.get("AGNES_ENV_FILE") else None, Path.cwd() / ".env", Path(__file__).parent.parent / ".env", ] env_paths = [p for p in env_paths if p is not None] for env_path in env_paths: if env_path.exists(): load_dotenv(env_path) logger.info(f"Loaded environment from {env_path}") break required = ["JIRA_DOMAIN", "JIRA_EMAIL", "JIRA_API_TOKEN"] missing = [var for var in required if not os.environ.get(var)] if missing: raise ValueError(f"Missing required environment variables: {', '.join(missing)}") return { "domain": os.environ["JIRA_DOMAIN"], "email": os.environ["JIRA_EMAIL"], "api_token": os.environ["JIRA_API_TOKEN"], "data_dir": Path(os.environ.get("JIRA_DATA_DIR", "/data/src_data/raw/jira")), } def fetch_remote_links(base_url: str, auth: tuple[str, str], issue_key: str) -> list[dict]: """Fetch remote links for a single issue from Jira API. Raises JiraFetchError on auth/server failure so the caller can skip the overlay rather than write [] and let the next incremental transform wipe existing parquet rows. """ url = f"{base_url}/issue/{issue_key}/remotelink" try: with httpx.Client(timeout=30) as client: response = client.get( url, auth=auth, headers={"Accept": "application/json"}, ) except httpx.RequestError as e: raise JiraFetchError( f"Targeted backfill remote-links fetch for {issue_key} failed: connection — {e}" ) from e if response.status_code == 200: return response.json() if response.status_code == 404: return [] if response.status_code == 429: retry_after = int(response.headers.get("Retry-After", 60)) logger.warning(f"Rate limited, waiting {retry_after}s...") time.sleep(retry_after) return fetch_remote_links(base_url, auth, issue_key) if response.status_code in (401, 403): raise JiraFetchError( f"Targeted backfill remote-links fetch for {issue_key} failed: auth error " f"({response.status_code}) — token may be expired/revoked" ) if response.status_code >= 500: raise JiraFetchError( f"Targeted backfill remote-links fetch for {issue_key} failed: server error " f"({response.status_code})" ) raise JiraFetchError( f"Targeted backfill remote-links fetch for {issue_key} failed: unexpected status " f"{response.status_code}" ) def process_file(json_path: Path, base_url: str, auth: tuple[str, str]) -> str: """ Process a single issue JSON file. Returns: "processed", "skipped", or "failed" """ try: with open(json_path) as f: data = json.load(f) # Skip if already has _remote_links if "_remote_links" in data: return "skipped" issue_key = data.get("key") if not issue_key: return "failed" # Fetch remote links. If fetch fails, return "failed" without writing # the JSON — leaves the file as-is (no _remote_links key) so the next # incremental transform preserves existing parquet rows. try: remote_links = fetch_remote_links(base_url, auth, issue_key) except JiraFetchError as e: logger.warning( f"Skipping _remote_links embed for {issue_key}: {e}. " f"Existing parquet rows will be preserved." ) return "failed" # Embed in data data["_remote_links"] = remote_links # Atomic write: temp file + replace fd, tmp_path = tempfile.mkstemp(dir=str(json_path.parent), suffix=".tmp") try: with os.fdopen(fd, "w") as f: json.dump(data, f, indent=2, default=str) os.replace(tmp_path, str(json_path)) except Exception: try: os.unlink(tmp_path) except OSError: pass raise return "processed" except Exception as e: logger.error(f"Error processing {json_path.name}: {e}") return "failed" def main(): parser = argparse.ArgumentParser( description="Backfill _remote_links into existing Jira issue JSONs", ) parser.add_argument( "--parallel", type=int, default=4, help="Number of parallel workers (default: 4)", ) parser.add_argument( "--dry-run", action="store_true", help="Only count files, don't fetch or modify", ) parser.add_argument( "--data-dir", type=Path, help="Override data directory", ) args = parser.parse_args() config = load_config() data_dir = args.data_dir or config["data_dir"] issues_dir = data_dir / "issues" if not issues_dir.exists(): logger.error(f"Issues directory not found: {issues_dir}") sys.exit(1) base_url = f"https://{config['domain']}/rest/api/3" auth = (config["email"], config["api_token"]) # Enumerate JSON files json_files = list(issues_dir.glob("*.json")) total = len(json_files) logger.info(f"Found {total} issue JSON files in {issues_dir}") if args.dry_run: # Count how many already have _remote_links already_done = 0 for jf in json_files: try: with open(jf) as f: data = json.load(f) if "_remote_links" in data: already_done += 1 except Exception: pass logger.info(f"Already have _remote_links: {already_done}") logger.info(f"Would process: {total - already_done}") return # Process files stats = {"processed": 0, "skipped": 0, "failed": 0} start_time = time.time() with ThreadPoolExecutor(max_workers=args.parallel) as executor: futures = {executor.submit(process_file, jf, base_url, auth): jf for jf in json_files} done_count = 0 for future in as_completed(futures): done_count += 1 result = future.result() stats[result] += 1 if done_count % 200 == 0: elapsed = time.time() - start_time rate = done_count / elapsed if elapsed > 0 else 0 logger.info( f"Progress: {done_count}/{total} " f"({rate:.1f}/s) - " f"processed: {stats['processed']}, " f"skipped: {stats['skipped']}, " f"failed: {stats['failed']}" ) elapsed = time.time() - start_time logger.info("=" * 60) logger.info("Remote links backfill completed!") logger.info(f"Total files: {total}") logger.info(f"Processed: {stats['processed']}") logger.info(f"Skipped (already had _remote_links): {stats['skipped']}") logger.info(f"Failed: {stats['failed']}") logger.info(f"Time: {elapsed:.1f}s") logger.info("=" * 60) if stats["failed"] > 0: sys.exit(1) if __name__ == "__main__": main()