agnes-the-ai-analyst/connectors/jira/scripts/backfill_remote_links.py
ZdenekSrotyr ed3e8337ab
fix(jira): harden _remote_links fetch — prevent transient outage from wiping parquet rows (#319)
* fix(jira): harden _remote_links fetch — transient API failure no longer wipes parquet rows

Pre-fix, all three fetch_remote_links call sites (service.py,
scripts/backfill.py, scripts/backfill_remote_links.py) silently
returned [] on 401/403/429/5xx or httpx.RequestError. Callers overlaid
that [] onto cached issue JSON, and transform_remote_links interpreted
the empty list as 'issue legitimately has no remote links — delete
existing rows', so a transient Jira auth blip permanently wiped
remote-link history.

Now:
- Every fetch site raises JiraFetchError on non-200/non-404 status,
  on httpx.RequestError, and on the 'service not configured' path.
- Overlay sites skip the _remote_links key when fetch raises, leaving
  it ABSENT (not present-but-empty).
- transform_remote_links returns None for absent/null keys (preserve
  existing rows) vs [] (legitimate empty — wipe).
- Both consumers (batch transform_all, incremental
  transform_single_issue) honor the new contract.
- End-to-end tests
  test_incremental_preserves_remote_links_when_overlay_absent and
  test_incremental_wipes_remote_links_when_overlay_present_but_empty
  lock both halves.

Adversarial-review fixes bundled:
- service.py: unconfigured-service path now raises JiraFetchError
  instead of returning [] (a webhook can arrive while API creds are
  missing — HMAC verification uses a separate JIRA_WEBHOOK_SECRET).
  Regression guard test_raises_when_unconfigured added.
- consistency_check.py: AUTO_FIX_THRESHOLD bumped 10 -> 20 to cover
  typical SLA-poller hiccups before escalating to ERROR.
- CLAUDE.md: connectors/jira/transform.py removed from 'Files NOT to
  modify' (overlay-contract change required touching it; module
  remains sensitive but is no longer off-limits).

* release: 0.54.19 — jira remote_links hardening (transient API failure no longer wipes parquet rows)
2026-05-15 19:09:46 +02:00

268 lines
8.6 KiB
Python

#!/usr/bin/env python3
"""
Jira Remote Links Backfill - Add _remote_links to existing issue JSONs.

One-time migration script that fetches remote links from the Jira API
and embeds them into existing issue JSON files. This enables the
Parquet transform to extract remote_links table data. Files that already
contain a _remote_links key are skipped, so the script can be re-run.

Usage:
    # On server (loads the first .env that exists: the file named by
    # AGNES_ENV_FILE, e.g. <install-dir>/.env, then ./.env in the current
    # directory, then the .env in the directory above scripts/):
    python -m connectors.jira.scripts.backfill_remote_links

    # With parallel workers:
    python -m connectors.jira.scripts.backfill_remote_links --parallel 4

    # Dry run:
    python -m connectors.jira.scripts.backfill_remote_links --dry-run

Environment variables (loaded from .env):
    JIRA_DOMAIN     - Jira Cloud domain
    JIRA_EMAIL      - Email for API authentication
    JIRA_API_TOKEN  - API token from Atlassian
    JIRA_DATA_DIR   - Directory for storing data (default: /data/src_data/raw/jira)
"""
import argparse
import json
import logging
import os
import sys
import tempfile
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
import httpx
from dotenv import load_dotenv
from app.logging_config import setup_logging
from connectors.jira.service import JiraFetchError
setup_logging(__name__)
logger = logging.getLogger(__name__)
def load_config() -> dict:
    """Assemble the backfill settings from the process environment.

    Before reading the settings, the first dotenv file that exists among
    these candidates (checked in this order) is loaded; the remaining
    candidates are ignored:

    1. the path in ``AGNES_ENV_FILE``, when that variable is set and
       non-empty. This lets a customer-specific install path
       (e.g. /opt/<deployment>/.env) be injected without editing the list;
    2. ``.env`` in the current working directory;
    3. ``.env`` in the parent of the directory containing this script.

    Returns:
        A dict with the keys ``domain``, ``email`` and ``api_token``
        (strings) and ``data_dir`` (a ``Path`` built from ``JIRA_DATA_DIR``,
        or ``/data/src_data/raw/jira`` when that variable is unset).

    Raises:
        ValueError: when JIRA_DOMAIN, JIRA_EMAIL or JIRA_API_TOKEN is
            unset or empty.
    """
    candidates = []
    override = os.environ.get("AGNES_ENV_FILE")
    if override:
        candidates.append(Path(os.environ["AGNES_ENV_FILE"]))
    candidates.append(Path.cwd() / ".env")
    candidates.append(Path(__file__).parent.parent / ".env")

    # Only the first existing candidate is loaded.
    dotenv_file = next((c for c in candidates if c.exists()), None)
    if dotenv_file is not None:
        load_dotenv(dotenv_file)
        logger.info(f"Loaded environment from {dotenv_file}")

    unset = []
    for name in ("JIRA_DOMAIN", "JIRA_EMAIL", "JIRA_API_TOKEN"):
        if not os.environ.get(name):
            unset.append(name)
    if unset:
        raise ValueError(f"Missing required environment variables: {', '.join(unset)}")

    env = os.environ
    config = {}
    config["domain"] = env["JIRA_DOMAIN"]
    config["email"] = env["JIRA_EMAIL"]
    config["api_token"] = env["JIRA_API_TOKEN"]
    config["data_dir"] = Path(env.get("JIRA_DATA_DIR", "/data/src_data/raw/jira"))
    return config
def _retry_after_seconds(header_value, default: int = 60) -> int:
    """Turn a Retry-After header value into a non-negative wait in seconds.

    Returns *default* when the header is missing or is not a plain integer
    (the HTTP-date form of Retry-After is not parsed). Negative values are
    clamped to 0 because ``time.sleep`` rejects them.
    """
    try:
        seconds = int(header_value)
    except (TypeError, ValueError):
        return default
    return max(0, seconds)


def fetch_remote_links(
    base_url: str,
    auth: tuple[str, str],
    issue_key: str,
    *,
    max_retries: int = 5,
) -> list[dict]:
    """Fetch remote links for a single issue from Jira API.

    Raises JiraFetchError on auth/server failure so the caller can
    skip the overlay rather than write [] and let the next incremental
    transform wipe existing parquet rows.

    Args:
        base_url: REST API root; ``/issue/<key>/remotelink`` is appended.
        auth: ``(email, api_token)`` pair passed to httpx as basic auth.
        issue_key: Key of the issue whose remote links are requested.
        max_retries: How many times a 429 response is retried (after
            sleeping for the Retry-After interval) before giving up.

    Returns:
        The decoded JSON list on HTTP 200, or ``[]`` on HTTP 404.

    Raises:
        JiraFetchError: on a connection error, on 401/403, on 5xx, on any
            other unexpected status, when rate limiting persists beyond
            ``max_retries``, or when a 200 response is not a JSON list.
    """
    url = f"{base_url}/issue/{issue_key}/remotelink"
    retries_left = max(0, max_retries)

    with httpx.Client(timeout=30) as client:
        # Bounded loop instead of recursion: a persistently rate-limited
        # endpoint must end in JiraFetchError, not in an endless wait.
        while True:
            try:
                response = client.get(
                    url,
                    auth=auth,
                    headers={"Accept": "application/json"},
                )
            except httpx.RequestError as e:
                raise JiraFetchError(
                    f"Targeted backfill remote-links fetch for {issue_key} failed: connection — {e}"
                ) from e

            if response.status_code != 429:
                break

            if retries_left <= 0:
                raise JiraFetchError(
                    f"Targeted backfill remote-links fetch for {issue_key} failed: rate limited "
                    f"(429) — gave up after {max_retries} retries"
                )
            retries_left -= 1
            retry_after = _retry_after_seconds(response.headers.get("Retry-After"))
            logger.warning(f"Rate limited, waiting {retry_after}s...")
            time.sleep(retry_after)

    status = response.status_code
    if status == 200:
        try:
            payload = response.json()
        except ValueError as e:
            raise JiraFetchError(
                f"Targeted backfill remote-links fetch for {issue_key} failed: "
                f"200 response is not valid JSON — {e}"
            ) from e
        # Only a list may be embedded as _remote_links; anything else is
        # treated as a failed fetch so the caller leaves the key absent.
        if not isinstance(payload, list):
            raise JiraFetchError(
                f"Targeted backfill remote-links fetch for {issue_key} failed: "
                f"expected a JSON list, got {type(payload).__name__}"
            )
        return payload
    if status == 404:
        return []
    if status in (401, 403):
        raise JiraFetchError(
            f"Targeted backfill remote-links fetch for {issue_key} failed: auth error "
            f"({status}) — token may be expired/revoked"
        )
    if status >= 500:
        raise JiraFetchError(
            f"Targeted backfill remote-links fetch for {issue_key} failed: server error "
            f"({status})"
        )
    raise JiraFetchError(
        f"Targeted backfill remote-links fetch for {issue_key} failed: unexpected status "
        f"{status}"
    )
def process_file(json_path: Path, base_url: str, auth: tuple[str, str]) -> str:
    """
    Process a single issue JSON file.

    Reads the file, fetches the issue's remote links and rewrites the file
    with the links stored under the ``_remote_links`` key.

    Args:
        json_path: Path of the issue JSON file to update in place.
        base_url: Jira REST API root passed through to fetch_remote_links.
        auth: ``(email, api_token)`` pair passed through to fetch_remote_links.

    Returns: "processed", "skipped", or "failed"
        - "skipped": the file already contains a ``_remote_links`` key.
        - "failed": the file has no ``key``, the fetch raised
          JiraFetchError, or any other error occurred. The file is left
          untouched in all of these cases.
        - "processed": the file was rewritten with ``_remote_links``.
    """
    try:
        # Issue JSON is read as UTF-8 regardless of the process locale.
        with open(json_path, encoding="utf-8") as f:
            data = json.load(f)

        # Skip if already has _remote_links
        if "_remote_links" in data:
            return "skipped"

        issue_key = data.get("key")
        if not issue_key:
            return "failed"

        # Fetch remote links. If fetch fails, return "failed" without writing
        # the JSON — leaves the file as-is (no _remote_links key) so the next
        # incremental transform preserves existing parquet rows.
        try:
            remote_links = fetch_remote_links(base_url, auth, issue_key)
        except JiraFetchError as e:
            logger.warning(
                f"Skipping _remote_links embed for {issue_key}: {e}. "
                f"Existing parquet rows will be preserved."
            )
            return "failed"

        # Embed in data
        data["_remote_links"] = remote_links

        # Atomic write: temp file in the same directory + replace
        fd, tmp_path = tempfile.mkstemp(dir=str(json_path.parent), suffix=".tmp")
        try:
            with os.fdopen(fd, "w", encoding="utf-8") as f:
                json.dump(data, f, indent=2, default=str)
            # mkstemp creates the file readable/writable by the owner only;
            # copy the original permission bits so the replace does not
            # silently lock other readers out of the issue JSON.
            os.chmod(tmp_path, os.stat(json_path).st_mode & 0o7777)
            os.replace(tmp_path, str(json_path))
        except Exception:
            # Best-effort cleanup of the temp file, then report the failure.
            try:
                os.unlink(tmp_path)
            except OSError:
                pass
            raise
        return "processed"
    except Exception as e:
        logger.error(f"Error processing {json_path.name}: {e}")
        return "failed"
def main():
    """Command-line entry point for the remote-links backfill.

    Parses ``--parallel``, ``--dry-run`` and ``--data-dir``, loads the Jira
    configuration, and then either reports how many issue JSON files would
    be touched (dry run) or runs process_file over every ``*.json`` file in
    ``<data-dir>/issues`` using a thread pool.

    Exits with status 1 when the issues directory is missing or when at
    least one file failed; exits with status 2 (argparse usage error) when
    ``--parallel`` is less than 1.
    """
    parser = argparse.ArgumentParser(
        description="Backfill _remote_links into existing Jira issue JSONs",
    )
    parser.add_argument(
        "--parallel",
        type=int,
        default=4,
        help="Number of parallel workers (default: 4)",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Only count files, don't fetch or modify",
    )
    parser.add_argument(
        "--data-dir",
        type=Path,
        help="Override data directory",
    )
    args = parser.parse_args()

    # ThreadPoolExecutor rejects max_workers < 1 with a bare ValueError;
    # report it as a usage error up front instead.
    if args.parallel < 1:
        parser.error("--parallel must be at least 1")

    config = load_config()
    data_dir = args.data_dir or config["data_dir"]
    issues_dir = data_dir / "issues"
    if not issues_dir.exists():
        logger.error(f"Issues directory not found: {issues_dir}")
        sys.exit(1)

    base_url = f"https://{config['domain']}/rest/api/3"
    auth = (config["email"], config["api_token"])

    # Enumerate JSON files
    json_files = list(issues_dir.glob("*.json"))
    total = len(json_files)
    logger.info(f"Found {total} issue JSON files in {issues_dir}")

    if args.dry_run:
        # Count how many already have _remote_links
        already_done = 0
        unreadable = 0
        for jf in json_files:
            try:
                with open(jf, encoding="utf-8") as f:
                    data = json.load(f)
                if "_remote_links" in data:
                    already_done += 1
            except Exception as e:
                # Still best-effort, but no longer silent: these files are
                # included in "Would process" and would fail in a real run.
                unreadable += 1
                logger.warning(f"Could not read {jf.name}: {e}")
        logger.info(f"Already have _remote_links: {already_done}")
        logger.info(f"Would process: {total - already_done}")
        if unreadable:
            logger.info(f"Unreadable (would fail): {unreadable}")
        return

    # Process files
    stats = {"processed": 0, "skipped": 0, "failed": 0}
    start_time = time.time()
    with ThreadPoolExecutor(max_workers=args.parallel) as executor:
        futures = {executor.submit(process_file, jf, base_url, auth): jf for jf in json_files}
        done_count = 0
        for future in as_completed(futures):
            done_count += 1
            # process_file returns one of the three keys of ``stats``.
            result = future.result()
            stats[result] += 1
            # Periodic progress line every 200 completed files.
            if done_count % 200 == 0:
                elapsed = time.time() - start_time
                rate = done_count / elapsed if elapsed > 0 else 0
                logger.info(
                    f"Progress: {done_count}/{total} "
                    f"({rate:.1f}/s) - "
                    f"processed: {stats['processed']}, "
                    f"skipped: {stats['skipped']}, "
                    f"failed: {stats['failed']}"
                )

    elapsed = time.time() - start_time
    logger.info("=" * 60)
    logger.info("Remote links backfill completed!")
    logger.info(f"Total files: {total}")
    logger.info(f"Processed: {stats['processed']}")
    logger.info(f"Skipped (already had _remote_links): {stats['skipped']}")
    logger.info(f"Failed: {stats['failed']}")
    logger.info(f"Time: {elapsed:.1f}s")
    logger.info("=" * 60)

    # Non-zero exit status signals that at least one file failed.
    if stats["failed"] > 0:
        sys.exit(1)
# Run the backfill only when this module is executed as a script
# (e.g. via ``python -m``), not when it is imported.
if __name__ == "__main__":
    main()