* fix(jira): harden _remote_links fetch — transient API failure no longer wipes parquet rows Pre-fix, all three fetch_remote_links call sites (service.py, scripts/backfill.py, scripts/backfill_remote_links.py) silently returned [] on 401/403/429/5xx or httpx.RequestError. Callers overlaid that [] onto cached issue JSON, and transform_remote_links interpreted the empty list as 'issue legitimately has no remote links — delete existing rows', so a transient Jira auth blip permanently wiped remote-link history. Now: - Every fetch site raises JiraFetchError on non-200/non-404 status, on httpx.RequestError, and on the 'service not configured' path. - Overlay sites skip the _remote_links key when fetch raises, leaving it ABSENT (not present-but-empty). - transform_remote_links returns None for absent/null keys (preserve existing rows) vs [] (legitimate empty — wipe). - Both consumers (batch transform_all, incremental transform_single_issue) honor the new contract. - End-to-end tests test_incremental_preserves_remote_links_when_overlay_absent and test_incremental_wipes_remote_links_when_overlay_present_but_empty lock both halves. Adversarial-review fixes bundled: - service.py: unconfigured-service path now raises JiraFetchError instead of returning [] (a webhook can arrive while API creds are missing — HMAC verification uses a separate JIRA_WEBHOOK_SECRET). Regression guard test_raises_when_unconfigured added. - consistency_check.py: AUTO_FIX_THRESHOLD bumped 10 -> 20 to cover typical SLA-poller hiccups before escalating to ERROR. - CLAUDE.md: connectors/jira/transform.py removed from 'Files NOT to modify' (overlay-contract change required touching it; module remains sensitive but is no longer off-limits). * release: 0.54.19 — jira remote_links hardening (transient API failure no longer wipes parquet rows)
268 lines
8.6 KiB
Python
268 lines
8.6 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Jira Remote Links Backfill - Add _remote_links to existing issue JSONs.
|
|
|
|
One-time migration script that fetches remote links from Jira API
|
|
and embeds them into existing issue JSON files. This enables the
|
|
Parquet transform to extract remote_links table data.
|
|
|
|
Usage:
|
|
# On server (loads .env from <install-dir>/.env or the current directory):
|
|
python -m connectors.jira.scripts.backfill_remote_links
|
|
|
|
# With parallel workers:
|
|
python -m connectors.jira.scripts.backfill_remote_links --parallel 4
|
|
|
|
# Dry run:
|
|
python -m connectors.jira.scripts.backfill_remote_links --dry-run
|
|
|
|
Environment variables (loaded from .env):
|
|
JIRA_DOMAIN - Jira Cloud domain
|
|
JIRA_EMAIL - Email for API authentication
|
|
JIRA_API_TOKEN - API token from Atlassian
|
|
JIRA_DATA_DIR - Directory for storing data (default: /data/src_data/raw/jira)
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import logging
|
|
import os
|
|
import sys
|
|
import tempfile
|
|
import time
|
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
from pathlib import Path
|
|
|
|
import httpx
|
|
from dotenv import load_dotenv
|
|
|
|
from app.logging_config import setup_logging
|
|
from connectors.jira.service import JiraFetchError
|
|
|
|
# Initialise logging via the project helper before the module logger is used
# (presumably installs handlers/formatters — see app.logging_config to confirm).
setup_logging(__name__)
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def load_config() -> dict:
    """Build the backfill configuration from the process environment.

    Before reading any variable, the first existing ``.env`` candidate is
    loaded. Candidates are tried in this order:

    1. the path given in ``AGNES_ENV_FILE`` (only when that variable is set,
       which lets a deployment inject e.g. ``/opt/<deployment>/.env``),
    2. ``.env`` in the current working directory,
    3. ``.env`` in the parent of the directory containing this file.

    Returns:
        A dict with the keys ``domain``, ``email``, ``api_token`` (strings)
        and ``data_dir`` (a ``Path``; defaults to ``/data/src_data/raw/jira``
        when ``JIRA_DATA_DIR`` is unset).

    Raises:
        ValueError: if any of ``JIRA_DOMAIN``, ``JIRA_EMAIL`` or
            ``JIRA_API_TOKEN`` is missing or empty.
    """
    candidates = []
    override = os.environ.get("AGNES_ENV_FILE")
    if override:
        candidates.append(Path(override))
    candidates.append(Path.cwd() / ".env")
    candidates.append(Path(__file__).parent.parent / ".env")

    # Only the first candidate that exists is loaded; later ones are ignored.
    dotenv_file = next((c for c in candidates if c.exists()), None)
    if dotenv_file is not None:
        load_dotenv(dotenv_file)
        logger.info(f"Loaded environment from {dotenv_file}")

    env = os.environ
    missing = [
        name
        for name in ("JIRA_DOMAIN", "JIRA_EMAIL", "JIRA_API_TOKEN")
        if not env.get(name)
    ]
    if missing:
        raise ValueError(f"Missing required environment variables: {', '.join(missing)}")

    return {
        "domain": env["JIRA_DOMAIN"],
        "email": env["JIRA_EMAIL"],
        "api_token": env["JIRA_API_TOKEN"],
        "data_dir": Path(env.get("JIRA_DATA_DIR", "/data/src_data/raw/jira")),
    }
|
|
|
|
|
|
def fetch_remote_links(
    base_url: str,
    auth: tuple[str, str],
    issue_key: str,
    max_rate_limit_retries: int = 5,
) -> list[dict]:
    """Fetch the remote links of a single issue from the Jira REST API.

    Every failure is reported as JiraFetchError so the caller can skip the
    overlay rather than write [] and let the next incremental transform wipe
    existing parquet rows.

    Args:
        base_url: REST API root that ``/issue/<key>/remotelink`` is appended to.
        auth: Credentials tuple handed to httpx as ``auth=``.
        issue_key: Key of the issue whose remote links are requested.
        max_rate_limit_retries: Number of times an HTTP 429 answer is retried
            (after sleeping) before the fetch is abandoned.

    Returns:
        The decoded JSON list on HTTP 200, or ``[]`` on HTTP 404.

    Raises:
        JiraFetchError: on connection errors, 401/403, 5xx, any other
            unexpected status, a 429 that persists past the retry budget, or
            a 200 response whose body is not a JSON list.
    """
    url = f"{base_url}/issue/{issue_key}/remotelink"
    failure = f"Targeted backfill remote-links fetch for {issue_key} failed"
    rate_limit_hits = 0

    # Iterative retry: a persistent 429 must end in JiraFetchError instead of
    # recursing (and sleeping) without bound.
    while True:
        try:
            with httpx.Client(timeout=30) as client:
                response = client.get(
                    url,
                    auth=auth,
                    headers={"Accept": "application/json"},
                )
        except httpx.RequestError as e:
            raise JiraFetchError(f"{failure}: connection — {e}") from e

        if response.status_code != 429:
            break

        if rate_limit_hits >= max_rate_limit_retries:
            raise JiraFetchError(
                f"{failure}: rate limited (429), gave up after {rate_limit_hits} retries"
            )
        rate_limit_hits += 1

        # Retry-After may be delay-seconds or an HTTP-date; anything that is
        # not a plain integer falls back to the 60s default. Negative values
        # are clamped because time.sleep() rejects them.
        try:
            retry_after = max(0, int(response.headers.get("Retry-After", 60)))
        except (TypeError, ValueError):
            retry_after = 60
        logger.warning(f"Rate limited, waiting {retry_after}s...")
        time.sleep(retry_after)

    status = response.status_code
    if status == 200:
        try:
            payload = response.json()
        except ValueError as e:  # json.JSONDecodeError / UnicodeDecodeError
            raise JiraFetchError(f"{failure}: invalid JSON in 200 response — {e}") from e
        # Anything but a list would otherwise be embedded verbatim as
        # _remote_links by the caller.
        if not isinstance(payload, list):
            raise JiraFetchError(
                f"{failure}: expected a JSON array, got {type(payload).__name__}"
            )
        return payload
    if status == 404:
        return []
    if status in (401, 403):
        raise JiraFetchError(
            f"{failure}: auth error "
            f"({status}) — token may be expired/revoked"
        )
    if status >= 500:
        raise JiraFetchError(
            f"{failure}: server error "
            f"({status})"
        )
    raise JiraFetchError(
        f"{failure}: unexpected status "
        f"{status}"
    )
|
|
|
|
|
|
def process_file(json_path: Path, base_url: str, auth: tuple[str, str]) -> str:
    """
    Embed ``_remote_links`` into a single issue JSON file.

    Args:
        json_path: Path of the issue JSON file to update in place.
        base_url: REST API root forwarded to ``fetch_remote_links``.
        auth: Credentials tuple forwarded to ``fetch_remote_links``.

    Returns: "processed" when the file was rewritten with the fetched links,
    "skipped" when it already contains a ``_remote_links`` key, or "failed"
    when the issue has no ``key``, the fetch raised JiraFetchError, or any
    other error occurred. On "skipped" and "failed" the file is left untouched.
    """
    try:
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(json_path, encoding="utf-8") as f:
            data = json.load(f)

        # Skip if already has _remote_links
        if "_remote_links" in data:
            return "skipped"

        issue_key = data.get("key")
        if not issue_key:
            return "failed"

        # Fetch remote links. If fetch fails, return "failed" without writing
        # the JSON — leaves the file as-is (no _remote_links key) so the next
        # incremental transform preserves existing parquet rows.
        try:
            remote_links = fetch_remote_links(base_url, auth, issue_key)
        except JiraFetchError as e:
            logger.warning(
                f"Skipping _remote_links embed for {issue_key}: {e}. "
                f"Existing parquet rows will be preserved."
            )
            return "failed"

        # Embed in data
        data["_remote_links"] = remote_links

        # mkstemp() creates the temp file readable/writable by the owner only;
        # remember the original permission bits so the replacement does not
        # silently tighten the issue file's mode.
        original_mode = json_path.stat().st_mode & 0o7777

        # Atomic write: temp file in the same directory + replace
        fd, tmp_path = tempfile.mkstemp(dir=str(json_path.parent), suffix=".tmp")
        try:
            with os.fdopen(fd, "w", encoding="utf-8") as f:
                json.dump(data, f, indent=2, default=str)
            os.chmod(tmp_path, original_mode)
            os.replace(tmp_path, str(json_path))
        except Exception:
            # Best-effort cleanup of the temp file, then let the outer handler
            # report the original error.
            try:
                os.unlink(tmp_path)
            except OSError:
                pass
            raise

        return "processed"

    except Exception as e:
        logger.error(f"Error processing {json_path.name}: {e}")
        return "failed"
|
|
|
|
|
|
def main():
    """Command-line entry point for the remote-links backfill.

    Parses ``--parallel``, ``--dry-run`` and ``--data-dir``, loads the Jira
    configuration, and runs ``process_file`` over every ``*.json`` file in
    ``<data_dir>/issues`` using a thread pool. With ``--dry-run`` it only
    reports how many files already carry ``_remote_links``.

    Exits with status 1 when the issues directory does not exist or when at
    least one file ended up as "failed".
    """
    cli = argparse.ArgumentParser(
        description="Backfill _remote_links into existing Jira issue JSONs",
    )
    cli.add_argument(
        "--parallel", type=int, default=4, help="Number of parallel workers (default: 4)"
    )
    cli.add_argument(
        "--dry-run", action="store_true", help="Only count files, don't fetch or modify"
    )
    cli.add_argument("--data-dir", type=Path, help="Override data directory")
    args = cli.parse_args()

    config = load_config()
    issues_dir = (args.data_dir or config["data_dir"]) / "issues"
    if not issues_dir.exists():
        logger.error(f"Issues directory not found: {issues_dir}")
        sys.exit(1)

    base_url = f"https://{config['domain']}/rest/api/3"
    auth = (config["email"], config["api_token"])

    # Enumerate JSON files
    json_files = list(issues_dir.glob("*.json"))
    total = len(json_files)
    logger.info(f"Found {total} issue JSON files in {issues_dir}")

    if args.dry_run:

        def _already_backfilled(path):
            # Unreadable or malformed files are simply not counted as done.
            try:
                with open(path) as fh:
                    return "_remote_links" in json.load(fh)
            except Exception:
                return False

        already_done = sum(1 for jf in json_files if _already_backfilled(jf))
        logger.info(f"Already have _remote_links: {already_done}")
        logger.info(f"Would process: {total - already_done}")
        return

    # Process files
    stats = {"processed": 0, "skipped": 0, "failed": 0}
    start_time = time.time()

    with ThreadPoolExecutor(max_workers=args.parallel) as executor:
        pending = [executor.submit(process_file, jf, base_url, auth) for jf in json_files]

        for done_count, future in enumerate(as_completed(pending), start=1):
            stats[future.result()] += 1

            # Progress line every 200 completed files.
            if done_count % 200 != 0:
                continue
            elapsed = time.time() - start_time
            rate = done_count / elapsed if elapsed > 0 else 0
            logger.info(
                f"Progress: {done_count}/{total} "
                f"({rate:.1f}/s) - "
                f"processed: {stats['processed']}, "
                f"skipped: {stats['skipped']}, "
                f"failed: {stats['failed']}"
            )

    elapsed = time.time() - start_time

    banner = "=" * 60
    summary = (
        banner,
        "Remote links backfill completed!",
        f"Total files: {total}",
        f"Processed: {stats['processed']}",
        f"Skipped (already had _remote_links): {stats['skipped']}",
        f"Failed: {stats['failed']}",
        f"Time: {elapsed:.1f}s",
        banner,
    )
    for line in summary:
        logger.info(line)

    # Non-zero exit so wrappers/cron notice that a rerun is needed.
    if stats["failed"]:
        sys.exit(1)
|
|
|
|
|
|
# Run the backfill only when executed as a script (e.g. ``python -m ...``),
# never as a side effect of importing this module.
if __name__ == "__main__":
    main()
|