Open-source AI data analyst platform extracted from internal repo. Includes data sync engine, Keboola adapter, Flask web portal, server deployment scripts, and configuration templates.
252 lines
7.3 KiB
Python
252 lines
7.3 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Jira Remote Links Backfill - Add _remote_links to existing issue JSONs.
|
|
|
|
One-time migration script that fetches remote links from Jira API
|
|
and embeds them into existing issue JSON files. This enables the
|
|
Parquet transform to extract remote_links table data.
|
|
|
|
Usage:
|
|
# On server (uses /opt/data-analyst/.env):
|
|
python scripts/jira_backfill_remote_links.py
|
|
|
|
# With parallel workers:
|
|
python scripts/jira_backfill_remote_links.py --parallel 4
|
|
|
|
# Dry run:
|
|
python scripts/jira_backfill_remote_links.py --dry-run
|
|
|
|
Environment variables (loaded from .env):
|
|
JIRA_DOMAIN - Jira Cloud domain
|
|
JIRA_EMAIL - Email for API authentication
|
|
JIRA_API_TOKEN - API token from Atlassian
|
|
JIRA_DATA_DIR - Directory for storing data (default: /data/src_data/raw/jira)
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import logging
|
|
import os
|
|
import sys
|
|
import tempfile
|
|
import time
|
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
from pathlib import Path
|
|
|
|
import httpx
|
|
from dotenv import load_dotenv
|
|
|
|
logging.basicConfig(
|
|
level=logging.INFO,
|
|
format="%(asctime)s [%(levelname)s] %(message)s",
|
|
datefmt="%Y-%m-%d %H:%M:%S",
|
|
)
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def load_config() -> dict:
|
|
"""Load configuration from environment variables."""
|
|
env_paths = [
|
|
Path("/opt/data-analyst/.env"),
|
|
Path.cwd() / ".env",
|
|
Path(__file__).parent.parent / ".env",
|
|
]
|
|
for env_path in env_paths:
|
|
if env_path.exists():
|
|
load_dotenv(env_path)
|
|
logger.info(f"Loaded environment from {env_path}")
|
|
break
|
|
|
|
required = ["JIRA_DOMAIN", "JIRA_EMAIL", "JIRA_API_TOKEN"]
|
|
missing = [var for var in required if not os.environ.get(var)]
|
|
if missing:
|
|
raise ValueError(f"Missing required environment variables: {', '.join(missing)}")
|
|
|
|
return {
|
|
"domain": os.environ["JIRA_DOMAIN"],
|
|
"email": os.environ["JIRA_EMAIL"],
|
|
"api_token": os.environ["JIRA_API_TOKEN"],
|
|
"data_dir": Path(os.environ.get("JIRA_DATA_DIR", "/data/src_data/raw/jira")),
|
|
}
|
|
|
|
|
|
def fetch_remote_links(
|
|
base_url: str, auth: tuple[str, str], issue_key: str
|
|
) -> list[dict]:
|
|
"""Fetch remote links for a single issue from Jira API."""
|
|
url = f"{base_url}/issue/{issue_key}/remotelink"
|
|
|
|
try:
|
|
with httpx.Client(timeout=30) as client:
|
|
response = client.get(
|
|
url,
|
|
auth=auth,
|
|
headers={"Accept": "application/json"},
|
|
)
|
|
|
|
if response.status_code == 200:
|
|
return response.json()
|
|
elif response.status_code == 404:
|
|
return []
|
|
elif response.status_code == 429:
|
|
retry_after = int(response.headers.get("Retry-After", 60))
|
|
logger.warning(f"Rate limited, waiting {retry_after}s...")
|
|
time.sleep(retry_after)
|
|
return fetch_remote_links(base_url, auth, issue_key)
|
|
else:
|
|
logger.debug(
|
|
f"Failed to fetch remote links for {issue_key}: {response.status_code}"
|
|
)
|
|
return []
|
|
|
|
except httpx.RequestError as e:
|
|
logger.debug(f"Error fetching remote links for {issue_key}: {e}")
|
|
return []
|
|
|
|
|
|
def process_file(
|
|
json_path: Path, base_url: str, auth: tuple[str, str]
|
|
) -> str:
|
|
"""
|
|
Process a single issue JSON file.
|
|
|
|
Returns: "processed", "skipped", or "failed"
|
|
"""
|
|
try:
|
|
with open(json_path) as f:
|
|
data = json.load(f)
|
|
|
|
# Skip if already has _remote_links
|
|
if "_remote_links" in data:
|
|
return "skipped"
|
|
|
|
issue_key = data.get("key")
|
|
if not issue_key:
|
|
return "failed"
|
|
|
|
# Fetch remote links
|
|
remote_links = fetch_remote_links(base_url, auth, issue_key)
|
|
|
|
# Embed in data
|
|
data["_remote_links"] = remote_links
|
|
|
|
# Atomic write: temp file + replace
|
|
fd, tmp_path = tempfile.mkstemp(
|
|
dir=str(json_path.parent), suffix=".tmp"
|
|
)
|
|
try:
|
|
with os.fdopen(fd, "w") as f:
|
|
json.dump(data, f, indent=2, default=str)
|
|
os.replace(tmp_path, str(json_path))
|
|
except Exception:
|
|
try:
|
|
os.unlink(tmp_path)
|
|
except OSError:
|
|
pass
|
|
raise
|
|
|
|
return "processed"
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error processing {json_path.name}: {e}")
|
|
return "failed"
|
|
|
|
|
|
def main():
|
|
parser = argparse.ArgumentParser(
|
|
description="Backfill _remote_links into existing Jira issue JSONs",
|
|
)
|
|
parser.add_argument(
|
|
"--parallel",
|
|
type=int,
|
|
default=4,
|
|
help="Number of parallel workers (default: 4)",
|
|
)
|
|
parser.add_argument(
|
|
"--dry-run",
|
|
action="store_true",
|
|
help="Only count files, don't fetch or modify",
|
|
)
|
|
parser.add_argument(
|
|
"--data-dir",
|
|
type=Path,
|
|
help="Override data directory",
|
|
)
|
|
|
|
args = parser.parse_args()
|
|
|
|
config = load_config()
|
|
data_dir = args.data_dir or config["data_dir"]
|
|
issues_dir = data_dir / "issues"
|
|
|
|
if not issues_dir.exists():
|
|
logger.error(f"Issues directory not found: {issues_dir}")
|
|
sys.exit(1)
|
|
|
|
base_url = f"https://{config['domain']}/rest/api/3"
|
|
auth = (config["email"], config["api_token"])
|
|
|
|
# Enumerate JSON files
|
|
json_files = list(issues_dir.glob("*.json"))
|
|
total = len(json_files)
|
|
logger.info(f"Found {total} issue JSON files in {issues_dir}")
|
|
|
|
if args.dry_run:
|
|
# Count how many already have _remote_links
|
|
already_done = 0
|
|
for jf in json_files:
|
|
try:
|
|
with open(jf) as f:
|
|
data = json.load(f)
|
|
if "_remote_links" in data:
|
|
already_done += 1
|
|
except Exception:
|
|
pass
|
|
logger.info(f"Already have _remote_links: {already_done}")
|
|
logger.info(f"Would process: {total - already_done}")
|
|
return
|
|
|
|
# Process files
|
|
stats = {"processed": 0, "skipped": 0, "failed": 0}
|
|
start_time = time.time()
|
|
|
|
with ThreadPoolExecutor(max_workers=args.parallel) as executor:
|
|
futures = {
|
|
executor.submit(process_file, jf, base_url, auth): jf
|
|
for jf in json_files
|
|
}
|
|
|
|
done_count = 0
|
|
for future in as_completed(futures):
|
|
done_count += 1
|
|
result = future.result()
|
|
stats[result] += 1
|
|
|
|
if done_count % 200 == 0:
|
|
elapsed = time.time() - start_time
|
|
rate = done_count / elapsed if elapsed > 0 else 0
|
|
logger.info(
|
|
f"Progress: {done_count}/{total} "
|
|
f"({rate:.1f}/s) - "
|
|
f"processed: {stats['processed']}, "
|
|
f"skipped: {stats['skipped']}, "
|
|
f"failed: {stats['failed']}"
|
|
)
|
|
|
|
elapsed = time.time() - start_time
|
|
|
|
logger.info("=" * 60)
|
|
logger.info("Remote links backfill completed!")
|
|
logger.info(f"Total files: {total}")
|
|
logger.info(f"Processed: {stats['processed']}")
|
|
logger.info(f"Skipped (already had _remote_links): {stats['skipped']}")
|
|
logger.info(f"Failed: {stats['failed']}")
|
|
logger.info(f"Time: {elapsed:.1f}s")
|
|
logger.info("=" * 60)
|
|
|
|
if stats["failed"] > 0:
|
|
sys.exit(1)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|