Move all Jira-specific code into a self-contained connector module: - 22 files moved via git mv (transform, service, webhook, scripts, systemd units, tests, docs, bin helper) - All imports updated to use connectors.jira.* paths - Jira is now conditional: auto-detected via JIRA_DOMAIN env var - Webapp registers Jira blueprint only when available - Health service monitors Jira timers only when enabled - Profiler loads Jira tables dynamically from filesystem - Sync settings uses config-driven dependency validation - Renamed keboola_platform_url -> custom_url in transform - Updated deploy.sh, sudoers-deploy, backfill_gap.sh paths - Fixed pytest.ini to skip live tests by default
643 lines
21 KiB
Python
Executable file
643 lines
21 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
Jira Backfill Script - Download all historical Jira issues.
|
|
|
|
Downloads all issues from Jira using JQL search with pagination.
|
|
Reuses the webapp's JiraService for consistent data handling.
|
|
|
|
Usage:
|
|
# On server (uses /opt/data-analyst/.env):
|
|
python -m connectors.jira.scripts.backfill
|
|
|
|
# With custom settings:
|
|
python -m connectors.jira.scripts.backfill --jql "project = MY_PROJECT AND created >= 2025-01-01"
|
|
|
|
# Skip already downloaded issues:
|
|
python -m connectors.jira.scripts.backfill --skip-existing
|
|
|
|
# Dry run (show what would be downloaded):
|
|
python -m connectors.jira.scripts.backfill --dry-run
|
|
|
|
Environment variables (loaded from .env or set manually):
|
|
JIRA_DOMAIN - Jira Cloud domain (e.g., your-org.atlassian.net)
|
|
JIRA_EMAIL - Email for API authentication
|
|
JIRA_API_TOKEN - API token from Atlassian
|
|
JIRA_DATA_DIR - Directory for storing data (default: /data/src_data/raw/jira)
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import logging
|
|
import os
|
|
import sys
|
|
import time
|
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
from dataclasses import dataclass
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
from typing import Iterator
|
|
|
|
import httpx
|
|
from dotenv import load_dotenv
|
|
|
|
# Configure logging
|
|
logging.basicConfig(
|
|
level=logging.INFO,
|
|
format="%(asctime)s [%(levelname)s] %(message)s",
|
|
datefmt="%Y-%m-%d %H:%M:%S",
|
|
)
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@dataclass
|
|
class Config:
|
|
"""Configuration loaded from environment."""
|
|
|
|
jira_domain: str
|
|
jira_email: str
|
|
jira_api_token: str
|
|
data_dir: Path
|
|
|
|
@classmethod
|
|
def from_env(cls) -> "Config":
|
|
"""Load configuration from environment variables."""
|
|
# Try to load .env file from common locations
|
|
env_paths = [
|
|
Path("/opt/data-analyst/.env"),
|
|
Path.cwd() / ".env",
|
|
Path(__file__).parent.parent / ".env",
|
|
]
|
|
for env_path in env_paths:
|
|
if env_path.exists():
|
|
load_dotenv(env_path)
|
|
logger.info(f"Loaded environment from {env_path}")
|
|
break
|
|
|
|
# Validate required variables
|
|
required = ["JIRA_DOMAIN", "JIRA_EMAIL", "JIRA_API_TOKEN"]
|
|
missing = [var for var in required if not os.environ.get(var)]
|
|
if missing:
|
|
raise ValueError(f"Missing required environment variables: {', '.join(missing)}")
|
|
|
|
return cls(
|
|
jira_domain=os.environ["JIRA_DOMAIN"],
|
|
jira_email=os.environ["JIRA_EMAIL"],
|
|
jira_api_token=os.environ["JIRA_API_TOKEN"],
|
|
data_dir=Path(os.environ.get("JIRA_DATA_DIR", "/data/src_data/raw/jira")),
|
|
)
|
|
|
|
|
|
class JiraBackfill:
|
|
"""Backfill handler for downloading all Jira issues."""
|
|
|
|
# Jira API limits
|
|
MAX_RESULTS_PER_PAGE = 100
|
|
MAX_ATTACHMENT_SIZE = 50 * 1024 * 1024 # 50 MB
|
|
|
|
def __init__(self, config: Config):
|
|
self.config = config
|
|
self.base_url = f"https://{config.jira_domain}/rest/api/3"
|
|
self.auth = (config.jira_email, config.jira_api_token)
|
|
self.issues_dir = config.data_dir / "issues"
|
|
self.attachments_dir = config.data_dir / "attachments"
|
|
|
|
# Ensure directories exist
|
|
self.issues_dir.mkdir(parents=True, exist_ok=True)
|
|
self.attachments_dir.mkdir(parents=True, exist_ok=True)
|
|
|
|
# Statistics
|
|
self.stats = {
|
|
"searched": 0,
|
|
"downloaded": 0,
|
|
"skipped": 0,
|
|
"failed": 0,
|
|
"attachments": 0,
|
|
}
|
|
|
|
def search_issues(self, jql: str, next_page_token: str | None = None) -> dict:
|
|
"""
|
|
Search for issues using JQL (new /search/jql endpoint).
|
|
|
|
Args:
|
|
jql: JQL query string
|
|
next_page_token: Pagination token from previous response
|
|
|
|
Returns:
|
|
Search results dict with issues and nextPageToken
|
|
"""
|
|
url = f"{self.base_url}/search/jql"
|
|
payload = {
|
|
"jql": jql,
|
|
"maxResults": self.MAX_RESULTS_PER_PAGE,
|
|
"fields": ["key"], # Only need keys, we'll fetch full data separately
|
|
}
|
|
|
|
if next_page_token:
|
|
payload["nextPageToken"] = next_page_token
|
|
|
|
with httpx.Client(timeout=60) as client:
|
|
response = client.post(
|
|
url,
|
|
auth=self.auth,
|
|
json=payload,
|
|
headers={"Content-Type": "application/json"},
|
|
)
|
|
|
|
if response.status_code != 200:
|
|
raise RuntimeError(f"Search failed: {response.status_code} - {response.text[:200]}")
|
|
|
|
return response.json()
|
|
|
|
def iter_issue_keys(self, jql: str) -> Iterator[str]:
|
|
"""
|
|
Iterate over all issue keys matching JQL query.
|
|
|
|
Handles pagination automatically using nextPageToken.
|
|
|
|
Args:
|
|
jql: JQL query string
|
|
|
|
Yields:
|
|
Issue keys (e.g., "PROJ-15190")
|
|
"""
|
|
next_page_token = None
|
|
total_fetched = 0
|
|
first_page = True
|
|
|
|
while True:
|
|
result = self.search_issues(jql, next_page_token)
|
|
|
|
if first_page:
|
|
# Note: new API doesn't return total, we discover it as we paginate
|
|
logger.info(f"Starting search with JQL: {jql}")
|
|
first_page = False
|
|
|
|
issues = result.get("issues", [])
|
|
if not issues:
|
|
break
|
|
|
|
for issue in issues:
|
|
yield issue["key"]
|
|
|
|
total_fetched += len(issues)
|
|
self.stats["searched"] = total_fetched
|
|
|
|
# Progress logging
|
|
if total_fetched % 500 == 0:
|
|
logger.info(f"Enumerated {total_fetched} issues...")
|
|
|
|
# Check for next page
|
|
next_page_token = result.get("nextPageToken")
|
|
if not next_page_token:
|
|
break
|
|
|
|
# Respect rate limits
|
|
time.sleep(0.1)
|
|
|
|
logger.info(f"Found {total_fetched} issues total")
|
|
|
|
def fetch_issue(self, issue_key: str) -> dict | None:
|
|
"""
|
|
Fetch complete issue data from Jira.
|
|
|
|
Args:
|
|
issue_key: Issue key (e.g., "PROJ-123")
|
|
|
|
Returns:
|
|
Issue data dict or None if fetch failed
|
|
"""
|
|
url = f"{self.base_url}/issue/{issue_key}"
|
|
params = {
|
|
"expand": "renderedFields,changelog",
|
|
"fields": "*all",
|
|
}
|
|
|
|
try:
|
|
with httpx.Client(timeout=30) as client:
|
|
response = client.get(
|
|
url,
|
|
auth=self.auth,
|
|
params=params,
|
|
headers={"Accept": "application/json"},
|
|
)
|
|
|
|
if response.status_code == 200:
|
|
return response.json()
|
|
elif response.status_code == 404:
|
|
logger.warning(f"Issue {issue_key} not found")
|
|
return None
|
|
elif response.status_code == 429:
|
|
# Rate limited - wait and retry
|
|
retry_after = int(response.headers.get("Retry-After", 60))
|
|
logger.warning(f"Rate limited, waiting {retry_after}s...")
|
|
time.sleep(retry_after)
|
|
return self.fetch_issue(issue_key) # Retry
|
|
else:
|
|
logger.error(f"Failed to fetch {issue_key}: {response.status_code}")
|
|
return None
|
|
|
|
except httpx.RequestError as e:
|
|
logger.error(f"Request error fetching {issue_key}: {e}")
|
|
return None
|
|
|
|
def fetch_remote_links(self, issue_key: str) -> list[dict]:
|
|
"""
|
|
Fetch remote links for an issue from Jira.
|
|
|
|
Args:
|
|
issue_key: Issue key (e.g., "PROJ-123")
|
|
|
|
Returns:
|
|
List of remote link dicts, empty list on failure
|
|
"""
|
|
url = f"{self.base_url}/issue/{issue_key}/remotelink"
|
|
|
|
try:
|
|
with httpx.Client(timeout=30) as client:
|
|
response = client.get(
|
|
url,
|
|
auth=self.auth,
|
|
headers={"Accept": "application/json"},
|
|
)
|
|
|
|
if response.status_code == 200:
|
|
return response.json()
|
|
elif response.status_code == 404:
|
|
return []
|
|
elif response.status_code == 429:
|
|
retry_after = int(response.headers.get("Retry-After", 60))
|
|
logger.warning(f"Rate limited on remote links, waiting {retry_after}s...")
|
|
time.sleep(retry_after)
|
|
return self.fetch_remote_links(issue_key)
|
|
else:
|
|
logger.debug(f"Failed to fetch remote links for {issue_key}: {response.status_code}")
|
|
return []
|
|
|
|
except httpx.RequestError as e:
|
|
logger.debug(f"Error fetching remote links for {issue_key}: {e}")
|
|
return []
|
|
|
|
def save_issue(self, issue_data: dict) -> Path | None:
|
|
"""
|
|
Save issue data to JSON file.
|
|
|
|
Args:
|
|
issue_data: Complete issue data from Jira API
|
|
|
|
Returns:
|
|
Path to saved file or None if save failed
|
|
"""
|
|
issue_key = issue_data.get("key")
|
|
if not issue_key:
|
|
return None
|
|
|
|
# Add sync metadata
|
|
issue_data["_synced_at"] = datetime.utcnow().isoformat()
|
|
|
|
file_path = self.issues_dir / f"{issue_key}.json"
|
|
|
|
try:
|
|
with open(file_path, "w") as f:
|
|
json.dump(issue_data, f, indent=2, default=str)
|
|
return file_path
|
|
except Exception as e:
|
|
logger.error(f"Failed to save {issue_key}: {e}")
|
|
return None
|
|
|
|
def download_attachment(self, attachment: dict, issue_key: str) -> Path | None:
|
|
"""
|
|
Download a single attachment.
|
|
|
|
Args:
|
|
attachment: Attachment metadata from Jira
|
|
issue_key: Issue key for organizing files
|
|
|
|
Returns:
|
|
Path to downloaded file or None if failed
|
|
"""
|
|
content_url = attachment.get("content")
|
|
filename = attachment.get("filename", "unknown")
|
|
size = attachment.get("size", 0)
|
|
attachment_id = attachment.get("id", "unknown")
|
|
|
|
if not content_url:
|
|
return None
|
|
|
|
# Skip large attachments
|
|
if size > self.MAX_ATTACHMENT_SIZE:
|
|
logger.debug(f"Skipping large attachment {filename} ({size} bytes)")
|
|
return None
|
|
|
|
# Create issue-specific directory
|
|
issue_attachments_dir = self.attachments_dir / issue_key
|
|
issue_attachments_dir.mkdir(parents=True, exist_ok=True)
|
|
|
|
safe_filename = f"{attachment_id}_{filename}"
|
|
file_path = issue_attachments_dir / safe_filename
|
|
|
|
# Skip if already downloaded
|
|
if file_path.exists():
|
|
return file_path
|
|
|
|
try:
|
|
with httpx.Client(timeout=60, follow_redirects=True) as client:
|
|
response = client.get(content_url, auth=self.auth)
|
|
|
|
if response.status_code == 200:
|
|
with open(file_path, "wb") as f:
|
|
f.write(response.content)
|
|
return file_path
|
|
else:
|
|
logger.debug(f"Failed to download {filename}: {response.status_code}")
|
|
return None
|
|
|
|
except httpx.RequestError as e:
|
|
logger.debug(f"Error downloading {filename}: {e}")
|
|
return None
|
|
|
|
def download_issue_attachments(self, issue_data: dict) -> int:
|
|
"""
|
|
Download all attachments for an issue.
|
|
|
|
Args:
|
|
issue_data: Complete issue data
|
|
|
|
Returns:
|
|
Number of attachments downloaded
|
|
"""
|
|
issue_key = issue_data.get("key", "unknown")
|
|
attachments = issue_data.get("fields", {}).get("attachment", [])
|
|
|
|
downloaded = 0
|
|
for attachment in attachments:
|
|
if self.download_attachment(attachment, issue_key):
|
|
downloaded += 1
|
|
|
|
return downloaded
|
|
|
|
def process_issue(self, issue_key: str, skip_existing: bool = True) -> bool:
|
|
"""
|
|
Fetch and save a single issue with attachments.
|
|
|
|
Args:
|
|
issue_key: Issue key to process
|
|
skip_existing: Skip if JSON already exists
|
|
|
|
Returns:
|
|
True if successful, False otherwise
|
|
"""
|
|
# Check if already downloaded
|
|
json_path = self.issues_dir / f"{issue_key}.json"
|
|
if skip_existing and json_path.exists():
|
|
self.stats["skipped"] += 1
|
|
return True
|
|
|
|
# Fetch issue
|
|
issue_data = self.fetch_issue(issue_key)
|
|
if not issue_data:
|
|
self.stats["failed"] += 1
|
|
return False
|
|
|
|
# Fetch and embed remote links for Parquet transform
|
|
issue_data["_remote_links"] = self.fetch_remote_links(issue_key)
|
|
|
|
# Save JSON
|
|
if not self.save_issue(issue_data):
|
|
self.stats["failed"] += 1
|
|
return False
|
|
|
|
# Download attachments
|
|
num_attachments = self.download_issue_attachments(issue_data)
|
|
self.stats["attachments"] += num_attachments
|
|
self.stats["downloaded"] += 1
|
|
|
|
return True
|
|
|
|
def run(
|
|
self,
|
|
jql: str = "ORDER BY created ASC",
|
|
skip_existing: bool = True,
|
|
dry_run: bool = False,
|
|
parallel: int = 4,
|
|
) -> dict:
|
|
"""
|
|
Run the backfill process.
|
|
|
|
Args:
|
|
jql: JQL query for selecting issues
|
|
skip_existing: Skip issues that already have JSON files
|
|
dry_run: Only enumerate issues, don't download
|
|
parallel: Number of parallel download threads
|
|
|
|
Returns:
|
|
Statistics dict
|
|
"""
|
|
logger.info(f"Starting Jira backfill")
|
|
logger.info(f"JQL: {jql}")
|
|
logger.info(f"Skip existing: {skip_existing}")
|
|
logger.info(f"Dry run: {dry_run}")
|
|
logger.info(f"Data directory: {self.config.data_dir}")
|
|
|
|
start_time = time.time()
|
|
|
|
# Collect all issue keys first
|
|
issue_keys = list(self.iter_issue_keys(jql))
|
|
total_issues = len(issue_keys)
|
|
|
|
logger.info(f"Total issues to process: {total_issues}")
|
|
|
|
if dry_run:
|
|
logger.info("Dry run mode - not downloading any data")
|
|
# Count existing
|
|
existing = sum(1 for k in issue_keys if (self.issues_dir / f"{k}.json").exists())
|
|
logger.info(f"Already downloaded: {existing}")
|
|
logger.info(f"Would download: {total_issues - existing}")
|
|
return {"total": total_issues, "existing": existing}
|
|
|
|
# Process issues in parallel
|
|
processed = 0
|
|
with ThreadPoolExecutor(max_workers=parallel) as executor:
|
|
# Submit all tasks
|
|
futures = {
|
|
executor.submit(self.process_issue, key, skip_existing): key
|
|
for key in issue_keys
|
|
}
|
|
|
|
# Process as completed
|
|
for future in as_completed(futures):
|
|
issue_key = futures[future]
|
|
processed += 1
|
|
|
|
try:
|
|
success = future.result()
|
|
except Exception as e:
|
|
logger.error(f"Error processing {issue_key}: {e}")
|
|
self.stats["failed"] += 1
|
|
|
|
# Progress logging
|
|
if processed % 100 == 0:
|
|
elapsed = time.time() - start_time
|
|
rate = processed / elapsed if elapsed > 0 else 0
|
|
logger.info(
|
|
f"Progress: {processed}/{total_issues} "
|
|
f"({rate:.1f}/s) - "
|
|
f"downloaded: {self.stats['downloaded']}, "
|
|
f"skipped: {self.stats['skipped']}, "
|
|
f"failed: {self.stats['failed']}"
|
|
)
|
|
|
|
elapsed = time.time() - start_time
|
|
|
|
# Final summary
|
|
logger.info("=" * 60)
|
|
logger.info("Backfill completed!")
|
|
logger.info(f"Total issues: {total_issues}")
|
|
logger.info(f"Downloaded: {self.stats['downloaded']}")
|
|
logger.info(f"Skipped (existing): {self.stats['skipped']}")
|
|
logger.info(f"Failed: {self.stats['failed']}")
|
|
logger.info(f"Attachments: {self.stats['attachments']}")
|
|
logger.info(f"Time: {elapsed:.1f}s ({total_issues/elapsed:.1f} issues/s)")
|
|
logger.info("=" * 60)
|
|
|
|
return self.stats
|
|
|
|
|
|
def main():
|
|
parser = argparse.ArgumentParser(
|
|
description="Download all Jira issues",
|
|
formatter_class=argparse.RawDescriptionHelpFormatter,
|
|
epilog=__doc__,
|
|
)
|
|
parser.add_argument(
|
|
"--jql",
|
|
default="ORDER BY created ASC",
|
|
help="JQL query for selecting issues (e.g., 'project = \"My Project\" ORDER BY created ASC')",
|
|
)
|
|
parser.add_argument(
|
|
"--skip-existing",
|
|
action="store_true",
|
|
default=True,
|
|
help="Skip issues that already have JSON files (default: True)",
|
|
)
|
|
parser.add_argument(
|
|
"--no-skip-existing",
|
|
action="store_false",
|
|
dest="skip_existing",
|
|
help="Re-download all issues even if they exist",
|
|
)
|
|
parser.add_argument(
|
|
"--dry-run",
|
|
action="store_true",
|
|
help="Only count issues, don't download",
|
|
)
|
|
parser.add_argument(
|
|
"--parallel",
|
|
type=int,
|
|
default=4,
|
|
help="Number of parallel download threads (default: 4)",
|
|
)
|
|
parser.add_argument(
|
|
"--data-dir",
|
|
type=Path,
|
|
help="Override data directory",
|
|
)
|
|
parser.add_argument(
|
|
"--issue-keys",
|
|
help="Comma-separated list of specific issue keys to backfill (e.g., PROJ-123,PROJ-456)",
|
|
)
|
|
|
|
args = parser.parse_args()
|
|
|
|
try:
|
|
config = Config.from_env()
|
|
|
|
# Override data dir if specified
|
|
if args.data_dir:
|
|
config.data_dir = args.data_dir
|
|
|
|
backfill = JiraBackfill(config)
|
|
|
|
# Targeted backfill mode (specific issue keys)
|
|
if args.issue_keys:
|
|
issue_keys = [key.strip() for key in args.issue_keys.split(",")]
|
|
logger.info(f"Targeted backfill mode: {len(issue_keys)} issues")
|
|
|
|
if args.dry_run:
|
|
logger.info("Dry run mode - not downloading any data")
|
|
existing = sum(
|
|
1 for k in issue_keys
|
|
if (backfill.issues_dir / f"{k}.json").exists()
|
|
)
|
|
logger.info(f"Already downloaded: {existing}")
|
|
logger.info(f"Would download: {len(issue_keys) - existing}")
|
|
sys.exit(0)
|
|
|
|
# Process each issue
|
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
|
|
start_time = time.time()
|
|
processed = 0
|
|
|
|
with ThreadPoolExecutor(max_workers=args.parallel) as executor:
|
|
futures = {
|
|
executor.submit(
|
|
backfill.process_issue,
|
|
key,
|
|
args.skip_existing
|
|
): key
|
|
for key in issue_keys
|
|
}
|
|
|
|
for future in as_completed(futures):
|
|
issue_key = futures[future]
|
|
processed += 1
|
|
|
|
try:
|
|
success = future.result()
|
|
except Exception as e:
|
|
logger.error(f"Error processing {issue_key}: {e}")
|
|
backfill.stats["failed"] += 1
|
|
|
|
if processed % 10 == 0:
|
|
logger.info(
|
|
f"Progress: {processed}/{len(issue_keys)} - "
|
|
f"downloaded: {backfill.stats['downloaded']}, "
|
|
f"skipped: {backfill.stats['skipped']}, "
|
|
f"failed: {backfill.stats['failed']}"
|
|
)
|
|
|
|
elapsed = time.time() - start_time
|
|
|
|
# Summary for targeted mode
|
|
logger.info("=" * 60)
|
|
logger.info("Targeted backfill completed!")
|
|
logger.info(f"Total issues: {len(issue_keys)}")
|
|
logger.info(f"Downloaded: {backfill.stats['downloaded']}")
|
|
logger.info(f"Skipped (existing): {backfill.stats['skipped']}")
|
|
logger.info(f"Failed: {backfill.stats['failed']}")
|
|
logger.info(f"Attachments: {backfill.stats['attachments']}")
|
|
logger.info(f"Time: {elapsed:.1f}s")
|
|
logger.info("=" * 60)
|
|
|
|
stats = backfill.stats
|
|
|
|
# Standard JQL search mode
|
|
else:
|
|
stats = backfill.run(
|
|
jql=args.jql,
|
|
skip_existing=args.skip_existing,
|
|
dry_run=args.dry_run,
|
|
parallel=args.parallel,
|
|
)
|
|
|
|
# Exit with error if any failed
|
|
if stats.get("failed", 0) > 0:
|
|
sys.exit(1)
|
|
|
|
except Exception as e:
|
|
logger.error(f"Backfill failed: {e}")
|
|
sys.exit(1)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|