- Delete root auth/ directory (legacy Flask providers, orphaned)
- Clean requirements.txt: remove Flask, gunicorn, authlib, sendgrid, anthropic, openai, argon2-cffi (9 unused deps)
- Fix hash computation in orchestrator: MD5 of parquet mtime+size (CLI sync now skips unchanged tables correctly)
- Migrate pip → uv in CLAUDE.md, scripts/init.sh, pyproject.toml
- Sync pyproject.toml dependencies with requirements.txt

578 tests passing.
185 lines
6.6 KiB
Python
185 lines
6.6 KiB
Python
"""Sync orchestrator — ATTACHes extract.duckdb files into master analytics.duckdb."""
|
|
|
|
import hashlib
|
|
import logging
|
|
import os
|
|
import threading
|
|
from pathlib import Path
|
|
from typing import Dict, List
|
|
|
|
import duckdb
|
|
|
|
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
|
|
|
|
# Lock shared by every SyncOrchestrator instance in this process: both
# rebuild() and rebuild_source() hold it for their whole run, so only one
# rebuild executes at a time.
_rebuild_lock = threading.Lock()
|
|
|
|
|
|
def _get_extracts_dir() -> Path:
    """Return the ``extracts`` directory below the configured data root.

    The data root is read from the ``DATA_DIR`` environment variable on every
    call and falls back to ``./data`` when the variable is unset.
    """
    data_root = os.environ.get("DATA_DIR", "./data")
    return Path(data_root, "extracts")
|
|
|
|
|
|
class SyncOrchestrator:
|
|
"""Scans /data/extracts/*, ATTACHes each extract.duckdb, creates master views."""
|
|
|
|
def __init__(self, analytics_db_path: str | None = None):
|
|
# analytics_db_path allows override for testing
|
|
if analytics_db_path:
|
|
self._db_path = analytics_db_path
|
|
else:
|
|
data_dir = Path(os.environ.get("DATA_DIR", "./data"))
|
|
self._db_path = str(data_dir / "analytics" / "server.duckdb")
|
|
Path(self._db_path).parent.mkdir(parents=True, exist_ok=True)
|
|
|
|
def rebuild(self) -> Dict[str, List[str]]:
|
|
"""Scan all extract directories, ATTACH each, create master views.
|
|
|
|
Returns: {source_name: [table_names]} for logging.
|
|
"""
|
|
with _rebuild_lock:
|
|
return self._do_rebuild()
|
|
|
|
def rebuild_source(self, source_name: str) -> List[str]:
|
|
"""Rebuild views from a single source (e.g. after Jira webhook)."""
|
|
with _rebuild_lock:
|
|
return self._do_rebuild_source(source_name)
|
|
|
|
def _do_rebuild(self) -> Dict[str, List[str]]:
|
|
extracts_dir = _get_extracts_dir()
|
|
if not extracts_dir.exists():
|
|
logger.warning("Extracts directory %s does not exist", extracts_dir)
|
|
return {}
|
|
|
|
result = {}
|
|
# Write to temp file then rename — avoids lock conflict with query endpoint
|
|
tmp_path = self._db_path + ".tmp"
|
|
if Path(tmp_path).exists():
|
|
Path(tmp_path).unlink()
|
|
conn = duckdb.connect(tmp_path)
|
|
try:
|
|
# Detach any previously attached databases (except main and temp)
|
|
attached = [
|
|
row[0]
|
|
for row in conn.execute(
|
|
"SELECT database_name FROM duckdb_databases() "
|
|
"WHERE database_name NOT IN ('memory', 'system', 'temp')"
|
|
).fetchall()
|
|
]
|
|
for db_name in attached:
|
|
if db_name != Path(self._db_path).stem:
|
|
try:
|
|
conn.execute(f"DETACH {db_name}")
|
|
except Exception:
|
|
pass
|
|
|
|
for ext_dir in sorted(extracts_dir.iterdir()):
|
|
if not ext_dir.is_dir():
|
|
continue
|
|
db_file = ext_dir / "extract.duckdb"
|
|
if not db_file.exists():
|
|
logger.debug("Skipping %s — no extract.duckdb", ext_dir.name)
|
|
continue
|
|
|
|
tables = self._attach_and_create_views(
|
|
conn, ext_dir.name, str(db_file)
|
|
)
|
|
if tables:
|
|
result[ext_dir.name] = tables
|
|
logger.info("Attached %s: %d tables", ext_dir.name, len(tables))
|
|
finally:
|
|
conn.close()
|
|
|
|
# Atomic swap: replace analytics.duckdb with new version
|
|
import shutil
|
|
if Path(tmp_path).exists():
|
|
shutil.move(tmp_path, self._db_path)
|
|
|
|
return result
|
|
|
|
def _do_rebuild_source(self, source_name: str) -> List[str]:
|
|
extracts_dir = _get_extracts_dir()
|
|
db_file = extracts_dir / source_name / "extract.duckdb"
|
|
if not db_file.exists():
|
|
logger.warning("No extract.duckdb for source %s", source_name)
|
|
return []
|
|
|
|
tmp_path = self._db_path + ".tmp"
|
|
if Path(tmp_path).exists():
|
|
Path(tmp_path).unlink()
|
|
conn = duckdb.connect(tmp_path)
|
|
try:
|
|
# Detach if already attached
|
|
try:
|
|
conn.execute(f"DETACH {source_name}")
|
|
except Exception:
|
|
pass
|
|
tables = self._attach_and_create_views(conn, source_name, str(db_file))
|
|
finally:
|
|
conn.close()
|
|
|
|
import shutil
|
|
if Path(tmp_path).exists():
|
|
shutil.move(tmp_path, self._db_path)
|
|
|
|
return tables
|
|
|
|
def _attach_and_create_views(
|
|
self, conn: duckdb.DuckDBPyConnection, source_name: str, db_path: str
|
|
) -> List[str]:
|
|
"""ATTACH extract.duckdb, read _meta, create views in master."""
|
|
tables = []
|
|
try:
|
|
conn.execute(f"ATTACH '{db_path}' AS {source_name} (READ_ONLY)")
|
|
|
|
# Read _meta to know what's available
|
|
meta_rows = conn.execute(
|
|
f"SELECT table_name, rows, size_bytes, query_mode "
|
|
f"FROM {source_name}._meta"
|
|
).fetchall()
|
|
|
|
for table_name, rows, size_bytes, query_mode in meta_rows:
|
|
conn.execute(
|
|
f"CREATE OR REPLACE VIEW \"{table_name}\" AS "
|
|
f"SELECT * FROM {source_name}.\"{table_name}\""
|
|
)
|
|
tables.append(table_name)
|
|
|
|
# Update sync_state in system DB
|
|
self._update_sync_state(meta_rows, source_name)
|
|
|
|
except Exception as e:
|
|
logger.error("Failed to attach %s: %s", source_name, e)
|
|
|
|
return tables
|
|
|
|
def _update_sync_state(self, meta_rows: list, source_name: str) -> None:
|
|
"""Update sync_state table in system.duckdb from _meta entries."""
|
|
try:
|
|
from src.db import get_system_db
|
|
from src.repositories.sync_state import SyncStateRepository
|
|
|
|
extracts_dir = _get_extracts_dir()
|
|
sys_conn = get_system_db()
|
|
try:
|
|
repo = SyncStateRepository(sys_conn)
|
|
for table_name, rows, size_bytes, query_mode in meta_rows:
|
|
# Compute hash from parquet file stats (fast, no file read)
|
|
pq_path = extracts_dir / source_name / "data" / f"{table_name}.parquet"
|
|
if pq_path.exists():
|
|
stat = pq_path.stat()
|
|
file_hash = hashlib.md5(
|
|
f"{stat.st_mtime_ns}:{stat.st_size}".encode()
|
|
).hexdigest()[:12]
|
|
else:
|
|
file_hash = ""
|
|
|
|
repo.update_sync(
|
|
table_id=table_name,
|
|
rows=rows or 0,
|
|
file_size_bytes=size_bytes or 0,
|
|
hash=file_hash,
|
|
)
|
|
finally:
|
|
sys_conn.close()
|
|
except Exception as e:
|
|
logger.warning("Could not update sync_state: %s", e)
|