Phase 5 cleanup: remove all code replaced by extract.duckdb architecture. Deleted modules: - src/config.py (653) — replaced by DuckDB table_registry - src/parquet_manager.py (755) — replaced by DuckDB COPY TO - src/data_sync.py (734) — replaced by SyncOrchestrator - src/remote_query.py (636) — replaced by DuckDB BigQuery ATTACH - src/table_registry.py (464) — replaced by DuckDB repository - connectors/keboola/adapter.py (820) — replaced by extractor.py - connectors/bigquery/adapter.py (665) — replaced by extractor.py - connectors/bigquery/client.py (644) — replaced by DuckDB BQ extension Updated all imports in webapp, catalog_export, enricher, router, sync_settings_service, generate_sample_data. Kept keboola/client.py as fallback (removed src.config dependency). 704 tests passing.
401 lines
14 KiB
Python
401 lines
14 KiB
Python
"""
|
|
Sync settings service for the webapp.
|
|
|
|
Reads/writes shared JSON files in /data/notifications/ to manage
|
|
user sync settings (which datasets to sync).
|
|
"""
|
|
|
|
import json
|
|
import logging
|
|
import os
|
|
import subprocess
|
|
import tempfile
|
|
import time
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
|
|
|
# Directory holding the shared JSON files this service reads and writes.
NOTIFICATIONS_DIR = "/data/notifications"
|
|
# JSON file with the sync settings of all users; the top-level keys are usernames.
SYNC_SETTINGS_FILE = os.path.join(NOTIFICATIONS_DIR, "sync_settings.json")
|
|
|
|
def _load_dataset_config():
    """Load the optional-dataset configuration from the instance config.

    Returns:
        A ``(defaults, datasets)`` tuple. ``datasets`` is the value stored
        under the ``"datasets"`` key of the instance config and ``defaults``
        maps each of its keys to ``False``. Both are empty dicts when the
        config cannot be loaded or defines no datasets.
    """
    try:
        # Imported inside the try block so that an unavailable config package
        # falls through to the empty fallback below.
        from config.loader import load_instance_config, get_instance_value

        config = load_instance_config()
        datasets = get_instance_value(config, "datasets", default={})
        if datasets:
            defaults = {name: False for name in datasets}
            return defaults, datasets
    except Exception as exc:
        # Still best-effort, but record why the fallback was taken instead of
        # swallowing the error silently.
        logger.warning(f"Could not load dataset config, no optional datasets available: {exc}")

    # Fallback: empty (no optional datasets)
    return {}, {}
|
|
|
|
|
|
# Evaluated once at import time. DEFAULT_SETTINGS maps every configured dataset
# name to False; DATASET_INFO is the "datasets" section of the instance config.
# Both are empty dicts when no dataset configuration could be loaded.
DEFAULT_SETTINGS, DATASET_INFO = _load_dataset_config()
|
|
|
|
|
|
def _read_json(path: str) -> dict:
    """Read a JSON object from ``path``.

    Args:
        path: Location of the JSON file.

    Returns:
        The decoded top-level JSON object. An empty dict is returned when the
        file does not exist, cannot be decoded, or its top-level value is not
        a JSON object (callers rely on dict methods such as ``.get``).
    """
    try:
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError, UnicodeDecodeError):
        return {}

    # A syntactically valid file may still hold a list/string/number; treat it
    # like any other invalid settings file.
    return data if isinstance(data, dict) else {}
|
|
|
|
|
|
def _write_json(path: str, data: dict) -> None:
    """Write ``data`` as JSON to ``path`` atomically.

    The content is first written to a temporary file in the destination
    directory and then moved over ``path`` with ``os.replace``, so readers
    never observe a partially written file.

    Args:
        path: Destination file. Missing parent directories are created.
        data: JSON-serializable content.

    Raises:
        Whatever the underlying write raises (e.g. ``OSError``, or
        ``TypeError`` for non-serializable data). The temporary file is
        removed before the error propagates.
    """
    # A bare filename has an empty dirname; fall back to the current directory
    # because os.makedirs("") raises.
    dir_path = os.path.dirname(path) or "."
    os.makedirs(dir_path, exist_ok=True)

    fd, tmp_path = tempfile.mkstemp(dir=dir_path, suffix=".tmp")
    try:
        with os.fdopen(fd, "w") as f:
            json.dump(data, f, indent=2)
        os.chmod(tmp_path, 0o660)  # owner/group read-write, for data-ops
        os.replace(tmp_path, path)
    except BaseException:
        # Clean up on any failure, including interrupts, without letting a
        # cleanup error hide the original exception.
        try:
            os.unlink(tmp_path)
        except OSError:
            pass
        raise
|
|
|
|
|
|
def get_sync_settings(username: str) -> dict[str, Any]:
    """Return the dataset sync settings of ``username``.

    The result contains:
        - datasets: {name: enabled}, the defaults overlaid with the user's
          stored choices
        - metadata: the dataset info loaded from the instance config
        - updated_at: the stored timestamp of the last update, or None
    """
    stored = _read_json(SYNC_SETTINGS_FILE).get(username, {})

    # Start from a copy of the defaults so the module-level dict is never
    # mutated, then apply whatever the user has saved.
    merged = DEFAULT_SETTINGS.copy()
    merged.update(stored.get("datasets", {}))

    result = {}
    result["datasets"] = merged
    result["metadata"] = DATASET_INFO
    result["updated_at"] = stored.get("updated_at")
    return result
|
|
|
|
|
|
def update_sync_settings(username: str, settings: dict) -> tuple[bool, str]:
    """Update the dataset sync settings of a user.

    Args:
        username: The username to update settings for
        settings: Dict with dataset names as keys and bool enabled as values.
            May be partial; datasets not mentioned keep their stored value.

    Returns:
        (success, message) tuple. ``success`` is False when a dataset name is
        unknown, a value is not a bool, or a dataset is enabled without the
        dataset it requires. A failure to regenerate the user's config files
        is only logged, because the settings themselves were saved.
    """
    # Validate settings
    for key, value in settings.items():
        if key not in DEFAULT_SETTINGS:
            return False, f"Unknown dataset: {key}"
        if not isinstance(value, bool):
            return False, f"Invalid value for {key}: must be boolean"

    all_settings = _read_json(SYNC_SETTINGS_FILE)

    # Tolerate a missing or malformed record for this user.
    user_record = all_settings.get(username)
    if not isinstance(user_record, dict):
        user_record = {}
    stored = user_record.get("datasets")
    if not isinstance(stored, dict):
        stored = {}

    # Defaults < stored values < this update. Layering the defaults first
    # keeps the saved map complete (matching what get_sync_settings reports),
    # and building a new dict avoids mutating the loaded data before
    # validation has passed.
    merged = {**DEFAULT_SETTINGS, **stored, **settings}

    # Validate dependencies on merged state (from instance config)
    for key, info in DATASET_INFO.items():
        requires = info.get("requires") if isinstance(info, dict) else None
        if requires and merged.get(key) and not merged.get(requires):
            return False, f"{key} requires {requires} to be enabled"

    table_mode = user_record.get("table_mode", "all")
    table_settings = user_record.get("tables", {})

    # Keep every other key of the record (as update_table_subscriptions does)
    # and overwrite only the fields managed here.
    all_settings[username] = {
        **user_record,
        "datasets": merged,
        "table_mode": table_mode,
        "tables": table_settings,
        "updated_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
    }

    # Write back
    _write_json(SYNC_SETTINGS_FILE, all_settings)

    # Regenerate user's config file (with table settings)
    success = _regenerate_user_config(username, merged, table_mode, table_settings)
    if not success:
        # Don't fail - settings are saved, just config generation failed
        logger.warning(f"Failed to regenerate config for {username}")

    logger.info(f"Updated sync settings for '{username}': {merged}")
    return True, "Settings saved. Changes take effect on next sync."
|
|
|
|
|
|
def _regenerate_user_config(username: str, settings: dict, table_mode: str = "all", table_settings: dict | None = None) -> bool:
    """Regenerate ~/.sync_settings.yaml and ~/.sync_rsync_filter for a user on the server.

    Args:
        username: Owner of the home directory the files are installed into.
        settings: Dataset names mapped to their enabled flag.
        table_mode: "all" or "explicit".
        table_settings: Table names mapped to their subscription flag.

    Returns:
        True when the YAML config was installed, False on failure. A failure
        to write the rsync filter is only logged and does not change the
        result.
    """
    # Generate YAML content
    yaml_content = generate_user_config_yaml(settings, table_mode, table_settings)

    # Write to user's home directory on server
    user_config_path = f"/home/{username}/.sync_settings.yaml"

    tmp_path = None
    try:
        # Use sudo to write as the target user
        # This requires webapp user to have sudoers entry for this specific operation
        # IMPORTANT: Must use /tmp/ explicitly - sudoers rule only allows /tmp/*.yaml
        with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False, dir="/tmp") as f:
            # Remember the name before writing so a failed write is cleaned up too.
            tmp_path = f.name
            f.write(yaml_content)

        # Copy to user's home with correct ownership
        result = subprocess.run(
            ["/usr/bin/sudo", "-n", "/usr/bin/install", "-o", username, "-g", username, "-m", "644", tmp_path, user_config_path],
            capture_output=True,
            text=True,
            timeout=10,
        )

        if result.returncode != 0:
            logger.error(f"Failed to install config for {username}: {result.stderr}")
            return False

        # Generate and write rsync filter file
        filter_ok = _write_rsync_filter(username, settings, table_mode, table_settings or {})
        if not filter_ok:
            # Don't fail overall - YAML config was written successfully
            logger.warning(f"Failed to write rsync filter for {username}")

        return True

    except subprocess.TimeoutExpired:
        logger.error(f"Timeout installing config for {username}")
        return False
    except Exception as e:
        logger.error(f"Error installing config for {username}: {e}")
        return False
    finally:
        # Always remove the temp file, including on timeout or error, so
        # failed installs do not accumulate files in /tmp.
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError:
                pass
|
|
|
|
|
|
def _write_rsync_filter(username: str, dataset_settings: dict, table_mode: str, table_settings: dict) -> bool:
    """Write ~/.sync_rsync_filter for a user on the server.

    Args:
        username: Owner of the home directory the filter is installed into.
        dataset_settings: Dataset names mapped to their enabled flag.
        table_mode: "all" or "explicit".
        table_settings: Table names mapped to their subscription flag.

    Returns:
        True on success, False on failure.
    """
    # Load folder_mapping from table registry (or instance config as fallback)
    folder_mapping = {}
    try:
        from src.db import get_system_db
        from src.repositories.table_registry import TableRegistryRepository

        conn = get_system_db()
        repo = TableRegistryRepository(conn)
        tables = repo.list_all()
        folder_mapping = {t["bucket"]: t["folder"] for t in tables if t.get("bucket") and t.get("folder")}
    except Exception as registry_error:
        # Best-effort lookup: note why the registry was skipped, then try the
        # instance config.
        logger.debug(f"Table registry unavailable, falling back to instance config: {registry_error}")
        try:
            from config.loader import load_instance_config, get_instance_value

            config = load_instance_config()
            folder_mapping = get_instance_value(config, "folder_mapping", default={})
        except Exception as config_error:
            # Neither source is available; continue with an empty mapping.
            logger.debug(f"No folder mapping available from instance config: {config_error}")

    # Generate filter content
    filter_content = generate_rsync_filter(dataset_settings, table_mode, table_settings, folder_mapping)

    user_filter_path = f"/home/{username}/.sync_rsync_filter"

    tmp_path = None
    try:
        # Write filter to temp file, then install to user's home
        # IMPORTANT: Must use /tmp/ explicitly - sudoers rule restrictions
        with tempfile.NamedTemporaryFile(mode="w", suffix=".filter", delete=False, dir="/tmp") as f:
            # Remember the name before writing so a failed write is cleaned up too.
            tmp_path = f.name
            f.write(filter_content)

        result = subprocess.run(
            ["/usr/bin/sudo", "-n", "/usr/bin/install", "-o", username, "-g", username, "-m", "644", tmp_path, user_filter_path],
            capture_output=True,
            text=True,
            timeout=10,
        )

        if result.returncode != 0:
            logger.error(f"Failed to install rsync filter for {username}: {result.stderr}")
            return False

        return True

    except subprocess.TimeoutExpired:
        logger.error(f"Timeout installing rsync filter for {username}")
        return False
    except Exception as e:
        logger.error(f"Error installing rsync filter for {username}: {e}")
        return False
    finally:
        # Always remove the temp file, including on timeout or error, so
        # failed installs do not accumulate files in /tmp.
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError:
                pass
|
|
|
|
|
|
def get_table_subscriptions(username: str) -> dict:
    """Return the per-table subscription settings stored for ``username``.

    Returns:
        {"table_mode": "all"|"explicit", "tables": {"name": bool, ...}}
        ``table_mode`` falls back to "all" and ``tables`` to an empty dict
        when nothing is stored for the user.
    """
    record = _read_json(SYNC_SETTINGS_FILE).get(username, {})
    mode = record.get("table_mode", "all")
    tables = record.get("tables", {})
    return {"table_mode": mode, "tables": tables}
|
|
|
|
|
|
def update_table_subscriptions(username: str, table_mode: str, table_settings: dict) -> tuple[bool, str]:
    """Update per-table subscriptions for a user.

    Args:
        username: The username
        table_mode: "all" or "explicit"
        table_settings: Dict with table names as keys and bool as values

    Returns:
        (success, message). ``success`` is False when ``table_mode`` is not
        one of the two allowed values, a table name is not a non-empty
        printable string, or a value is not a bool. A failure to regenerate
        the user's config files is only logged, because the settings
        themselves were saved.
    """
    # Validate table_mode
    if table_mode not in ("all", "explicit"):
        return False, f"Invalid table_mode: {table_mode}. Must be 'all' or 'explicit'"

    # Validate table_settings keys and values
    for key, value in table_settings.items():
        # Table names are written verbatim into the line-based YAML config and
        # rsync filter files, so a name containing a newline or another
        # control character could inject extra lines there.
        if not isinstance(key, str) or not key or not key.isprintable():
            return False, f"Invalid table name: {key!r}"
        if not isinstance(value, bool):
            return False, f"Invalid value for table '{key}': must be boolean"

    # Read current settings and update in place, keeping the user's other keys
    all_settings = _read_json(SYNC_SETTINGS_FILE)
    user_record = all_settings.setdefault(username, {})
    user_record["table_mode"] = table_mode
    user_record["tables"] = table_settings
    user_record["updated_at"] = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())

    # Write back
    _write_json(SYNC_SETTINGS_FILE, all_settings)

    # Regenerate user's config file (with dataset + table settings)
    dataset_settings = user_record.get("datasets", dict(DEFAULT_SETTINGS))
    success = _regenerate_user_config(username, dataset_settings, table_mode, table_settings)
    if not success:
        logger.warning(f"Failed to regenerate config for {username}")

    logger.info(f"Updated table subscriptions for '{username}': mode={table_mode}, tables={table_settings}")
    return True, "Table subscriptions saved. Changes take effect on next sync."
|
|
|
|
|
|
def generate_rsync_filter(dataset_settings: dict, table_mode: str, table_settings: dict, folder_mapping: dict) -> str:
    """Generate rsync filter file content for per-table sync.

    Args:
        dataset_settings: {"jira": True, ...}. Accepted for interface
            compatibility; the generated rules do not depend on it.
        table_mode: "all" or "explicit"
        table_settings: {"company": True, "events": False, ...}
        folder_mapping: {"in.c-crm": "crm", ...} from registry/config.
            Accepted for interface compatibility; the rules match tables in
            any folder via ``**/`` and do not depend on it.

    Returns:
        Rsync filter file content string. For ``table_mode == "all"`` it holds
        only comments. Otherwise it holds include rules for subscribed tables,
        exclude rules for unsubscribed ones, then rules that keep directories
        and drop every other file.
    """
    if table_mode == "all":
        # No filtering needed - include everything
        lines = [
            "# AUTO-GENERATED rsync filter for per-table sync",
            "# table_mode: all",
            "",
            "# No filtering - all tables included",
        ]
        return "\n".join(lines) + "\n"

    lines = [
        "# AUTO-GENERATED rsync filter for per-table sync",
        "# table_mode: explicit",
        "",
    ]

    subscribed = sorted(name for name, enabled in table_settings.items() if enabled)
    unsubscribed = sorted(name for name, enabled in table_settings.items() if not enabled)

    # Each table is matched both as a single parquet file and as a
    # partitioned directory. Includes are listed before excludes.
    for title, sign, names in (
        ("# Subscribed tables", "+", subscribed),
        ("# Excluded tables", "-", unsubscribed),
    ):
        if not names:
            continue
        lines.append(title)
        for name in names:
            lines.append(f"{sign} **/{name}.parquet")
            lines.append(f"{sign} **/{name}/***")
        lines.append("")

    # Include folder structure but exclude unknown files
    lines.extend(["# Include folder structure", "+ */", "- *", ""])

    return "\n".join(lines)
|
|
|
|
|
|
def generate_user_config_yaml(settings: dict, table_mode: str = "all", table_settings: dict | None = None) -> str:
|
|
"""Generate YAML content for sync config.
|
|
|
|
Args:
|
|
settings: Dict with dataset names and enabled status
|
|
table_mode: "all" or "explicit" (default "all")
|
|
table_settings: Dict with table names and subscription status (optional)
|
|
|
|
Returns:
|
|
YAML string content
|
|
"""
|
|
lines = [
|
|
"# Data Analyst - Sync Configuration",
|
|
"# Managed by web portal - changes here may be overwritten",
|
|
"",
|
|
"datasets:",
|
|
]
|
|
|
|
for dataset, enabled in sorted(settings.items()):
|
|
value = "true" if enabled else "false"
|
|
lines.append(f" {dataset}: {value}")
|
|
|
|
lines.append("")
|
|
|
|
# Per-table subscriptions
|
|
lines.append(f"table_mode: {table_mode}")
|
|
|
|
if table_settings:
|
|
lines.append("tables:")
|
|
for table_name, subscribed in sorted(table_settings.items()):
|
|
value = "true" if subscribed else "false"
|
|
lines.append(f" {table_name}: {value}")
|
|
else:
|
|
lines.append("tables: {}")
|
|
|
|
lines.append("")
|
|
return "\n".join(lines)
|