agnes-the-ai-analyst/src/data_sync.py
Petr 80c5b902e0 Add scheduled data sync and catalog refresh with systemd timers
- New sync_schedule and profile_after_sync fields in TableConfig
  (formats: "every 15m", "every 1h", "daily 05:00")
- New src/scheduler.py with schedule evaluation logic (is_table_due)
- New --scheduled mode in data_sync.py: only syncs tables that are due,
  respects profile_after_sync flag, auto-restarts webapp after profiling
- Systemd timer+service for data-refresh (every 15 min)
- Systemd timer+service for catalog-refresh (every 15 min)
- deploy.sh enables new timers automatically
- Complete table config reference in data_description.md.example
- 58 new scheduler tests
2026-03-15 02:16:31 +01:00

734 lines
25 KiB
Python

"""
Data Synchronization Manager
Orchestrates data synchronization from configured sources to local Parquet files.
Main functions:
1. Tracking sync state (when was last synchronization)
2. DataSource ABC for pluggable connectors
3. Sync single table or all tables at once
4. Progress tracking and error handling
5. Schema generation from synced Parquet files
Sync State:
- Stored in data/metadata/sync_state.json
- Contains timestamp of last synchronization for each table
- Used for incremental sync (changedSince parameter)
"""
import importlib
import json
import logging
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Dict, List, Optional, Any
from datetime import datetime
from tqdm import tqdm
from .config import get_config, TableConfig
from config.loader import load_instance_config
from connectors.openmetadata.enricher import CatalogEnricher
logger = logging.getLogger(__name__)
class SyncState:
    """
    Synchronization state management.

    Stores and loads information about the last synchronization of each
    table. The state is a JSON object keyed by table ID; it is read once
    on construction and rewritten to disk after every update.
    """

    def __init__(self, state_file: Path):
        """
        Args:
            state_file: Path to JSON file with sync state
        """
        self.state_file = state_file
        self.state: Dict[str, Any] = self._load_state()

    def _load_state(self) -> Dict[str, Any]:
        """
        Load sync state from disk.

        Returns:
            Dictionary with sync state. Empty when the file is missing,
            cannot be read or parsed, or does not hold a JSON object.
        """
        if not self.state_file.exists():
            return {}
        try:
            with open(self.state_file, "r", encoding="utf-8") as f:
                loaded = json.load(f)
        except Exception as e:
            logger.error(f"Error loading sync state: {e}")
            return {}
        # Valid JSON that is not an object (e.g. a list or null) would make
        # every later ``self.state.get(...)`` call fail, so reject it here.
        if not isinstance(loaded, dict):
            logger.error(
                f"Error loading sync state: expected a JSON object, "
                f"got {type(loaded).__name__}"
            )
            return {}
        return loaded

    def _save_state(self):
        """
        Save sync state to disk.

        Creates the parent directory if needed. The data is written to a
        sibling temporary file first and then moved over the real file, so
        an interrupted write cannot leave a truncated state file behind.
        Failures are logged, not raised (saving is best-effort).
        """
        tmp_file = self.state_file.with_name(self.state_file.name + ".tmp")
        try:
            self.state_file.parent.mkdir(parents=True, exist_ok=True)
            with open(tmp_file, "w", encoding="utf-8") as f:
                json.dump(self.state, f, indent=2, default=str)
            tmp_file.replace(self.state_file)
        except Exception as e:
            logger.error(f"Error saving sync state: {e}")
            # Do not leave a half-written temporary file lying around.
            try:
                tmp_file.unlink()
            except OSError:
                pass

    def get_last_sync(self, table_id: str) -> Optional[str]:
        """
        Get timestamp of last synchronization for given table.

        Args:
            table_id: Table identifier

        Returns:
            ISO timestamp string, or None if not synced yet
        """
        table_state = self.state.get(table_id, {})
        return table_state.get("last_sync")

    def get_table_state(self, table_id: str) -> Dict[str, Any]:
        """
        Get complete sync state for a table.

        Args:
            table_id: Table identifier

        Returns:
            Dictionary with table sync state (empty if the table is unknown)
        """
        return self.state.get(table_id, {})

    def update_sync(
        self,
        table_id: str,
        table_name: str,
        strategy: str,
        rows: int,
        file_size_bytes: int,
        columns: int = 0,
        uncompressed_bytes: int = 0,
    ):
        """
        Update synchronization state for a table and persist it.

        Args:
            table_id: Table identifier
            table_name: Human-readable table name
            strategy: Sync strategy used
            rows: Number of rows synced
            file_size_bytes: Size of Parquet file in bytes
            columns: Number of columns
            uncompressed_bytes: Uncompressed data size
        """
        self.state[table_id] = {
            "table_name": table_name,
            # Naive local timestamp, kept as-is.
            # NOTE(review): presumably the scheduler compares this against
            # local wall-clock time - confirm before switching to UTC.
            "last_sync": datetime.now().isoformat(),
            "strategy": strategy,
            "rows": rows,
            "columns": columns,
            # Sizes are stored in MiB, rounded to two decimals.
            "file_size_mb": round(file_size_bytes / 1024 / 1024, 2),
            "uncompressed_mb": round(uncompressed_bytes / 1024 / 1024, 2),
        }
        self._save_state()
class DataSource(ABC):
    """
    Base interface that every data connector implements.

    Only sync_table() is mandatory. The other hooks come with neutral
    defaults, so a connector overrides just the ones it supports.
    See connectors/keboola/ for a reference implementation.
    """

    @abstractmethod
    def sync_table(
        self,
        table_config: TableConfig,
        sync_state: SyncState,
    ) -> Dict[str, Any]:
        """
        Bring one table up to date.

        Args:
            table_config: Configuration of the table to sync
            sync_state: Sync state manager shared across tables

        Returns:
            Dictionary describing the outcome of the sync
        """

    def get_source_name(self) -> str:
        """Human-readable name of this source, used in schema comments."""
        return "Unknown"

    def discover_tables(self) -> List[Dict[str, Any]]:
        """Enumerate the tables this source can provide.

        Each entry is a dict carrying at least:
            id, name, bucket_id, columns, row_count, size_bytes,
            primary_key, last_change

        The base implementation reports nothing, meaning the source
        does not support discovery.
        """
        no_tables: List[Dict[str, Any]] = []
        return no_tables

    def get_column_metadata(self, table_id: str) -> Optional[Dict[str, Any]]:
        """Provide per-column metadata used during schema generation.

        Returns:
            {"columns": {"col_name": {"source_type": "...", "description": "..."}}}
            or None when the source has no metadata to offer (the default).
        """
        return None
def _get_uncompressed_size(parquet_path: Path) -> int:
    """Add up the uncompressed size of every column chunk in a Parquet file.

    Best-effort helper: any failure while importing pyarrow or reading the
    file metadata results in 0 instead of an exception.
    """
    try:
        import pyarrow.parquet as pq

        metadata = pq.ParquetFile(parquet_path).metadata
        row_groups = (
            metadata.row_group(group_no)
            for group_no in range(metadata.num_row_groups)
        )
        return sum(
            group.column(column_no).total_uncompressed_size
            for group in row_groups
            for column_no in range(group.num_columns)
        )
    except Exception:
        return 0
class DataSyncManager:
    """
    Main data synchronization orchestrator.

    Runs the configured data source for each table, collects the results,
    regenerates the schema YAML files and triggers profiling afterwards.
    """

    def __init__(self):
        """Initialize sync manager from the global configuration."""
        self.config = get_config()
        self.sync_state = SyncState(
            self.config.get_metadata_path() / "sync_state.json"
        )
        self.data_source = create_data_source()
        # Initialize OpenMetadata catalog enricher
        try:
            instance_config = load_instance_config()
            self.catalog_enricher = CatalogEnricher(instance_config)
        except Exception as e:
            logger.warning(f"Failed to initialize catalog enricher: {e}")
            self.catalog_enricher = CatalogEnricher({})  # Disabled enricher

    # ------------------------------------------------------------------
    # Schema generation
    # ------------------------------------------------------------------

    def _read_arrow_schema(self, table_config: "TableConfig"):
        """Return the Arrow schema of a synced table, or None if not synced yet.

        For partitioned tables the Parquet path is treated as a directory of
        ``*.parquet`` files and the schema is taken from the first file in
        sorted order, so the choice is deterministic.
        """
        import pyarrow.parquet as pq

        parquet_path = self.config.get_parquet_path(table_config)
        if not parquet_path.exists():
            return None
        if table_config.partition_by:
            partitions = sorted(parquet_path.glob("*.parquet"))
            if not partitions:
                return None
            return pq.ParquetFile(partitions[0]).schema_arrow
        return pq.ParquetFile(parquet_path).schema_arrow

    @staticmethod
    def _build_columns(arrow_schema, col_metadata, catalog_data) -> List[Dict[str, Any]]:
        """Describe every column: name, Arrow type, description, source type.

        Description priority: catalog > connector metadata > (nothing).
        A catalog entry with an empty description falls back to the
        connector metadata, mirroring the table-level fallback.
        """
        source_columns = (col_metadata or {}).get("columns") or {}
        catalog_columns = catalog_data.columns if catalog_data else {}

        columns = []
        for field_item in arrow_schema:
            col_name = field_item.name
            column_info = {
                "name": col_name,
                "type": str(field_item.type),
            }
            col_meta = source_columns.get(col_name, {})

            description = None
            # Catalog columns are looked up by lower-cased name.
            catalog_key = col_name.lower()
            if catalog_key in catalog_columns:
                description = catalog_columns[catalog_key].description
            if not description:
                description = col_meta.get("description")
            if description:
                column_info["description"] = description

            # Add source type from connector metadata
            if "source_type" in col_meta:
                column_info["source_type"] = col_meta["source_type"]
            columns.append(column_info)
        return columns

    def _build_table_info(self, table_config: "TableConfig", arrow_schema) -> Dict[str, Any]:
        """Assemble the schema.yml entry for one table."""
        # Get column metadata from data source (if supported)
        col_metadata = self.data_source.get_column_metadata(table_config.id)
        # Enrich with catalog metadata (OpenMetadata)
        catalog_data = self.catalog_enricher.enrich_table(table_config)

        columns = self._build_columns(arrow_schema, col_metadata, catalog_data)

        # Priority for table description: catalog > data_description.md
        table_description = table_config.description
        if catalog_data:
            table_description = catalog_data.description or table_description

        table_info = {
            "table_id": table_config.id,
            "description": table_description,
            "primary_key": table_config.get_primary_key_columns(),
            "sync_strategy": table_config.sync_strategy,
            "columns": columns,
        }
        if table_config.partition_by:
            table_info["partitioned_by"] = table_config.partition_by
        return table_info

    @staticmethod
    def _write_schema_file(filepath, tables, source_name, generated_at, note=""):
        """Write one schema YAML file with its header comments.

        The file is written as UTF-8 explicitly because the YAML is dumped
        with ``allow_unicode=True`` and may therefore contain non-ASCII text.
        """
        import yaml

        filepath.parent.mkdir(parents=True, exist_ok=True)
        data = {
            "_metadata": {
                "_schema_version": 2,
                "generated_at": generated_at,
                "note": "AUTO-GENERATED - DO NOT EDIT.",
                "source": source_name,
                "generator": "src/data_sync.py::DataSyncManager._generate_schema_yaml()",
            },
            "tables": tables,
        }
        header = [
            "# AUTO-GENERATED - DO NOT EDIT\n",
            "# This file is automatically generated during data sync\n",
            f"# Generated: {generated_at}\n",
        ]
        if note:
            header.append(f"# {note}\n")
        header += [
            "#\n",
            "# Contains actual table schemas from synced Parquet files:\n",
            "# - Column names and PyArrow types (from Parquet)\n",
            f"# - Source types and descriptions (from {source_name})\n",
            "# - Primary keys and sync strategies\n",
            "#\n",
            "# For architectural documentation and relationships, see data_description.md\n",
            "\n",
        ]
        with open(filepath, "w", encoding="utf-8") as f:
            f.writelines(header)
            yaml.dump(data, f, default_flow_style=False, sort_keys=False, allow_unicode=True)

    def _generate_schema_yaml(self):
        """
        Generate schema.yml files with actual table schemas from Parquet files.

        The files are auto-generated and contain:
        - Table names and descriptions
        - Column names, types (from Parquet), and descriptions
          (from the catalog or the source metadata)
        - Primary keys and sync strategies

        Tables without a dataset go to <docs_output_dir>/schema.yml; tables
        with a dataset go to <docs_output_dir>/datasets/<dataset>/schema.yml.
        Tables that are not synced yet are skipped; a failure on one table
        is logged and does not stop the others.

        Returns:
            Path of the core schema.yml file
        """
        source_name = self.data_source.get_source_name()
        logger.info("Generating schema.yml from synced tables...")
        generated_at = datetime.now().isoformat()

        # table name -> (dataset, table_info); the dataset is recorded while
        # processing so no second lookup over the configuration is needed.
        entries = {}
        for table_config in self.config.tables:
            try:
                arrow_schema = self._read_arrow_schema(table_config)
                if arrow_schema is None:
                    logger.debug(f" Skipping {table_config.name} (not synced yet)")
                    continue
                table_info = self._build_table_info(table_config, arrow_schema)
                entries[table_config.name] = (table_config.dataset, table_info)
                logger.debug(f" {table_config.name}: {len(table_info['columns'])} columns")
            except Exception as e:
                logger.warning(f" Error processing {table_config.name}: {e}")

        # Split tables into core (no dataset) and per-dataset groups
        core_tables = {}
        dataset_tables = {}  # {dataset_name: {table_name: table_info}}
        for table_name, (dataset, table_info) in entries.items():
            if dataset:
                dataset_tables.setdefault(dataset, {})[table_name] = table_info
            else:
                core_tables[table_name] = table_info

        # Write core schema.yml
        schema_file = self.config.docs_output_dir / "schema.yml"
        self._write_schema_file(schema_file, core_tables, source_name, generated_at)
        logger.info(f"Core schema YAML: {len(core_tables)} tables -> {schema_file}")

        # Write per-dataset schema files
        for ds_name, ds_tables in dataset_tables.items():
            ds_schema_file = self.config.docs_output_dir / "datasets" / ds_name / "schema.yml"
            self._write_schema_file(
                ds_schema_file, ds_tables, source_name, generated_at,
                note=f"Dataset: {ds_name}",
            )
            logger.info(f"Dataset schema YAML: {len(ds_tables)} tables -> {ds_schema_file}")

        total = len(core_tables) + sum(len(t) for t in dataset_tables.values())
        logger.info(f"Schema generation complete: {total} tables total")
        return schema_file

    # ------------------------------------------------------------------
    # Synchronization
    # ------------------------------------------------------------------

    def _sync_one(self, table_config: "TableConfig") -> Dict[str, Any]:
        """Sync one table, turning a raised exception into a failure result.

        Keeps a single broken table from aborting a whole sync run.
        """
        try:
            return self.data_source.sync_table(table_config, self.sync_state)
        except Exception as e:
            # The caller reports the error line; the traceback is debug-only.
            logger.debug(f" {table_config.name}: sync raised", exc_info=True)
            return {"success": False, "error": str(e)}

    @staticmethod
    def _format_result(table_config: "TableConfig", result: Dict[str, Any]) -> str:
        """Render a one-line, human-readable summary of a sync result."""
        if result.get("success"):
            return (
                f" {table_config.name}: {result.get('rows') or 0:,} rows, "
                f"{result.get('file_size_mb') or 0.0:.2f} MB"
            )
        return f" {table_config.name}: {result.get('error', 'unknown error')}"

    def _generate_schema_safely(self):
        """Regenerate schema.yml; a failure is logged and never raised."""
        try:
            self._generate_schema_yaml()
        except Exception as e:
            logger.warning(f"Failed to generate schema.yml: {e}")

    def sync_table(self, table_id: str) -> Dict[str, Any]:
        """
        Synchronize single table by ID.

        Args:
            table_id: Table ID to synchronize

        Returns:
            Dictionary with sync result

        Raises:
            ValueError: If the table is not present in the configuration
        """
        table_config = self.config.get_table_config(table_id)
        if not table_config:
            raise ValueError(f"Table {table_id} not found in configuration")
        return self.data_source.sync_table(table_config, self.sync_state)

    def sync_all(self, tables: Optional[List[str]] = None) -> Dict[str, Dict[str, Any]]:
        """
        Synchronize all tables (or subset according to list).

        Remote-only tables (query_mode == "remote") are skipped. A table
        whose sync raises is recorded as failed and the run continues.
        After at least one successful sync the schema files are regenerated
        and the profiler is run.

        Args:
            tables: List of table IDs to synchronize. If None (or empty),
                syncs all. IDs missing from the configuration are skipped
                with a warning.

        Returns:
            Dictionary {table_id: result} with sync results
        """
        if tables:
            table_configs = []
            for tid in tables:
                tc = self.config.get_table_config(tid)
                if tc is None:
                    logger.warning(f"Table {tid} not found in configuration, skipping")
                else:
                    table_configs.append(tc)
        else:
            table_configs = self.config.tables

        # Filter out remote-only tables (no local sync needed)
        remote_skipped = [
            tc for tc in table_configs if tc.query_mode == "remote"
        ]
        table_configs = [
            tc for tc in table_configs if tc.query_mode != "remote"
        ]
        if remote_skipped:
            logger.info(
                f"Skipping {len(remote_skipped)} remote-only tables "
                f"(query via BigQuery): "
                f"{', '.join(tc.name for tc in remote_skipped)}"
            )

        logger.info(f"Synchronizing {len(table_configs)} tables...")
        results = {}
        with tqdm(table_configs, desc="Syncing tables") as pbar:
            for table_config in pbar:
                pbar.set_description(f"Sync: {table_config.name}")
                result = self._sync_one(table_config)
                results[table_config.id] = result
                # pbar.write keeps the message from breaking the progress bar
                pbar.write(self._format_result(table_config, result))

        success_count = sum(1 for r in results.values() if r.get("success"))
        logger.info(
            f"Synchronization completed: {success_count}/{len(results)} tables successful"
        )

        if success_count > 0:
            # Generate schema.yml from synced tables
            self._generate_schema_safely()
            # Auto-profile changed tables
            self._auto_profile(results)
        return results

    def _auto_profile(
        self,
        results: Dict[str, Dict[str, Any]],
        skip_tables: Optional[List[str]] = None,
    ):
        """Run profiler on successfully synced tables.

        Any failure (including a missing profiler module) is logged as a
        warning and never raised.

        Args:
            results: Sync results dict {table_id: result}
            skip_tables: Table IDs to skip profiling for
        """
        skip_set = set(skip_tables or [])
        try:
            changed = []
            for tid, r in results.items():
                if not r.get("success") or tid in skip_set:
                    continue
                table_config = self.config.get_table_config(tid)
                if table_config:
                    changed.append(table_config.name)

            if not changed:
                logger.info("No tables to profile (all skipped or none succeeded)")
                return

            # Imported lazily: only needed when there is something to profile.
            from src.profiler import profile_changed_tables

            result = profile_changed_tables(changed)
            logger.info(
                f"Auto-profiling: {result['success']} profiled, "
                f"{result['errors']} errors, {result['skipped']} skipped"
            )
        except Exception as e:
            logger.warning(f"Auto-profiling failed (non-fatal): {e}")

    def sync_scheduled(self) -> Dict[str, Dict[str, Any]]:
        """Synchronize only tables whose sync_schedule says they are due.

        Evaluates each table's sync_schedule against its last_sync timestamp.
        Only syncs tables that are due. Respects profile_after_sync flag and
        restarts the webapp when the profiler was run.

        Returns:
            Dictionary {table_id: result} with sync results (only for synced tables)
        """
        from src.scheduler import is_table_due

        scheduled_tables = [
            tc for tc in self.config.tables
            if tc.sync_schedule and tc.query_mode != "remote"
        ]
        if not scheduled_tables:
            logger.info("No tables with sync_schedule configured")
            return {}

        # Evaluate which tables are due
        due_tables = []
        for tc in scheduled_tables:
            last_sync = self.sync_state.get_last_sync(tc.id)
            try:
                due = is_table_due(tc.sync_schedule, last_sync)
            except Exception as e:
                # One unparsable schedule must not block every other table.
                logger.error(
                    f"Table {tc.name}: cannot evaluate schedule "
                    f"'{tc.sync_schedule}': {e}"
                )
                continue
            if due:
                due_tables.append(tc)
                logger.info(f"Table {tc.name} is DUE (schedule: {tc.sync_schedule})")
            else:
                logger.debug(f"Table {tc.name} is not due (schedule: {tc.sync_schedule})")

        if not due_tables:
            logger.info(
                f"Checked {len(scheduled_tables)} scheduled tables, none are due"
            )
            return {}

        logger.info(
            f"Syncing {len(due_tables)}/{len(scheduled_tables)} due tables: "
            f"{', '.join(tc.name for tc in due_tables)}"
        )

        # Sync due tables
        results = {}
        for table_config in due_tables:
            result = self._sync_one(table_config)
            results[table_config.id] = result
            summary = self._format_result(table_config, result)
            if result.get("success"):
                logger.info(summary)
            else:
                logger.error(summary)

        success_count = sum(1 for r in results.values() if r.get("success"))
        logger.info(f"Scheduled sync: {success_count}/{len(results)} tables successful")

        # Generate schema.yml
        if success_count > 0:
            self._generate_schema_safely()

        # Profile only tables with profile_after_sync=True
        unprofiled = [tc for tc in due_tables if not tc.profile_after_sync]
        skip_profiler = [tc.id for tc in unprofiled]
        if unprofiled:
            logger.info(
                f"Skipping profiler for: "
                f"{', '.join(tc.name for tc in unprofiled)}"
            )

        skip_set = set(skip_profiler)
        needs_profiling = any(
            r.get("success") and tid not in skip_set
            for tid, r in results.items()
        )
        if needs_profiling:
            self._auto_profile(results, skip_tables=skip_profiler)
            # Restart webapp if profiler ran (new profiles.json needs reload)
            self._restart_webapp()
        return results

    def _restart_webapp(self):
        """Restart webapp service to pick up new profiles.json.

        Best-effort: every failure mode (non-zero exit, timeout, missing
        executable, other OS errors) is logged and never raised, so a
        failed restart cannot turn a successful sync into a crash.
        """
        import subprocess

        try:
            subprocess.run(
                ["sudo", "systemctl", "restart", "webapp"],
                check=True,
                capture_output=True,
                timeout=30,
            )
            logger.info("Webapp restarted successfully")
        except subprocess.CalledProcessError as e:
            detail = e.stderr.decode(errors="replace") if e.stderr else e
            logger.warning(f"Failed to restart webapp: {detail}")
        except subprocess.TimeoutExpired as e:
            logger.warning(f"Failed to restart webapp: timed out after {e.timeout}s")
        except FileNotFoundError:
            logger.debug("systemctl not found (not running on server)")
        except OSError as e:
            logger.warning(f"Failed to restart webapp: {e}")
def create_sync_manager() -> DataSyncManager:
    """
    Build a DataSyncManager using its default construction.

    Returns:
        A freshly constructed DataSyncManager
    """
    manager = DataSyncManager()
    return manager
def create_data_source(source_type: Optional[str] = None) -> DataSource:
    """Create a data source based on configuration.

    "local" and "keboola" map to the Keboola connector. Any other value is
    resolved dynamically from ``connectors/<source_type>/adapter.py``: the
    module's ``create_data_source()`` factory is used when present,
    otherwise the first concrete DataSource subclass found in the module
    is instantiated.

    Args:
        source_type: Override source type. If None, the ``data_source``
            value from get_config() is used.

    Returns:
        DataSource instance

    Raises:
        ValueError: If source type is unknown
        ImportError: If connector dependencies are missing (the connector
            module exists but one of its own imports cannot be resolved)
    """
    if source_type is None:
        source_type = get_config().data_source

    if source_type in ("local", "keboola"):
        try:
            from connectors.keboola.adapter import KeboolaDataSource
        except ModuleNotFoundError as e:
            if "kbcstorage" in str(e):
                raise ImportError(
                    "Keboola connector requires 'kbcstorage' package. "
                    "Install with: pip install kbcstorage"
                ) from e
            raise  # Re-raise real import errors
        return KeboolaDataSource()

    # Try dynamic connector import for other types
    module_path = f"connectors.{source_type}.adapter"
    try:
        mod = importlib.import_module(module_path)
    except ModuleNotFoundError as e:
        # Only a missing connector module (or one of its parent packages)
        # means "unknown source type". A missing third-party dependency
        # inside an existing connector must surface as the import error it
        # is, instead of being masked by a misleading ValueError.
        missing = e.name or ""
        connector_missing = bool(missing) and (
            module_path == missing or module_path.startswith(f"{missing}.")
        )
        if not connector_missing:
            raise
        mod = None

    # Instantiation happens outside the try block so that errors raised by
    # the connector itself are never swallowed.
    if mod is not None:
        factory = getattr(mod, "create_data_source", None)
        if factory:
            return factory()
        # Fallback: look for a concrete DataSource subclass in the module
        for attr_name in dir(mod):
            attr = getattr(mod, attr_name)
            if (
                isinstance(attr, type)
                and issubclass(attr, DataSource)
                and attr is not DataSource
                # Abstract intermediates cannot be instantiated; skip them.
                and not getattr(attr, "__abstractmethods__", None)
            ):
                return attr()

    raise ValueError(
        f"Unknown data source: '{source_type}'. "
        f"Available connectors: keboola, bigquery. "
        f"Create connectors/{source_type}/adapter.py to add a new one."
    )
def _main(argv: List[str]) -> int:
    """CLI entry point for data sync.

    Usage:
        (no arguments)      sync all tables
        <table_id> ...      sync only the listed tables
        --scheduled         sync only tables that are due per sync_schedule

    Args:
        argv: Command-line arguments without the program name

    Returns:
        Process exit code: 0 when everything requested was synchronized
        (or nothing was due), 1 on any failure, unknown table ID, or
        unexpected error.
    """
    import traceback

    scheduled_mode = "--scheduled" in argv
    table_args = [a for a in argv if a != "--scheduled"]
    try:
        manager = create_sync_manager()
        unknown: List[str] = []
        if scheduled_mode:
            print("Data Sync (scheduled mode)")
            results = manager.sync_scheduled()
            if not results:
                print("No tables due for sync")
                return 0
        elif table_args:
            print("Data Sync")
            print(f"\nSynchronizing selected tables: {', '.join(table_args)}")
            # Unknown IDs are skipped by sync_all and never show up in
            # the results, so a typo would otherwise be reported as success.
            unknown = [
                tid for tid in table_args
                if manager.config.get_table_config(tid) is None
            ]
            results = manager.sync_all(tables=table_args)
        else:
            print("Data Sync")
            print("\nSynchronizing all tables...")
            results = manager.sync_all()

        success_count = sum(1 for r in results.values() if r.get("success"))
        total_count = len(results)
        if unknown:
            print(f"\nUnknown table IDs (not in configuration): {', '.join(unknown)}")
        if success_count == total_count and not unknown:
            print(f"\nAll {total_count} tables synchronized successfully!")
            return 0
        print(
            f"\n{success_count}/{total_count} tables synchronized. "
            f"Check logs for details."
        )
        return 1
    except Exception as e:
        print(f"\nError: {e}")
        traceback.print_exc()
        return 1


if __name__ == "__main__":
    # CLI interface for sync
    import sys

    sys.exit(_main(sys.argv[1:]))