- New sync_schedule and profile_after_sync fields in TableConfig (formats: "every 15m", "every 1h", "daily 05:00") - New src/scheduler.py with schedule evaluation logic (is_table_due) - New --scheduled mode in data_sync.py: only syncs tables that are due, respects profile_after_sync flag, auto-restarts webapp after profiling - Systemd timer+service for data-refresh (every 15 min) - Systemd timer+service for catalog-refresh (every 15 min) - deploy.sh enables new timers automatically - Complete table config reference in data_description.md.example - 58 new scheduler tests
734 lines
25 KiB
Python
734 lines
25 KiB
Python
"""
|
|
Data Synchronization Manager

Orchestrates data synchronization from configured sources to local Parquet files.

Main functions:
1. Tracking sync state (when was last synchronization)
2. DataSource ABC for pluggable connectors
3. Sync single table or all tables at once
4. Progress tracking and error handling
5. Schema generation from synced Parquet files

Sync State:
- Stored in data/metadata/sync_state.json
- Contains timestamp of last synchronization for each table
- Used for incremental sync (changedSince parameter)
|
|
"""
|
|
|
|
import importlib
|
|
import json
|
|
import logging
|
|
from abc import ABC, abstractmethod
|
|
from pathlib import Path
|
|
from typing import Dict, List, Optional, Any
|
|
from datetime import datetime
|
|
|
|
from tqdm import tqdm
|
|
|
|
from .config import get_config, TableConfig
|
|
from config.loader import load_instance_config
|
|
from connectors.openmetadata.enricher import CatalogEnricher
|
|
|
|
|
|
# Module-level logger, named after this module; used by every class and function below.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class SyncState:
    """
    Synchronization state management.

    Keeps a JSON-backed mapping ``{table_id: {...}}`` describing the last
    synchronization of each table. The mapping is loaded once when the object
    is created and written back to disk after every update.
    """

    def __init__(self, state_file: Path):
        """
        Args:
            state_file: Path to JSON file with sync state
        """
        self.state_file = state_file
        self.state: Dict[str, Any] = self._load_state()

    def _load_state(self) -> Dict[str, Any]:
        """
        Load sync state from disk.

        A missing, unreadable, or malformed file is treated as "nothing synced
        yet"; problems are logged rather than raised.

        Returns:
            Dictionary with sync state (empty if no usable state exists)
        """
        if not self.state_file.exists():
            return {}

        try:
            with open(self.state_file, "r", encoding="utf-8") as f:
                loaded = json.load(f)
        except Exception as e:
            logger.error(f"Error loading sync state: {e}")
            return {}

        # Valid JSON that is not an object (e.g. a list) would break every
        # later ``self.state.get(...)`` call, so reject it here.
        if not isinstance(loaded, dict):
            logger.error(
                f"Error loading sync state: expected a JSON object, "
                f"got {type(loaded).__name__}"
            )
            return {}
        return loaded

    def _save_state(self):
        """
        Save sync state to disk.

        Creates the parent directory if needed. The state is written to a
        temporary sibling file and then moved over the real file, so an
        interrupted write cannot leave a truncated state file behind.
        Failures are logged, not raised.
        """
        tmp_file = None
        try:
            self.state_file.parent.mkdir(parents=True, exist_ok=True)
            tmp_file = self.state_file.with_name(self.state_file.name + ".tmp")
            with open(tmp_file, "w", encoding="utf-8") as f:
                json.dump(self.state, f, indent=2, default=str)
            tmp_file.replace(self.state_file)
        except Exception as e:
            logger.error(f"Error saving sync state: {e}")
            # Best-effort cleanup of a partially written temporary file.
            try:
                if tmp_file is not None and tmp_file.exists():
                    tmp_file.unlink()
            except OSError:
                pass

    def get_last_sync(self, table_id: str) -> Optional[str]:
        """
        Get timestamp of last synchronization for given table.

        Args:
            table_id: Table identifier

        Returns:
            ISO timestamp string, or None if not synced yet
        """
        table_state = self.state.get(table_id, {})
        return table_state.get("last_sync")

    def get_table_state(self, table_id: str) -> Dict[str, Any]:
        """
        Get complete sync state for a table.

        Args:
            table_id: Table identifier

        Returns:
            Dictionary with table sync state (empty if the table is unknown)
        """
        return self.state.get(table_id, {})

    def update_sync(
        self,
        table_id: str,
        table_name: str,
        strategy: str,
        rows: int,
        file_size_bytes: int,
        columns: int = 0,
        uncompressed_bytes: int = 0,
    ):
        """
        Update synchronization state for a table and persist it.

        The previous entry for ``table_id`` is replaced and ``last_sync`` is
        set to the current local time.

        Args:
            table_id: Table identifier
            table_name: Human-readable table name
            strategy: Sync strategy used
            rows: Number of rows synced
            file_size_bytes: Size of Parquet file in bytes
            columns: Number of columns
            uncompressed_bytes: Uncompressed data size
        """
        self.state[table_id] = {
            "table_name": table_name,
            "last_sync": datetime.now().isoformat(),
            "strategy": strategy,
            "rows": rows,
            "columns": columns,
            # Sizes are stored in MiB, rounded to two decimals.
            "file_size_mb": round(file_size_bytes / 1024 / 1024, 2),
            "uncompressed_mb": round(uncompressed_bytes / 1024 / 1024, 2),
        }

        self._save_state()
|
|
|
|
|
|
class DataSource(ABC):
    """
    Base interface for a data source.

    Each connector subclasses this to plug a data backend into the sync
    manager. Only ``sync_table`` is mandatory; the remaining hooks have
    neutral defaults and may be overridden when the backend supports them.
    See connectors/keboola/ for a reference implementation.
    """

    @abstractmethod
    def sync_table(
        self,
        table_config: TableConfig,
        sync_state: SyncState,
    ) -> Dict[str, Any]:
        """
        Synchronize single table.

        Args:
            table_config: Table configuration
            sync_state: Sync state manager

        Returns:
            Dictionary with sync result
        """
        ...

    def discover_tables(self) -> List[Dict[str, Any]]:
        """Enumerate the tables available in the data source.

        Each returned dict carries at minimum:
            id, name, bucket_id, columns, row_count, size_bytes,
            primary_key, last_change

        The base implementation returns an empty list, meaning the source
        does not support discovery.
        """
        no_tables: List[Dict[str, Any]] = []
        return no_tables

    def get_column_metadata(self, table_id: str) -> Optional[Dict[str, Any]]:
        """Provide processed column metadata used for schema generation.

        Returns:
            {"columns": {"col_name": {"source_type": "...", "description": "..."}}}
            or None if the source doesn't support metadata (the default).
        """
        return None

    def get_source_name(self) -> str:
        """Human-readable name of this data source, used in schema comments."""
        return "Unknown"
|
|
|
|
|
|
def _get_uncompressed_size(parquet_path: Path) -> int:
|
|
"""Read total uncompressed size from Parquet file metadata."""
|
|
try:
|
|
import pyarrow.parquet as pq
|
|
|
|
meta = pq.ParquetFile(parquet_path).metadata
|
|
total = 0
|
|
for rg_idx in range(meta.num_row_groups):
|
|
rg = meta.row_group(rg_idx)
|
|
for col_idx in range(rg.num_columns):
|
|
total += rg.column(col_idx).total_uncompressed_size
|
|
return total
|
|
except Exception:
|
|
return 0
|
|
|
|
|
|
class DataSyncManager:
    """
    Main data synchronization orchestrator.

    Manages sync of all tables and tracks results. After a successful sync it
    regenerates the schema YAML files and runs the profiler on the tables that
    were synced.
    """

    def __init__(self):
        """Initialize sync manager from the global configuration."""
        self.config = get_config()
        self.sync_state = SyncState(
            self.config.get_metadata_path() / "sync_state.json"
        )
        self.data_source = create_data_source()

        # Initialize OpenMetadata catalog enricher
        try:
            instance_config = load_instance_config()
            self.catalog_enricher = CatalogEnricher(instance_config)
        except Exception as e:
            logger.warning(f"Failed to initialize catalog enricher: {e}")
            self.catalog_enricher = CatalogEnricher({})  # Disabled enricher

    def _locate_schema_parquet(self, table_config) -> Optional[Path]:
        """Return the Parquet file to read a table's schema from, or None if not synced yet.

        For partitioned tables the configured path is a directory and the
        first ``*.parquet`` file found in it is used.
        """
        parquet_path = self.config.get_parquet_path(table_config)
        if not parquet_path.exists():
            return None
        if table_config.partition_by:
            return next(parquet_path.glob("*.parquet"), None)
        return parquet_path

    @staticmethod
    def _build_column_entries(arrow_schema, col_metadata, catalog_data) -> List[Dict[str, Any]]:
        """Build the per-column schema entries (name, type, description, source_type)."""
        source_columns = {}
        if col_metadata and "columns" in col_metadata:
            source_columns = col_metadata["columns"] or {}
        catalog_columns = catalog_data.columns if catalog_data else {}

        entries = []
        for field_item in arrow_schema:
            col_name = field_item.name
            col_meta = source_columns.get(col_name, {})
            entry = {
                "name": col_name,
                "type": str(field_item.type),
            }

            # Priority for description: catalog > connector metadata > (nothing).
            # Catalog columns are keyed by lower-cased name. An empty catalog
            # description falls through to the connector's description, the
            # same way the table-level description falls back.
            description = None
            catalog_key = col_name.lower()
            if catalog_key in catalog_columns:
                description = catalog_columns[catalog_key].description
            if not description:
                description = col_meta.get("description")
            if description:
                entry["description"] = description

            # Add source type from connector metadata
            if "source_type" in col_meta:
                entry["source_type"] = col_meta["source_type"]

            entries.append(entry)
        return entries

    def _generate_schema_yaml(self):
        """
        Generate schema.yml file with actual table schemas from Parquet files.

        This file is auto-generated and contains:
        - Table names and descriptions
        - Column names, types (from Parquet), and descriptions (from source metadata)
        - Primary keys

        Tables without a dataset go to the core schema file; tables with a
        dataset are written to one schema file per dataset. A table that fails
        to process is logged and left out; the other tables are still written.

        Output: DOCS_OUTPUT_DIR/schema.yml (default: ./docs/schema.yml)

        Returns:
            Path of the core schema.yml file
        """
        import yaml
        import pyarrow.parquet as pq

        source_name = self.data_source.get_source_name()

        logger.info("Generating schema.yml from synced tables...")

        generated_at = datetime.now().isoformat()
        tables: Dict[str, Dict[str, Any]] = {}

        # Process each table in configuration
        for table_config in self.config.tables:
            try:
                schema_source = self._locate_schema_parquet(table_config)
                if schema_source is None:
                    logger.debug(f" Skipping {table_config.name} (not synced yet)")
                    continue

                arrow_schema = pq.ParquetFile(schema_source).schema_arrow

                # Get column metadata from data source (if supported)
                col_metadata = self.data_source.get_column_metadata(table_config.id)

                # Enrich with catalog metadata (OpenMetadata)
                catalog_data = self.catalog_enricher.enrich_table(table_config)

                columns = self._build_column_entries(
                    arrow_schema, col_metadata, catalog_data
                )

                primary_key = table_config.get_primary_key_columns()

                # Priority for table description: catalog > data_description.md
                table_description = table_config.description
                if catalog_data:
                    table_description = catalog_data.description or table_description

                table_info = {
                    "table_id": table_config.id,
                    "description": table_description,
                    "primary_key": primary_key,
                    "sync_strategy": table_config.sync_strategy,
                    "columns": columns,
                }

                if table_config.partition_by:
                    table_info["partitioned_by"] = table_config.partition_by

                tables[table_config.name] = table_info

                logger.debug(f" {table_config.name}: {len(columns)} columns")

            except Exception as e:
                logger.warning(f" Error processing {table_config.name}: {e}")

        # Index configs by name once (first occurrence wins) instead of
        # scanning the whole config list for every table.
        config_by_name = {}
        for tc in self.config.tables:
            config_by_name.setdefault(tc.name, tc)

        # Split tables into core (no dataset) and per-dataset groups
        core_tables = {}
        dataset_tables = {}  # {dataset_name: {table_name: table_info}}
        for table_name, table_info in tables.items():
            table_config = config_by_name.get(table_name)
            if table_config and table_config.dataset:
                dataset_tables.setdefault(table_config.dataset, {})[table_name] = table_info
            else:
                core_tables[table_name] = table_info

        def _write_schema_file(filepath, file_tables, note=""):
            """Write a schema YAML file with header comments."""
            filepath.parent.mkdir(parents=True, exist_ok=True)
            data = {
                "_metadata": {
                    "_schema_version": 2,
                    "generated_at": generated_at,
                    "note": "AUTO-GENERATED - DO NOT EDIT.",
                    "source": source_name,
                    "generator": "src/data_sync.py::DataSyncManager._generate_schema_yaml()",
                },
                "tables": file_tables,
            }
            header = [
                "# AUTO-GENERATED - DO NOT EDIT\n",
                "# This file is automatically generated during data sync\n",
                f"# Generated: {generated_at}\n",
            ]
            if note:
                header.append(f"# {note}\n")
            header += [
                "#\n",
                "# Contains actual table schemas from synced Parquet files:\n",
                "# - Column names and PyArrow types (from Parquet)\n",
                f"# - Source types and descriptions (from {source_name})\n",
                "# - Primary keys and sync strategies\n",
                "#\n",
                "# For architectural documentation and relationships, see data_description.md\n",
                "\n",
            ]
            # allow_unicode=True makes yaml emit raw non-ASCII characters, so
            # the file must be opened as UTF-8 regardless of the locale.
            with open(filepath, "w", encoding="utf-8") as f:
                f.writelines(header)
                yaml.dump(data, f, default_flow_style=False, sort_keys=False, allow_unicode=True)

        # Write core schema.yml
        schema_file = self.config.docs_output_dir / "schema.yml"
        _write_schema_file(schema_file, core_tables)
        logger.info(f"Core schema YAML: {len(core_tables)} tables -> {schema_file}")

        # Write per-dataset schema files
        for ds_name, ds_tables in dataset_tables.items():
            ds_schema_file = self.config.docs_output_dir / "datasets" / ds_name / "schema.yml"
            _write_schema_file(ds_schema_file, ds_tables, note=f"Dataset: {ds_name}")
            logger.info(f"Dataset schema YAML: {len(ds_tables)} tables -> {ds_schema_file}")

        total = len(core_tables) + sum(len(t) for t in dataset_tables.values())
        logger.info(f"Schema generation complete: {total} tables total")

        return schema_file

    def sync_table(self, table_id: str) -> Dict[str, Any]:
        """
        Synchronize single table by ID.

        Args:
            table_id: Table ID to synchronize

        Returns:
            Dictionary with sync result

        Raises:
            ValueError: If the table is not present in the configuration
        """
        table_config = self.config.get_table_config(table_id)
        if not table_config:
            raise ValueError(f"Table {table_id} not found in configuration")

        return self.data_source.sync_table(table_config, self.sync_state)

    def sync_all(self, tables: Optional[List[str]] = None) -> Dict[str, Dict[str, Any]]:
        """
        Synchronize all tables (or subset according to list).

        Remote-only tables (``query_mode == "remote"``) are skipped. An
        exception raised by the connector for one table is recorded as a
        failed result for that table and does not abort the remaining tables.
        When at least one table succeeds, the schema YAML is regenerated and
        the profiler is run.

        Args:
            tables: List of table IDs to synchronize. If None, syncs all.

        Returns:
            Dictionary {table_id: result} with sync results
        """
        if tables:
            table_configs = []
            for tid in tables:
                requested = self.config.get_table_config(tid)
                if requested is None:
                    # Unknown IDs are skipped, but not silently.
                    logger.warning(f"Table {tid} not found in configuration, skipping")
                else:
                    table_configs.append(requested)
        else:
            table_configs = self.config.tables

        # Filter out remote-only tables (no local sync needed)
        remote_skipped = [
            tc for tc in table_configs if tc.query_mode == "remote"
        ]
        table_configs = [
            tc for tc in table_configs if tc.query_mode != "remote"
        ]

        if remote_skipped:
            logger.info(
                f"Skipping {len(remote_skipped)} remote-only tables "
                f"(query via BigQuery): "
                f"{', '.join(tc.name for tc in remote_skipped)}"
            )

        logger.info(f"Synchronizing {len(table_configs)} tables...")

        results = {}
        with tqdm(table_configs, desc="Syncing tables") as pbar:
            for table_config in pbar:
                pbar.set_description(f"Sync: {table_config.name}")

                # Isolate per-table failures, matching sync_scheduled().
                try:
                    result = self.data_source.sync_table(table_config, self.sync_state)
                except Exception as e:
                    logger.error(f" {table_config.name}: sync failed: {e}")
                    result = {"success": False, "error": str(e)}
                results[table_config.id] = result

                if result["success"]:
                    pbar.write(
                        f" {table_config.name}: {result['rows']:,} rows, "
                        f"{result['file_size_mb']:.2f} MB"
                    )
                else:
                    pbar.write(f" {table_config.name}: {result['error']}")

        success_count = sum(1 for r in results.values() if r["success"])
        logger.info(
            f"Synchronization completed: {success_count}/{len(results)} tables successful"
        )

        if success_count > 0:
            # Generate schema.yml from synced tables
            try:
                self._generate_schema_yaml()
            except Exception as e:
                logger.warning(f"Failed to generate schema.yml: {e}")

            # Auto-profile changed tables
            self._auto_profile(results)

        return results

    def _auto_profile(
        self,
        results: Dict[str, Dict[str, Any]],
        skip_tables: Optional[List[str]] = None,
    ):
        """Run profiler on successfully synced tables.

        Profiling is best-effort: any failure (including the profiler module
        being unavailable) is logged as a warning and never raised.

        Args:
            results: Sync results dict {table_id: result}
            skip_tables: Table IDs to skip profiling for
        """
        skip_set = set(skip_tables or [])
        try:
            from src.profiler import profile_changed_tables

            changed = []
            for tid, r in results.items():
                if not r.get("success") or tid in skip_set:
                    continue
                table_config = self.config.get_table_config(tid)
                if table_config:
                    changed.append(table_config.name)

            if changed:
                result = profile_changed_tables(changed)
                logger.info(
                    f"Auto-profiling: {result['success']} profiled, "
                    f"{result['errors']} errors, {result['skipped']} skipped"
                )
            else:
                logger.info("No tables to profile (all skipped or none succeeded)")
        except Exception as e:
            logger.warning(f"Auto-profiling failed (non-fatal): {e}")

    def sync_scheduled(self) -> Dict[str, Dict[str, Any]]:
        """Synchronize only tables whose sync_schedule says they are due.

        Evaluates each table's sync_schedule against its last_sync timestamp.
        Only syncs tables that are due. Respects profile_after_sync flag, and
        restarts the webapp when the profiler was invoked.

        Returns:
            Dictionary {table_id: result} with sync results (only for synced tables)
        """
        from src.scheduler import is_table_due

        scheduled_tables = [
            tc for tc in self.config.tables
            if tc.sync_schedule and tc.query_mode != "remote"
        ]

        if not scheduled_tables:
            logger.info("No tables with sync_schedule configured")
            return {}

        # Evaluate which tables are due
        due_tables = []
        for tc in scheduled_tables:
            last_sync = self.sync_state.get_last_sync(tc.id)
            if is_table_due(tc.sync_schedule, last_sync):
                due_tables.append(tc)
                logger.info(f"Table {tc.name} is DUE (schedule: {tc.sync_schedule})")
            else:
                logger.debug(f"Table {tc.name} is not due (schedule: {tc.sync_schedule})")

        if not due_tables:
            logger.info(
                f"Checked {len(scheduled_tables)} scheduled tables, none are due"
            )
            return {}

        logger.info(
            f"Syncing {len(due_tables)}/{len(scheduled_tables)} due tables: "
            f"{', '.join(tc.name for tc in due_tables)}"
        )

        # Sync due tables
        results = {}
        for table_config in due_tables:
            try:
                result = self.data_source.sync_table(table_config, self.sync_state)
                results[table_config.id] = result
                if result["success"]:
                    logger.info(
                        f" {table_config.name}: {result['rows']:,} rows, "
                        f"{result['file_size_mb']:.2f} MB"
                    )
                else:
                    logger.error(f" {table_config.name}: {result['error']}")
            except Exception as e:
                logger.error(f" {table_config.name}: sync failed: {e}")
                results[table_config.id] = {"success": False, "error": str(e)}

        success_count = sum(1 for r in results.values() if r["success"])
        logger.info(f"Scheduled sync: {success_count}/{len(results)} tables successful")

        # Generate schema.yml
        if success_count > 0:
            try:
                self._generate_schema_yaml()
            except Exception as e:
                logger.warning(f"Failed to generate schema.yml: {e}")

        # Profile only tables with profile_after_sync=True
        profiler_opt_out = [tc for tc in due_tables if not tc.profile_after_sync]
        skip_profiler = [tc.id for tc in profiler_opt_out]
        if profiler_opt_out:
            logger.info(
                f"Skipping profiler for: "
                f"{', '.join(tc.name for tc in profiler_opt_out)}"
            )

        profiled_any = False
        if success_count > 0:
            skip_ids = set(skip_profiler)
            has_tables_to_profile = any(
                r.get("success") and tid not in skip_ids
                for tid, r in results.items()
            )
            if has_tables_to_profile:
                self._auto_profile(results, skip_tables=skip_profiler)
                profiled_any = True

        # Restart webapp if profiler ran (new profiles.json needs reload)
        if profiled_any:
            self._restart_webapp()

        return results

    def _restart_webapp(self):
        """Restart webapp service to pick up new profiles.json.

        Best-effort: a failed, timed-out, or unavailable restart command is
        logged and never raised, so it cannot fail an otherwise successful sync.
        """
        import subprocess

        try:
            subprocess.run(
                ["sudo", "systemctl", "restart", "webapp"],
                check=True,
                capture_output=True,
                timeout=30,
            )
            logger.info("Webapp restarted successfully")
        except subprocess.CalledProcessError as e:
            detail = e.stderr.decode(errors="replace") if e.stderr else e
            logger.warning(f"Failed to restart webapp: {detail}")
        except subprocess.TimeoutExpired as e:
            # e.g. sudo waiting for a password prompt that never gets answered
            logger.warning(f"Failed to restart webapp: timed out after {e.timeout}s")
        except FileNotFoundError:
            logger.debug("systemctl not found (not running on server)")
|
|
|
|
|
|
def create_sync_manager() -> DataSyncManager:
    """
    Build a ready-to-use DataSyncManager.

    Thin factory wrapper around the DataSyncManager constructor.

    Returns:
        A newly constructed DataSyncManager
    """
    manager = DataSyncManager()
    return manager
|
|
|
|
|
|
def create_data_source(source_type: Optional[str] = None) -> "DataSource":
    """Create a data source based on configuration.

    "local" and "keboola" map to the Keboola connector. Any other value is
    resolved dynamically from ``connectors/<source_type>/adapter.py``: a
    module-level ``create_data_source()`` factory is preferred, otherwise the
    first DataSource subclass found in the module is instantiated.

    Args:
        source_type: Override source type. If None, uses the data source
            from the global configuration.

    Returns:
        DataSource instance

    Raises:
        ValueError: If source type is unknown
        ImportError: If connector dependencies are missing
    """
    if source_type is None:
        source_type = get_config().data_source

    if source_type in ("local", "keboola"):
        try:
            from connectors.keboola.adapter import KeboolaDataSource
        except ModuleNotFoundError as e:
            if "kbcstorage" in str(e):
                raise ImportError(
                    "Keboola connector requires 'kbcstorage' package. "
                    "Install with: pip install kbcstorage"
                ) from e
            raise  # Re-raise real import errors
        return KeboolaDataSource()

    # Try dynamic connector import for other types
    module_name = f"connectors.{source_type}.adapter"
    try:
        mod = importlib.import_module(module_name)
    except ModuleNotFoundError as e:
        # Only a missing connector module (or one of its parent packages)
        # means "unknown data source". If the connector exists but one of ITS
        # imports is missing, surface that error instead of hiding it behind
        # a misleading "Unknown data source" message.
        if e.name and not (module_name + ".").startswith(e.name + "."):
            raise
        mod = None

    if mod is not None:
        factory = getattr(mod, "create_data_source", None)
        if factory:
            return factory()
        # Fallback: look for a class named *DataSource
        for attr_name in dir(mod):
            attr = getattr(mod, attr_name)
            if isinstance(attr, type) and issubclass(attr, DataSource) and attr is not DataSource:
                return attr()

    raise ValueError(
        f"Unknown data source: '{source_type}'. "
        f"Available connectors: keboola, bigquery. "
        f"Create connectors/{source_type}/adapter.py to add a new one."
    )
|
|
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point.
    #   --scheduled      sync only the tables that are due per their sync_schedule
    #   <table ids...>   sync just the listed tables (ignored in scheduled mode)
    #   (no arguments)   sync every configured table
    # Exit status is 0 when every attempted table succeeded (or nothing was
    # due), 1 otherwise.
    import sys

    SCHEDULED_FLAG = "--scheduled"
    scheduled_mode = SCHEDULED_FLAG in sys.argv
    table_args = list(filter(lambda arg: arg != SCHEDULED_FLAG, sys.argv[1:]))

    try:
        manager = create_sync_manager()

        if scheduled_mode:
            print("Data Sync (scheduled mode)")
            results = manager.sync_scheduled()

            # Nothing due is a normal outcome, not a failure.
            if not results:
                print("No tables due for sync")
                sys.exit(0)
        else:
            print("Data Sync")
            if table_args:
                print(f"\nSynchronizing selected tables: {', '.join(table_args)}")
                results = manager.sync_all(tables=table_args)
            else:
                print("\nSynchronizing all tables...")
                results = manager.sync_all()

        total_count = len(results)
        success_count = len([r for r in results.values() if r["success"]])

        if success_count != total_count:
            print(
                f"\n{success_count}/{total_count} tables synchronized. "
                f"Check logs for details."
            )
            sys.exit(1)

        print(f"\nAll {total_count} tables synchronized successfully!")
        sys.exit(0)

    except Exception as e:
        # sys.exit() raises SystemExit, which is not an Exception subclass,
        # so the exits above are not intercepted here.
        print(f"\nError: {e}")
        import traceback

        traceback.print_exc()
        sys.exit(1)
|