fix: extractor subprocess reads table configs via stdin, not DuckDB
Subprocess cannot open system.duckdb (main process holds lock). Now main process reads table_registry and passes configs as JSON via stdin to subprocess. Subprocess never touches system.duckdb.
This commit is contained in:
parent
4d1acd014a
commit
7612385ed6
1 changed files with 60 additions and 10 deletions
|
|
@ -38,26 +38,76 @@ def _get_data_dir() -> Path:
|
||||||
def _run_sync(tables: Optional[List[str]] = None):
|
def _run_sync(tables: Optional[List[str]] = None):
|
||||||
"""Run extractor as subprocess + orchestrator rebuild.
|
"""Run extractor as subprocess + orchestrator rebuild.
|
||||||
|
|
||||||
Extractor runs in a separate process to avoid DuckDB lock conflicts
|
Reads table configs from DuckDB (in main process which has the shared
|
||||||
with the main API process. Orchestrator rebuild runs in-process after
|
connection), passes them as JSON via stdin to the extractor subprocess.
|
||||||
the extractor finishes (it only reads extract.duckdb files).
|
This avoids DuckDB lock conflicts — subprocess never opens system.duckdb.
|
||||||
"""
|
"""
|
||||||
|
import json as _json
|
||||||
import subprocess
|
import subprocess
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Run extractor as subprocess — completely separate DuckDB connections
|
from app.instance_config import get_data_source_type, get_value
|
||||||
|
from src.db import get_system_db
|
||||||
|
from src.repositories.table_registry import TableRegistryRepository
|
||||||
|
|
||||||
|
source_type = get_data_source_type()
|
||||||
|
data_dir = _get_data_dir()
|
||||||
|
|
||||||
|
# Read table configs in main process (has shared DuckDB connection)
|
||||||
|
sys_conn = get_system_db()
|
||||||
|
try:
|
||||||
|
repo = TableRegistryRepository(sys_conn)
|
||||||
|
if tables:
|
||||||
|
all_configs = [repo.get(t) for t in tables]
|
||||||
|
table_configs = [c for c in all_configs if c is not None]
|
||||||
|
else:
|
||||||
|
table_configs = repo.list_by_source(source_type) if source_type else repo.list_all()
|
||||||
|
finally:
|
||||||
|
sys_conn.close()
|
||||||
|
|
||||||
|
if not table_configs:
|
||||||
|
logger.warning("No tables to sync for source_type=%s", source_type)
|
||||||
|
return
|
||||||
|
|
||||||
|
# Serialize configs — strip non-serializable fields
|
||||||
|
serializable = []
|
||||||
|
for tc in table_configs:
|
||||||
|
serializable.append({k: (v.isoformat() if hasattr(v, 'isoformat') else v)
|
||||||
|
for k, v in tc.items() if v is not None})
|
||||||
|
|
||||||
|
# Run extractor subprocess with table configs via stdin
|
||||||
|
# Subprocess does NOT open system.duckdb — no lock conflict
|
||||||
env = {**os.environ}
|
env = {**os.environ}
|
||||||
cmd = [sys.executable, "-m", "connectors.keboola.extractor"]
|
cmd = [sys.executable, "-c", """
|
||||||
logger.info("Starting extractor subprocess: %s", " ".join(cmd))
|
import json, sys, os, logging
|
||||||
|
from pathlib import Path
|
||||||
|
logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
|
||||||
|
|
||||||
|
configs = json.load(sys.stdin)
|
||||||
|
url = os.environ.get("KEBOOLA_STACK_URL", "")
|
||||||
|
token = os.environ.get("KEBOOLA_STORAGE_TOKEN", "")
|
||||||
|
|
||||||
|
if not url or not token:
|
||||||
|
print("ERROR: Missing KEBOOLA_STACK_URL or KEBOOLA_STORAGE_TOKEN", file=sys.stderr)
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
from connectors.keboola.extractor import run
|
||||||
|
data_dir = Path(os.environ.get("DATA_DIR", "./data"))
|
||||||
|
result = run(str(data_dir / "extracts" / "keboola"), configs, url, token)
|
||||||
|
print(json.dumps(result))
|
||||||
|
"""]
|
||||||
|
|
||||||
|
logger.info("Starting extractor subprocess for %d tables", len(table_configs))
|
||||||
|
|
||||||
result = subprocess.run(
|
result = subprocess.run(
|
||||||
cmd, capture_output=True, text=True, timeout=1800, env=env,
|
cmd, input=_json.dumps(serializable), capture_output=True, text=True,
|
||||||
cwd=str(Path(__file__).parent.parent.parent), # project root
|
timeout=1800, env=env,
|
||||||
|
cwd=str(Path(__file__).parent.parent.parent),
|
||||||
)
|
)
|
||||||
|
|
||||||
if result.stdout:
|
if result.stdout:
|
||||||
logger.info("Extractor stdout: %s", result.stdout[-500:])
|
logger.info("Extractor result: %s", result.stdout.strip()[-500:])
|
||||||
if result.stderr:
|
if result.stderr:
|
||||||
logger.warning("Extractor stderr: %s", result.stderr[-500:])
|
logger.warning("Extractor stderr: %s", result.stderr[-500:])
|
||||||
if result.returncode != 0:
|
if result.returncode != 0:
|
||||||
|
|
@ -70,7 +120,7 @@ def _run_sync(tables: Optional[List[str]] = None):
|
||||||
logger.info("Orchestrator rebuild: %s", {k: len(v) for k, v in views.items()})
|
logger.info("Orchestrator rebuild: %s", {k: len(v) for k, v in views.items()})
|
||||||
|
|
||||||
except subprocess.TimeoutExpired:
|
except subprocess.TimeoutExpired:
|
||||||
logger.error("Extractor timed out after 600s")
|
logger.error("Extractor timed out after 1800s")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Data sync failed: {e}\n{traceback.format_exc()}")
|
logger.error(f"Data sync failed: {e}\n{traceback.format_exc()}")
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue