release: 0.41.0 — orchestrator filesystem fallback for missing _meta materialized rows
0.40.0 added _persist_materialized_inner_view in materialize_query, which
tried to open extract.duckdb from a fresh DuckDB handle to write the _meta
row + inner view. In production this conflicts with the same uvicorn
process's existing read-only ATTACH (orchestrator's analytics conn holds
extract.duckdb ATTACHed as <source_name> alias), and DuckDB single-process
file-handle uniqueness rejects with:
Binder Error: Unique file handle conflict: Cannot attach "extract"
— already attached by database "<source>"
The helper logs WARNING fail-soft, parquet stays canonical, but the
master view never appears via the meta path.
Fix: at the end of _attach_and_create_views, scan
<extract_dir>/data/*.parquet and CREATE OR REPLACE VIEW <id> AS
SELECT * FROM read_parquet('<path>') for any parquet whose <id> is not
already in the per-source tables list (= meta path didn't pick it up).
Decoupled from materialize_query open-handle race. Honors the same
view_ownership cross-connector collision rules as the meta path
(first-come-first-served via view_repo.claim).
Tests:
- filesystem-fallback fires when _meta row missing
- skipped when meta path already created the view (no shadow)
- skips invalid identifiers (e.g. parquet stem starting with a digit)
- doesn't crash when source has no data/ subdir
This commit is contained in:
parent
0fd73faa8d
commit
dfb7f25e76
4 changed files with 261 additions and 1 deletions
29
CHANGELOG.md
29
CHANGELOG.md
|
|
@ -10,6 +10,35 @@ CalVer image tags (`stable-YYYY.MM.N`, `dev-YYYY.MM.N`) are produced for every C
|
|||
|
||||
## [Unreleased]
|
||||
|
||||
## [0.41.0] — 2026-05-06
|
||||
|
||||
### Fixed
|
||||
- **Orchestrator filesystem fallback for materialized parquets that
|
||||
couldn't register in `extract.duckdb`'s `_meta`**
|
||||
(`src/orchestrator.py:_attach_and_create_views`). The 0.40.0 fix in
|
||||
`materialize_query` opens `extract.duckdb` from a fresh DuckDB handle
|
||||
to write the `_meta` row + inner view; in production the same uvicorn
|
||||
process already holds `extract.duckdb` ATTACHed read-only as the
|
||||
source-name alias under the orchestrator's analytics connection, and
|
||||
DuckDB's single-process file-handle uniqueness rejects the second
|
||||
open with `Binder Error: Unique file handle conflict: Cannot attach
|
||||
"extract" — already attached by database "<source>"`. The 0.40.0
|
||||
helper logs WARNING and falls through; parquet stays canonical, but
|
||||
the master view never appears via the meta path.
|
||||
|
||||
This release adds a second pass at the end of
|
||||
`_attach_and_create_views`: scan `<extract_dir>/data/*.parquet` and
|
||||
create a master view via `read_parquet('<path>')` for any parquet
|
||||
whose `<id>` is not already in the per-source `tables` list (i.e. the
|
||||
meta path didn't pick it up). Decoupled from `materialize_query`'s
|
||||
open-handle race; robust against any registration drift between
|
||||
materialize and rebuild. Honors the same `view_ownership` / cross-
|
||||
connector collision rules as the meta path (first-come-first-served
|
||||
via `view_repo.claim`). Tests cover: fallback fires when meta row is
|
||||
missing; fallback skips when meta path already created the view (no
|
||||
shadow); invalid identifier in parquet stem is skipped without crash;
|
||||
source without `data/` subdir doesn't crash the scan.
|
||||
|
||||
## [0.40.0] — 2026-05-06
|
||||
|
||||
### Fixed
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
[project]
|
||||
name = "agnes-the-ai-analyst"
|
||||
version = "0.40.0"
|
||||
version = "0.41.0"
|
||||
description = "Agnes — AI Data Analyst platform for AI analytical systems"
|
||||
requires-python = ">=3.11,<3.14"
|
||||
license = "MIT"
|
||||
|
|
|
|||
|
|
@ -395,6 +395,69 @@ class SyncOrchestrator:
|
|||
source_name, table_name, e,
|
||||
)
|
||||
|
||||
# Filesystem-fallback master views (0.41.0). The 0.40.0 fix in
|
||||
# `materialize_query` tries to register the parquet in
|
||||
# `extract.duckdb`'s `_meta` + inner view, but the open-as-
|
||||
# second-write-handle from the same uvicorn process collides
|
||||
# with the existing read-only ATTACH that `rebuild()` itself
|
||||
# holds (`Unique file handle conflict: Cannot attach "extract"
|
||||
# — already attached by database "<source>"`). The 0.40.0
|
||||
# helper logs a WARNING and falls through, parquet is
|
||||
# canonical, but the master view never appears via the meta
|
||||
# path. This second pass scans `<extract_dir>/data/*.parquet`
|
||||
# directly and creates a master view via `read_parquet()` for
|
||||
# any parquet that didn't already get one through the meta
|
||||
# path. Decoupled from materialize_query's open-handle race;
|
||||
# robust against any registration drift between materialize
|
||||
# and rebuild.
|
||||
try:
|
||||
extracts_dir = _get_extracts_dir()
|
||||
except Exception:
|
||||
extracts_dir = None
|
||||
if extracts_dir is not None:
|
||||
data_dir = extracts_dir / source_name / "data"
|
||||
if data_dir.exists():
|
||||
already_created = set(tables)
|
||||
for parquet_path in sorted(data_dir.glob("*.parquet")):
|
||||
table_id = parquet_path.stem
|
||||
if not _validate_identifier(table_id, "fs_fallback table_id"):
|
||||
continue
|
||||
if table_id in already_created:
|
||||
continue
|
||||
# view_repo claim — same first-come-first-served
|
||||
# rule as the meta-path branch above.
|
||||
if view_repo is not None:
|
||||
if not view_repo.claim(table_id, source_name):
|
||||
prior_owner = (
|
||||
view_repo.get_owner(table_id)
|
||||
or existing_owners.get(table_id, "<unknown>")
|
||||
)
|
||||
logger.error(
|
||||
"view_ownership collision: %s already owns view %r; "
|
||||
"%s.%s (filesystem-fallback) will NOT be exposed.",
|
||||
prior_owner, table_id, source_name, table_id,
|
||||
)
|
||||
continue
|
||||
if claimed_pairs is not None:
|
||||
claimed_pairs.append((source_name, table_id))
|
||||
try:
|
||||
safe_path = str(parquet_path).replace("'", "''")
|
||||
conn.execute(
|
||||
f"CREATE OR REPLACE VIEW \"{table_id}\" AS "
|
||||
f"SELECT * FROM read_parquet('{safe_path}')"
|
||||
)
|
||||
tables.append(table_id)
|
||||
logger.info(
|
||||
"filesystem-fallback master view created: "
|
||||
"%s/%s (parquet at %s) — meta row was missing",
|
||||
source_name, table_id, parquet_path,
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
"filesystem-fallback master view failed for %s/%s: %s",
|
||||
source_name, table_id, e,
|
||||
)
|
||||
|
||||
# Update sync_state in system DB
|
||||
self._update_sync_state(meta_rows, source_name)
|
||||
|
||||
|
|
|
|||
|
|
@ -677,3 +677,171 @@ class TestOrchestratorFailureModes:
|
|||
assert "corrupt_a" not in result
|
||||
assert "corrupt_b" not in result
|
||||
assert "keboola" in result
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# 0.41.0 — filesystem-fallback master views for materialized parquets that
|
||||
# couldn't register themselves in extract.duckdb's _meta (the open-as-second-
|
||||
# write-handle race that 0.40.0's _persist_materialized_inner_view hits).
|
||||
# ----------------------------------------------------------------------------
|
||||
|
||||
|
||||
import struct
|
||||
|
||||
|
||||
def _write_minimal_parquet(path: Path, n_rows: int = 3) -> None:
|
||||
"""Write a tiny valid parquet file using DuckDB's COPY. Used to seed
|
||||
the filesystem-fallback test cases where we want a parquet on disk
|
||||
that the extractor never registered in _meta."""
|
||||
conn = duckdb.connect(":memory:")
|
||||
try:
|
||||
conn.execute(f"CREATE TABLE t AS SELECT range AS id FROM range({n_rows})")
|
||||
safe = str(path).replace("'", "''")
|
||||
conn.execute(f"COPY (SELECT * FROM t) TO '{safe}' (FORMAT PARQUET)")
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
|
||||
class TestFilesystemFallbackMasterViews:
|
||||
"""If a parquet exists on disk but never made it into _meta (the
|
||||
open-handle race that bit 0.40.0), the orchestrator must still
|
||||
create a master view over it via read_parquet()."""
|
||||
|
||||
def test_filesystem_fallback_creates_master_view(self, setup_env):
|
||||
from src.orchestrator import SyncOrchestrator
|
||||
|
||||
# Set up an extract.duckdb with ONLY remote rows in _meta.
|
||||
_create_mock_extract(
|
||||
setup_env["extracts_dir"],
|
||||
"bigquery",
|
||||
tables=[
|
||||
{"name": "remote_one", "data": [{"x": "1"}], "query_mode": "remote"},
|
||||
],
|
||||
)
|
||||
|
||||
# Drop a parquet on disk for a table that's NOT in _meta — this
|
||||
# is the materialize_query "atomic swap succeeded but _meta
|
||||
# registration hit lock conflict" scenario.
|
||||
data_dir = setup_env["extracts_dir"] / "bigquery" / "data"
|
||||
_write_minimal_parquet(data_dir / "order_economics.parquet", n_rows=42)
|
||||
|
||||
orch = SyncOrchestrator(analytics_db_path=setup_env["analytics_db"])
|
||||
result = orch.rebuild()
|
||||
|
||||
# Both views should appear: the meta-path one and the
|
||||
# filesystem-fallback one.
|
||||
assert "bigquery" in result
|
||||
assert "remote_one" in result["bigquery"]
|
||||
assert "order_economics" in result["bigquery"], (
|
||||
"filesystem-fallback master view must be created when parquet "
|
||||
"exists on disk but _meta row is missing"
|
||||
)
|
||||
|
||||
# Verify the master view actually queries the parquet.
|
||||
master = duckdb.connect(setup_env["analytics_db"], read_only=True)
|
||||
try:
|
||||
n = master.execute("SELECT COUNT(*) FROM order_economics").fetchone()[0]
|
||||
assert n == 42, f"master view should return parquet rows, got {n}"
|
||||
finally:
|
||||
master.close()
|
||||
|
||||
def test_filesystem_fallback_does_not_duplicate_meta_path(self, setup_env, caplog):
|
||||
"""When the same name is in BOTH _meta (with an inner view) AND
|
||||
on disk as a parquet, the meta path wins — filesystem fallback
|
||||
must not create a second / replacement view that shadows the
|
||||
meta-driven one."""
|
||||
from src.orchestrator import SyncOrchestrator
|
||||
|
||||
_create_mock_extract(
|
||||
setup_env["extracts_dir"],
|
||||
"bigquery",
|
||||
tables=[
|
||||
{
|
||||
"name": "order_economics",
|
||||
"data": [{"x": "from_meta"}],
|
||||
"query_mode": "materialized",
|
||||
},
|
||||
],
|
||||
)
|
||||
# Also drop a parquet of the same name on disk.
|
||||
data_dir = setup_env["extracts_dir"] / "bigquery" / "data"
|
||||
_write_minimal_parquet(data_dir / "order_economics.parquet", n_rows=99)
|
||||
|
||||
import logging
|
||||
orch = SyncOrchestrator(analytics_db_path=setup_env["analytics_db"])
|
||||
with caplog.at_level(logging.INFO, logger="src.orchestrator"):
|
||||
result = orch.rebuild()
|
||||
|
||||
# `order_economics` should appear exactly once in the result list —
|
||||
# meta path won, fallback skipped because the name was already
|
||||
# in the per-source `tables` set.
|
||||
bq_tables = result.get("bigquery", [])
|
||||
assert bq_tables.count("order_economics") == 1, (
|
||||
f"name should appear exactly once, got {bq_tables}"
|
||||
)
|
||||
# Fallback log line must NOT have fired for this name.
|
||||
fallback_lines = [
|
||||
r.getMessage() for r in caplog.records
|
||||
if "filesystem-fallback master view created" in r.getMessage()
|
||||
and "order_economics" in r.getMessage()
|
||||
]
|
||||
assert fallback_lines == [], (
|
||||
f"filesystem-fallback must not fire when meta path is viable; got: {fallback_lines}"
|
||||
)
|
||||
|
||||
def test_filesystem_fallback_skips_invalid_table_id(self, setup_env, tmp_path):
|
||||
"""A parquet whose stem doesn't pass identifier validation
|
||||
(e.g. starts with a digit or contains spaces) must be skipped,
|
||||
not crash the rebuild."""
|
||||
from src.orchestrator import SyncOrchestrator
|
||||
|
||||
_create_mock_extract(
|
||||
setup_env["extracts_dir"],
|
||||
"bigquery",
|
||||
tables=[
|
||||
{"name": "remote_one", "data": [{"x": "1"}], "query_mode": "remote"},
|
||||
],
|
||||
)
|
||||
|
||||
# Identifier validator rejects names starting with a digit.
|
||||
data_dir = setup_env["extracts_dir"] / "bigquery" / "data"
|
||||
_write_minimal_parquet(data_dir / "9bad_name.parquet")
|
||||
|
||||
orch = SyncOrchestrator(analytics_db_path=setup_env["analytics_db"])
|
||||
result = orch.rebuild()
|
||||
|
||||
# remote_one still works; bad-named parquet is silently skipped.
|
||||
assert "remote_one" in result["bigquery"]
|
||||
assert "9bad_name" not in result["bigquery"]
|
||||
# Master DB has no such view.
|
||||
master = duckdb.connect(setup_env["analytics_db"], read_only=True)
|
||||
try:
|
||||
tables = [r[0] for r in master.execute(
|
||||
"SELECT table_name FROM information_schema.tables "
|
||||
"WHERE table_schema='main'"
|
||||
).fetchall()]
|
||||
assert "9bad_name" not in tables
|
||||
finally:
|
||||
master.close()
|
||||
|
||||
def test_filesystem_fallback_no_data_dir_is_safe(self, setup_env):
|
||||
"""Sources without a `<extract_dir>/data/` directory (e.g. the
|
||||
BigQuery extractor in remote-only mode pre-#160) must not crash
|
||||
the fallback scan."""
|
||||
from src.orchestrator import SyncOrchestrator
|
||||
|
||||
# Create a source with extract.duckdb but no `data/` subdir.
|
||||
source_dir = setup_env["extracts_dir"] / "bigquery"
|
||||
source_dir.mkdir()
|
||||
db_path = source_dir / "extract.duckdb"
|
||||
conn = duckdb.connect(str(db_path))
|
||||
conn.execute(
|
||||
"CREATE TABLE _meta (table_name VARCHAR, description VARCHAR, "
|
||||
"rows BIGINT, size_bytes BIGINT, extracted_at TIMESTAMP, "
|
||||
"query_mode VARCHAR)"
|
||||
)
|
||||
conn.close()
|
||||
|
||||
orch = SyncOrchestrator(analytics_db_path=setup_env["analytics_db"])
|
||||
# Must not crash.
|
||||
orch.rebuild()
|
||||
|
|
|
|||
Loading…
Reference in a new issue