""" Proof-of-concept: Connector Kit architecture validation. Tests that the proposed Connector Protocol + Runtime model is: 1. Implementable in Python (Protocol, Cap flags, partial implementation) 2. Arrow RecordBatch iteration works with DuckDB (zero-copy) 3. ConnectorRuntime can build extract.duckdb from any connector 4. Schema evolution detection works via Arrow schema diff 5. A real connector can be written in ~50 lines 6. Incremental state tracking works 7. Manifest validation works 8. Discovery → read pipeline is end-to-end functional """ import asyncio import os import shutil import tempfile from dataclasses import dataclass, field from enum import Flag, auto from pathlib import Path from typing import AsyncIterator, Iterator, Protocol, runtime_checkable import duckdb import pyarrow as pa import pyarrow.parquet as pq import pytest import yaml # ============================================================================ # Layer 2: Connector Protocol (the contract) # ============================================================================ class Cap(Flag): """Connector capabilities — declare what you support.""" DISCOVER = auto() READ = auto() STREAM = auto() REMOTE = auto() WRITE = auto() @dataclass class TableInfo: name: str schema: pa.Schema capabilities: Cap primary_key: list[str] | None = None description: str = "" @dataclass class ReadOptions: columns: list[str] | None = None filter: dict | None = None incremental_key: str | None = None incremental_value: str | None = None batch_size: int = 10_000 @dataclass class RemoteAttachInfo: extension: str url: str token_env: str @runtime_checkable class Connector(Protocol): @property def capabilities(self) -> Cap: ... def discover(self) -> list[TableInfo]: ... def read(self, table: str, options: ReadOptions) -> Iterator[pa.RecordBatch]: ... # ============================================================================ # Layer 3: Connector Runtime (the SDK — replaces manual boilerplate) # ============================================================================ @dataclass class ExtractStats: tables_extracted: int = 0 tables_failed: int = 0 total_rows: int = 0 schema_changes: list[str] = field(default_factory=list) errors: list[str] = field(default_factory=list) class ConnectorRuntime: """Handles extract.duckdb lifecycle — what every connector does manually today.""" def __init__(self, output_dir: Path): self.output_dir = output_dir self.data_dir = output_dir / "data" self.db_path = output_dir / "extract.duckdb" self.state_path = output_dir / ".state.yaml" self.data_dir.mkdir(parents=True, exist_ok=True) def run(self, connector: Connector, tables: list[str] | None = None) -> ExtractStats: stats = ExtractStats() # 1. Discovery available: list[TableInfo] = [] if Cap.DISCOVER in connector.capabilities: available = connector.discover() # If no tables specified, extract all discovered if tables is None: tables = [t.name for t in available if Cap.READ in t.capabilities] # 2. Schema evolution check for table_name in tables: table_info = self._find_table(available, table_name) if table_info: change = self._check_schema_evolution(table_name, table_info.schema) if change: stats.schema_changes.append(change) # 3. Extract via read() if Cap.READ in connector.capabilities: for table_name in tables: try: options = self._build_read_options(table_name, available) rows = self._extract_table(connector, table_name, options) stats.tables_extracted += 1 stats.total_rows += rows except Exception as e: stats.tables_failed += 1 stats.errors.append(f"{table_name}: {e}") # 4. Remote attach (if supported) if Cap.REMOTE in connector.capabilities: try: info = connector.remote() # type: ignore[attr-defined] self._write_remote_attach(info) except Exception as e: stats.errors.append(f"remote_attach: {e}") # 5. Build extract.duckdb (_meta + views) self._build_extract_db(available, tables) # 6. Save incremental state self._save_state(tables) return stats def _extract_table(self, connector: Connector, table: str, options: ReadOptions) -> int: """Extract a table via Arrow RecordBatch iterator → Parquet.""" parquet_path = self.data_dir / f"{table}.parquet" writer = None total_rows = 0 for batch in connector.read(table, options): if writer is None: writer = pq.ParquetWriter(str(parquet_path), batch.schema) writer.write_batch(batch) total_rows += batch.num_rows if writer: writer.close() return total_rows def _build_extract_db(self, available: list[TableInfo], tables: list[str]): """Build extract.duckdb with _meta and views — atomic swap.""" tmp_db = self.output_dir / "extract.duckdb.tmp" if tmp_db.exists(): tmp_db.unlink() con = duckdb.connect(str(tmp_db)) try: # _meta table con.execute(""" CREATE TABLE _meta ( table_name VARCHAR NOT NULL, description VARCHAR, rows BIGINT, size_bytes BIGINT, extracted_at TIMESTAMP DEFAULT current_timestamp, query_mode VARCHAR DEFAULT 'local', schema_json VARCHAR ) """) for table_name in tables: parquet_path = self.data_dir / f"{table_name}.parquet" if parquet_path.exists(): # Create view pointing to parquet con.execute( f'CREATE VIEW "{table_name}" AS ' f"SELECT * FROM read_parquet('{parquet_path}')" ) # Get row count + size rows = con.execute(f'SELECT count(*) FROM "{table_name}"').fetchone()[0] size = parquet_path.stat().st_size # Find description and schema info = self._find_table(available, table_name) desc = info.description if info else "" schema_json = info.schema.to_string() if info else "" con.execute( "INSERT INTO _meta (table_name, description, rows, size_bytes, schema_json) " "VALUES (?, ?, ?, ?, ?)", [table_name, desc, rows, size, schema_json], ) finally: con.close() # Atomic swap if self.db_path.exists(): self.db_path.unlink() # Clean WAL if exists wal = Path(str(tmp_db) + ".wal") if wal.exists(): wal.unlink() tmp_db.rename(self.db_path) def _find_table(self, available: list[TableInfo], name: str) -> TableInfo | None: return next((t for t in available if t.name == name), None) def _check_schema_evolution(self, table: str, new_schema: pa.Schema) -> str | None: """Detect schema changes by comparing Arrow schemas.""" schema_file = self.output_dir / f".schema_{table}.arrow" if schema_file.exists(): reader = pa.ipc.open_stream(schema_file.read_bytes()) old_schema = reader.schema if old_schema != new_schema: # Diff: added, removed, changed fields old_names = set(old_schema.names) new_names = set(new_schema.names) added = new_names - old_names removed = old_names - new_names msg = f"{table}: " if added: msg += f"+{added} " if removed: msg += f"-{removed} " # Check type changes for common fields for name in old_names & new_names: old_type = old_schema.field(name).type new_type = new_schema.field(name).type if old_type != new_type: msg += f"{name}:{old_type}→{new_type} " # Save new schema self._save_schema(table, new_schema) return msg.strip() else: self._save_schema(table, new_schema) return None def _save_schema(self, table: str, schema: pa.Schema): """Serialize Arrow schema via IPC stream (compatible with all PyArrow versions).""" schema_file = self.output_dir / f".schema_{table}.arrow" sink = pa.BufferOutputStream() writer = pa.ipc.new_stream(sink, schema) writer.close() schema_file.write_bytes(sink.getvalue().to_pybytes()) def _build_read_options(self, table: str, available: list[TableInfo]) -> ReadOptions: """Build ReadOptions with incremental state if available.""" options = ReadOptions() state = self._load_state() if table in state: options.incremental_key = state[table].get("incremental_key") options.incremental_value = state[table].get("incremental_value") return options def _load_state(self) -> dict: if self.state_path.exists(): return yaml.safe_load(self.state_path.read_text()) or {} return {} def _save_state(self, tables: list[str]): state = self._load_state() for table in tables: if table not in state: state[table] = {} state[table]["last_extracted"] = str(duckdb.query("SELECT current_timestamp").fetchone()[0]) self.state_path.write_text(yaml.dump(state)) def _write_remote_attach(self, info: RemoteAttachInfo): """Write _remote_attach info for orchestrator.""" # This gets added to extract.duckdb in _build_extract_db # For POC, store as yaml; real impl writes to DuckDB ra_path = self.output_dir / ".remote_attach.yaml" ra_path.write_text( yaml.dump({"extension": info.extension, "url": info.url, "token_env": info.token_env}) ) # ============================================================================ # Example connectors (proving the contract works) # ============================================================================ class SampleAPIConnector: """ A sample connector simulating an HTTP API source. Proves: ~50 lines for a complete connector implementation. """ capabilities = Cap.DISCOVER | Cap.READ ORDERS_SCHEMA = pa.schema( [ pa.field("id", pa.int64()), pa.field("customer", pa.string()), pa.field("amount", pa.float64()), pa.field("date", pa.string()), ] ) USERS_SCHEMA = pa.schema( [ pa.field("id", pa.int64()), pa.field("name", pa.string()), pa.field("email", pa.string()), ] ) # Simulated API data _data = { "orders": [ {"id": 1, "customer": "Alice", "amount": 100.0, "date": "2026-01-15"}, {"id": 2, "customer": "Bob", "amount": 250.0, "date": "2026-02-01"}, {"id": 3, "customer": "Carol", "amount": 75.5, "date": "2026-03-10"}, ], "users": [ {"id": 1, "name": "Alice", "email": "alice@example.com"}, {"id": 2, "name": "Bob", "email": "bob@example.com"}, ], } def discover(self) -> list[TableInfo]: return [ TableInfo( name="orders", schema=self.ORDERS_SCHEMA, capabilities=Cap.READ, primary_key=["id"], description="Sales orders", ), TableInfo( name="users", schema=self.USERS_SCHEMA, capabilities=Cap.READ, primary_key=["id"], description="Registered users", ), ] def read(self, table: str, options: ReadOptions) -> Iterator[pa.RecordBatch]: data = self._data.get(table, []) schema = self.ORDERS_SCHEMA if table == "orders" else self.USERS_SCHEMA # Simulate batched reading (batch_size controls chunking) for i in range(0, len(data), options.batch_size): chunk = data[i : i + options.batch_size] arrays = [pa.array([row[col] for row in chunk], type=schema.field(col).type) for col in schema.names] yield pa.RecordBatch.from_arrays(arrays, schema=schema) class StreamingConnector: """Proves: async stream capability works.""" capabilities = Cap.DISCOVER | Cap.STREAM EVENTS_SCHEMA = pa.schema( [ pa.field("event_id", pa.string()), pa.field("type", pa.string()), pa.field("payload", pa.string()), ] ) def discover(self) -> list[TableInfo]: return [ TableInfo( name="events", schema=self.EVENTS_SCHEMA, capabilities=Cap.STREAM, description="Real-time events", ) ] async def stream(self, table: str) -> AsyncIterator[pa.RecordBatch]: """Simulate webhook events arriving.""" events = [ {"event_id": "e1", "type": "created", "payload": '{"issue": "PROJ-1"}'}, {"event_id": "e2", "type": "updated", "payload": '{"issue": "PROJ-2"}'}, {"event_id": "e3", "type": "deleted", "payload": '{"issue": "PROJ-3"}'}, ] for event in events: arrays = [pa.array([event[col]], type=self.EVENTS_SCHEMA.field(col).type) for col in self.EVENTS_SCHEMA.names] yield pa.RecordBatch.from_arrays(arrays, schema=self.EVENTS_SCHEMA) class RemoteOnlyConnector: """Proves: remote-only connector (like BigQuery) works.""" capabilities = Cap.DISCOVER | Cap.REMOTE def discover(self) -> list[TableInfo]: return [ TableInfo( name="big_table", schema=pa.schema([pa.field("id", pa.int64()), pa.field("value", pa.string())]), capabilities=Cap.REMOTE, description="Remote-only table, queries go to source", ) ] def remote(self) -> RemoteAttachInfo: return RemoteAttachInfo( extension="bigquery", url="project_id=my-project", token_env="GOOGLE_APPLICATION_CREDENTIALS", ) # ============================================================================ # Tests # ============================================================================ class TestCapabilityFlags: """Test 1: Cap Flag enum works for declaration and checking.""" def test_flag_composition(self): caps = Cap.DISCOVER | Cap.READ | Cap.REMOTE assert Cap.DISCOVER in caps assert Cap.READ in caps assert Cap.REMOTE in caps assert Cap.STREAM not in caps assert Cap.WRITE not in caps def test_per_table_capabilities(self): info = TableInfo( name="orders", schema=pa.schema([pa.field("id", pa.int64())]), capabilities=Cap.READ | Cap.STREAM, ) assert Cap.READ in info.capabilities assert Cap.STREAM in info.capabilities assert Cap.WRITE not in info.capabilities def test_flag_iteration(self): """Can iterate individual flags from a composite.""" caps = Cap.DISCOVER | Cap.READ | Cap.STREAM individual = list(caps) assert len(individual) == 3 assert Cap.DISCOVER in individual class TestProtocolCompliance: """Test 2: Protocol type checking works at runtime.""" def test_sample_connector_is_connector(self): c = SampleAPIConnector() assert isinstance(c, Connector) def test_streaming_connector_partial_protocol(self): """StreamingConnector doesn't implement read() — that's OK. Protocol is structural, not enforced for methods you don't use.""" c = StreamingConnector() assert hasattr(c, "capabilities") assert hasattr(c, "discover") assert Cap.STREAM in c.capabilities def test_remote_connector_is_valid(self): c = RemoteOnlyConnector() assert hasattr(c, "discover") assert hasattr(c, "remote") assert Cap.REMOTE in c.capabilities class TestArrowIntegration: """Test 3: Arrow RecordBatch → DuckDB zero-copy works.""" def test_record_batch_to_duckdb(self): """DuckDB can query Arrow RecordBatches directly.""" schema = pa.schema([pa.field("id", pa.int64()), pa.field("name", pa.string())]) batch = pa.RecordBatch.from_arrays( [pa.array([1, 2, 3]), pa.array(["a", "b", "c"])], schema=schema, ) con = duckdb.connect() result = con.execute("SELECT * FROM batch WHERE id > 1").fetchall() assert len(result) == 2 assert result[0] == (2, "b") def test_record_batch_iterator_to_duckdb(self): """DuckDB can consume an iterator of RecordBatches.""" schema = pa.schema([pa.field("value", pa.float64())]) def generate_batches(): for i in range(3): yield pa.RecordBatch.from_arrays( [pa.array([float(i * 10 + j) for j in range(5)])], schema=schema, ) reader = pa.RecordBatchReader.from_batches(schema, generate_batches()) con = duckdb.connect() result = con.execute("SELECT count(*), sum(value) FROM reader").fetchone() assert result[0] == 15 # 3 batches * 5 rows assert result[1] == sum(float(i * 10 + j) for i in range(3) for j in range(5)) def test_arrow_to_parquet_roundtrip(self): """Arrow → Parquet → DuckDB roundtrip preserves data.""" schema = pa.schema( [ pa.field("id", pa.int64()), pa.field("amount", pa.float64()), pa.field("label", pa.string()), ] ) batch = pa.RecordBatch.from_arrays( [pa.array([1, 2]), pa.array([99.9, 200.0]), pa.array(["x", "y"])], schema=schema, ) with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as f: pq.write_table(pa.Table.from_batches([batch]), f.name) con = duckdb.connect() result = con.execute(f"SELECT * FROM read_parquet('{f.name}')").fetchall() assert result == [(1, 99.9, "x"), (2, 200.0, "y")] os.unlink(f.name) class TestConnectorRuntime: """Test 4: Full runtime pipeline — connector → extract.duckdb.""" @pytest.fixture def output_dir(self, tmp_path): return tmp_path / "extract_test" def test_full_extract_pipeline(self, output_dir): """End-to-end: connector → runtime → extract.duckdb with _meta + views.""" connector = SampleAPIConnector() runtime = ConnectorRuntime(output_dir) stats = runtime.run(connector) # Stats are correct assert stats.tables_extracted == 2 assert stats.tables_failed == 0 assert stats.total_rows == 5 # 3 orders + 2 users assert stats.errors == [] # extract.duckdb exists and is valid db_path = output_dir / "extract.duckdb" assert db_path.exists() con = duckdb.connect(str(db_path), read_only=True) # _meta table has both tables meta = con.execute("SELECT table_name, rows, description FROM _meta ORDER BY table_name").fetchall() assert len(meta) == 2 assert meta[0] == ("orders", 3, "Sales orders") assert meta[1] == ("users", 2, "Registered users") # Views work — can query data through extract.duckdb orders = con.execute("SELECT * FROM orders ORDER BY id").fetchall() assert len(orders) == 3 assert orders[0] == (1, "Alice", 100.0, "2026-01-15") users = con.execute("SELECT * FROM users ORDER BY id").fetchall() assert len(users) == 2 assert users[0][1] == "Alice" # Cross-table query works result = con.execute(""" SELECT u.name, SUM(o.amount) as total FROM orders o JOIN users u ON o.customer = u.name GROUP BY u.name ORDER BY total DESC """).fetchall() assert result[0] == ("Bob", 250.0) assert result[1] == ("Alice", 100.0) con.close() def test_selective_table_extract(self, output_dir): """Can extract specific tables only.""" connector = SampleAPIConnector() runtime = ConnectorRuntime(output_dir) stats = runtime.run(connector, tables=["orders"]) assert stats.tables_extracted == 1 assert stats.total_rows == 3 con = duckdb.connect(str(output_dir / "extract.duckdb"), read_only=True) tables = con.execute("SELECT table_name FROM _meta").fetchall() assert tables == [("orders",)] con.close() def test_incremental_state_tracking(self, output_dir): """Runtime saves and loads incremental state between runs.""" connector = SampleAPIConnector() runtime = ConnectorRuntime(output_dir) # First run runtime.run(connector, tables=["orders"]) # State file exists state_path = output_dir / ".state.yaml" assert state_path.exists() state = yaml.safe_load(state_path.read_text()) assert "orders" in state assert "last_extracted" in state["orders"] # Second run — state persists runtime2 = ConnectorRuntime(output_dir) runtime2.run(connector, tables=["orders"]) state2 = yaml.safe_load(state_path.read_text()) assert "orders" in state2 def test_empty_table_handling(self, output_dir): """Connector that yields nothing for a table doesn't crash.""" class EmptyConnector: capabilities = Cap.DISCOVER | Cap.READ def discover(self) -> list[TableInfo]: return [ TableInfo( name="empty", schema=pa.schema([pa.field("id", pa.int64())]), capabilities=Cap.READ, ) ] def read(self, table: str, options: ReadOptions) -> Iterator[pa.RecordBatch]: return iter([]) # No data runtime = ConnectorRuntime(output_dir) stats = runtime.run(EmptyConnector()) # Extracted 0 rows, but no failure assert stats.tables_extracted == 1 assert stats.total_rows == 0 assert stats.errors == [] def test_error_in_one_table_doesnt_stop_others(self, output_dir): """Partial failure: one table fails, others still extract.""" class PartialFailConnector: capabilities = Cap.DISCOVER | Cap.READ def discover(self) -> list[TableInfo]: return [ TableInfo("good", pa.schema([pa.field("id", pa.int64())]), Cap.READ), TableInfo("bad", pa.schema([pa.field("id", pa.int64())]), Cap.READ), ] def read(self, table: str, options: ReadOptions) -> Iterator[pa.RecordBatch]: if table == "bad": raise ConnectionError("API timeout") yield pa.RecordBatch.from_arrays( [pa.array([1, 2, 3])], schema=pa.schema([pa.field("id", pa.int64())]), ) runtime = ConnectorRuntime(output_dir) stats = runtime.run(PartialFailConnector()) assert stats.tables_extracted == 1 assert stats.tables_failed == 1 assert "bad: API timeout" in stats.errors[0] class TestSchemaEvolution: """Test 5: Schema change detection via Arrow schema diff.""" def test_detect_added_column(self, tmp_path): output_dir = tmp_path / "schema_test" runtime = ConnectorRuntime(output_dir) # V1 schema schema_v1 = pa.schema([pa.field("id", pa.int64()), pa.field("name", pa.string())]) runtime._save_schema("orders", schema_v1) # V2 schema — added column schema_v2 = pa.schema( [ pa.field("id", pa.int64()), pa.field("name", pa.string()), pa.field("email", pa.string()), ] ) change = runtime._check_schema_evolution("orders", schema_v2) assert change is not None assert "email" in change assert "+" in change def test_detect_removed_column(self, tmp_path): output_dir = tmp_path / "schema_test" runtime = ConnectorRuntime(output_dir) schema_v1 = pa.schema( [ pa.field("id", pa.int64()), pa.field("name", pa.string()), pa.field("old_field", pa.string()), ] ) runtime._save_schema("orders", schema_v1) schema_v2 = pa.schema([pa.field("id", pa.int64()), pa.field("name", pa.string())]) change = runtime._check_schema_evolution("orders", schema_v2) assert change is not None assert "old_field" in change assert "-" in change def test_detect_type_change(self, tmp_path): output_dir = tmp_path / "schema_test" runtime = ConnectorRuntime(output_dir) schema_v1 = pa.schema([pa.field("id", pa.int32()), pa.field("value", pa.string())]) runtime._save_schema("data", schema_v1) schema_v2 = pa.schema([pa.field("id", pa.int64()), pa.field("value", pa.string())]) change = runtime._check_schema_evolution("data", schema_v2) assert change is not None assert "int32" in change assert "int64" in change def test_no_change_detected(self, tmp_path): output_dir = tmp_path / "schema_test" runtime = ConnectorRuntime(output_dir) schema = pa.schema([pa.field("id", pa.int64())]) runtime._save_schema("stable", schema) change = runtime._check_schema_evolution("stable", schema) assert change is None def test_first_run_no_previous_schema(self, tmp_path): output_dir = tmp_path / "schema_test" runtime = ConnectorRuntime(output_dir) schema = pa.schema([pa.field("id", pa.int64())]) change = runtime._check_schema_evolution("new_table", schema) assert change is None # First run, no previous schema to compare class TestStreamingCapability: """Test 6: Async streaming connector works.""" def test_async_stream(self): async def _run(): connector = StreamingConnector() batches = [] async for batch in connector.stream("events"): batches.append(batch) return batches batches = asyncio.run(_run()) assert len(batches) == 3 assert batches[0].num_rows == 1 assert batches[0].column("type")[0].as_py() == "created" def test_stream_to_duckdb(self): """Stream batches can be consumed by DuckDB.""" async def _run(): connector = StreamingConnector() all_batches = [] async for batch in connector.stream("events"): all_batches.append(batch) return all_batches all_batches = asyncio.run(_run()) arrow_table = pa.Table.from_batches(all_batches) con = duckdb.connect() result = con.execute("SELECT count(*) FROM arrow_table").fetchone() assert result[0] == 3 class TestRemoteOnlyConnector: """Test 7: Remote-only connector produces correct metadata.""" def test_remote_attach_info(self, tmp_path): output_dir = tmp_path / "remote_test" connector = RemoteOnlyConnector() runtime = ConnectorRuntime(output_dir) stats = runtime.run(connector) # No tables extracted (remote only), but no errors assert stats.tables_extracted == 0 assert stats.errors == [] # Remote attach info saved ra_path = output_dir / ".remote_attach.yaml" assert ra_path.exists() ra = yaml.safe_load(ra_path.read_text()) assert ra["extension"] == "bigquery" assert ra["token_env"] == "GOOGLE_APPLICATION_CREDENTIALS" class TestManifestValidation: """Test 8: YAML manifest parsing and validation.""" SAMPLE_MANIFEST = """ name: sample_api version: "1.0.0" description: "Sample API connector" entrypoint: connectors.sample.SampleAPIConnector capabilities: [discover, read] auth: type: token env_vars: - name: SAMPLE_API_TOKEN required: true description: "API authentication token" config: base_url: type: string required: true batch_size: type: integer required: false default: 1000 health_check: endpoint: "${base_url}/health" method: GET expect_status: 200 """ def test_manifest_parses(self): manifest = yaml.safe_load(self.SAMPLE_MANIFEST) assert manifest["name"] == "sample_api" assert manifest["version"] == "1.0.0" assert "discover" in manifest["capabilities"] assert "read" in manifest["capabilities"] def test_manifest_capabilities_to_flags(self): manifest = yaml.safe_load(self.SAMPLE_MANIFEST) cap_map = {c.name.lower(): c for c in Cap} flags = Cap(0) for c in manifest["capabilities"]: flags |= cap_map[c] assert Cap.DISCOVER in flags assert Cap.READ in flags assert Cap.STREAM not in flags def test_manifest_auth_config(self): manifest = yaml.safe_load(self.SAMPLE_MANIFEST) assert manifest["auth"]["type"] == "token" assert manifest["auth"]["env_vars"][0]["name"] == "SAMPLE_API_TOKEN" assert manifest["auth"]["env_vars"][0]["required"] is True def test_manifest_config_schema(self): manifest = yaml.safe_load(self.SAMPLE_MANIFEST) assert manifest["config"]["base_url"]["required"] is True assert manifest["config"]["batch_size"]["default"] == 1000 def test_manifest_health_check(self): manifest = yaml.safe_load(self.SAMPLE_MANIFEST) hc = manifest["health_check"] assert "${base_url}" in hc["endpoint"] assert hc["expect_status"] == 200 class TestDiscoveryToReadPipeline: """Test 9: Full discovery → read → query pipeline.""" def test_discover_then_read_all(self, tmp_path): """discover() → pick tables → read() → query in DuckDB.""" connector = SampleAPIConnector() # Step 1: Discovery tables = connector.discover() assert len(tables) == 2 assert all(isinstance(t, TableInfo) for t in tables) assert all(t.schema is not None for t in tables) # Step 2: Read via runtime (auto-discovers all tables) runtime = ConnectorRuntime(tmp_path / "full_pipeline") stats = runtime.run(connector) # No tables= arg → discovers automatically assert stats.tables_extracted == 2 # Step 3: Query con = duckdb.connect(str(tmp_path / "full_pipeline" / "extract.duckdb"), read_only=True) result = con.execute(""" SELECT table_name, rows, description FROM _meta ORDER BY table_name """).fetchall() assert result[0][0] == "orders" assert result[0][1] == 3 con.close() class TestLargeDataBatching: """Test 10: Connector can handle large data via batched iteration.""" def test_batched_read_memory_constant(self, tmp_path): """Large dataset extracted in batches — memory doesn't explode.""" class LargeConnector: capabilities = Cap.DISCOVER | Cap.READ NUM_BATCHES = 100 BATCH_SIZE = 1000 def discover(self) -> list[TableInfo]: return [ TableInfo( "big_table", pa.schema([pa.field("id", pa.int64()), pa.field("value", pa.float64())]), Cap.READ, ) ] def read(self, table: str, options: ReadOptions) -> Iterator[pa.RecordBatch]: schema = pa.schema([pa.field("id", pa.int64()), pa.field("value", pa.float64())]) for batch_num in range(self.NUM_BATCHES): start = batch_num * self.BATCH_SIZE yield pa.RecordBatch.from_arrays( [ pa.array(range(start, start + self.BATCH_SIZE), type=pa.int64()), pa.array( [float(i) * 0.1 for i in range(start, start + self.BATCH_SIZE)], type=pa.float64(), ), ], schema=schema, ) runtime = ConnectorRuntime(tmp_path / "large_test") stats = runtime.run(LargeConnector()) assert stats.total_rows == 100_000 assert stats.tables_extracted == 1 # Verify DuckDB can read it con = duckdb.connect(str(tmp_path / "large_test" / "extract.duckdb"), read_only=True) count = con.execute("SELECT count(*) FROM big_table").fetchone()[0] assert count == 100_000 con.close()