CLAUDE.md rewritten (708 -> ~320 lines): four overlapping release sections collapsed to one, stale v1->v35 schema history dropped (it lives in CHANGELOG), marketplace endpoint internals and verbose process sections moved out or tightened. New focused docs: - docs/RELEASING.md - release process, deploy workflows, CI quirks (RELEASE_TEMPLATE.md folded in as an appendix) - docs/marketplace.md - marketplace ingestion + re-serving internals - docs/README.md - documentation index by audience, linked from README.md and CLAUDE.md Archived under docs/archive/: docs/superpowers/ (52 historical planning artifacts), HACKATHON.md, pd-ps-comments.md, security-audit-2026-04.md, future/NOTIFICATIONS.md. Removed the docs/auto-install.md stub. Fixed dangling links in connectors/jira/README.md and dev_docs/README.md, repointed code/doc references to archived paths.
567 lines
19 KiB
Markdown
567 lines
19 KiB
Markdown
# Metadata Writer Implementation Plan
|
|
|
|
> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
|
|
|
|
**Goal:** Add column metadata management — discover basetypes/descriptions, store in DuckDB, push back to Keboola Storage API.
|
|
|
|
**Architecture:** `column_metadata` table (created in schema v4 by the metrics plan). New `ColumnMetadataRepository` following `table_registry.py` pattern. CLI subcommands under `da admin metadata`. API endpoints under `/api/admin/metadata/`. Keboola push uses Storage API v2.
|
|
|
|
**Tech Stack:** DuckDB, FastAPI, Typer, httpx (for Keboola API push), PyArrow (for schema introspection)
|
|
|
|
**Spec:** `docs/superpowers/specs/2026-04-10-porting-internal-features-design.md` — Section 3
|
|
|
|
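**Depends on:** Business Metrics plan (Task 1 — schema v4 creates `column_metadata` table). The upsert in this plan's `ColumnMetadataRepository.save` uses `ON CONFLICT (table_id, column_name)`, so that table must declare a primary key or unique constraint on `(table_id, column_name)`.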
|
|
|
|
---
|
|
|
|
### Task 1: ColumnMetadataRepository
|
|
|
|
**Files:**
|
|
- Create: `src/repositories/column_metadata.py`
|
|
- Test: `tests/test_column_metadata.py`
|
|
|
|
- [ ] **Step 1: Write failing tests**
|
|
|
|
Create `tests/test_column_metadata.py`:
|
|
|
|
```python
|
|
"""Tests for ColumnMetadataRepository."""
|
|
|
|
import os
|
|
import json
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
import duckdb
|
|
|
|
|
|
@pytest.fixture
|
|
def db_conn(tmp_path, monkeypatch):
|
|
monkeypatch.setenv("DATA_DIR", str(tmp_path))
|
|
from src.db import get_system_db
|
|
conn = get_system_db()
|
|
yield conn
|
|
conn.close()
|
|
|
|
|
|
class TestColumnMetadataCreate:
|
|
def test_save_single_column(self, db_conn):
|
|
from src.repositories.column_metadata import ColumnMetadataRepository
|
|
repo = ColumnMetadataRepository(db_conn)
|
|
repo.save("orders", "total_amount", basetype="NUMERIC", description="Order total in USD")
|
|
result = repo.get("orders", "total_amount")
|
|
assert result is not None
|
|
assert result["basetype"] == "NUMERIC"
|
|
assert result["description"] == "Order total in USD"
|
|
assert result["confidence"] == "manual"
|
|
|
|
def test_upsert_overwrites(self, db_conn):
|
|
from src.repositories.column_metadata import ColumnMetadataRepository
|
|
repo = ColumnMetadataRepository(db_conn)
|
|
repo.save("orders", "total_amount", basetype="NUMERIC", description="v1")
|
|
repo.save("orders", "total_amount", basetype="FLOAT", description="v2")
|
|
result = repo.get("orders", "total_amount")
|
|
assert result["basetype"] == "FLOAT"
|
|
assert result["description"] == "v2"
|
|
|
|
|
|
class TestColumnMetadataRead:
|
|
def test_list_for_table(self, db_conn):
|
|
from src.repositories.column_metadata import ColumnMetadataRepository
|
|
repo = ColumnMetadataRepository(db_conn)
|
|
repo.save("orders", "id", basetype="STRING")
|
|
repo.save("orders", "total", basetype="NUMERIC")
|
|
repo.save("users", "email", basetype="STRING")
|
|
results = repo.list_for_table("orders")
|
|
assert len(results) == 2
|
|
names = {r["column_name"] for r in results}
|
|
assert names == {"id", "total"}
|
|
|
|
def test_get_missing(self, db_conn):
|
|
from src.repositories.column_metadata import ColumnMetadataRepository
|
|
repo = ColumnMetadataRepository(db_conn)
|
|
assert repo.get("x", "y") is None
|
|
|
|
|
|
class TestColumnMetadataDelete:
|
|
def test_delete_column(self, db_conn):
|
|
from src.repositories.column_metadata import ColumnMetadataRepository
|
|
repo = ColumnMetadataRepository(db_conn)
|
|
repo.save("orders", "total", basetype="NUMERIC")
|
|
assert repo.delete("orders", "total") is True
|
|
assert repo.get("orders", "total") is None
|
|
|
|
def test_delete_missing(self, db_conn):
|
|
from src.repositories.column_metadata import ColumnMetadataRepository
|
|
repo = ColumnMetadataRepository(db_conn)
|
|
assert repo.delete("x", "y") is False
|
|
|
|
|
|
class TestColumnMetadataProposal:
|
|
def test_import_proposal(self, db_conn, tmp_path):
|
|
from src.repositories.column_metadata import ColumnMetadataRepository
|
|
repo = ColumnMetadataRepository(db_conn)
|
|
|
|
proposal = {
|
|
"project": {"name": "sales"},
|
|
"generated_at": "2026-04-10T12:00:00",
|
|
"tables": {
|
|
"orders": {
|
|
"columns": {
|
|
"id": {"basetype": "STRING", "description": "Order ID", "confidence": "high"},
|
|
"total": {"basetype": "NUMERIC", "description": "Total amount", "confidence": "medium"},
|
|
}
|
|
}
|
|
},
|
|
}
|
|
proposal_path = tmp_path / "proposal.json"
|
|
proposal_path.write_text(json.dumps(proposal))
|
|
|
|
count = repo.import_proposal(proposal_path)
|
|
assert count == 2
|
|
assert repo.get("orders", "id")["basetype"] == "STRING"
|
|
assert repo.get("orders", "total")["confidence"] == "medium"
|
|
|
|
def test_import_proposal_sets_source(self, db_conn, tmp_path):
|
|
from src.repositories.column_metadata import ColumnMetadataRepository
|
|
repo = ColumnMetadataRepository(db_conn)
|
|
|
|
proposal = {
|
|
"tables": {
|
|
"orders": {
|
|
"columns": {
|
|
"id": {"basetype": "STRING", "description": "test", "confidence": "high"},
|
|
}
|
|
}
|
|
},
|
|
}
|
|
(tmp_path / "p.json").write_text(json.dumps(proposal))
|
|
repo.import_proposal(tmp_path / "p.json")
|
|
assert repo.get("orders", "id")["source"] == "ai_enrichment"
|
|
```
|
|
|
|
- [ ] **Step 2: Run tests to verify they fail**
|
|
|
|
Run: `pytest tests/test_column_metadata.py -v`
|
|
Expected: FAIL — `ModuleNotFoundError`
|
|
|
|
- [ ] **Step 3: Implement ColumnMetadataRepository**
|
|
|
|
Create `src/repositories/column_metadata.py`:
|
|
|
|
```python
|
|
"""Repository for column metadata (descriptions, basetypes)."""
|
|
|
|
import json
|
|
import logging
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Any, Dict, List, Optional
|
|
|
|
import duckdb
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class ColumnMetadataRepository:
|
|
def __init__(self, conn: duckdb.DuckDBPyConnection):
|
|
self.conn = conn
|
|
|
|
def save(self, table_id: str, column_name: str,
|
|
basetype: Optional[str] = None,
|
|
description: Optional[str] = None,
|
|
confidence: str = "manual",
|
|
source: str = "manual") -> Dict[str, Any]:
|
|
now = datetime.now(timezone.utc)
|
|
self.conn.execute(
|
|
"""INSERT INTO column_metadata (table_id, column_name, basetype, description, confidence, source, updated_at)
|
|
VALUES (?, ?, ?, ?, ?, ?, ?)
|
|
ON CONFLICT (table_id, column_name) DO UPDATE SET
|
|
basetype = excluded.basetype,
|
|
description = excluded.description,
|
|
confidence = excluded.confidence,
|
|
source = excluded.source,
|
|
updated_at = excluded.updated_at""",
|
|
[table_id, column_name, basetype, description, confidence, source, now],
|
|
)
|
|
return self.get(table_id, column_name)
|
|
|
|
def get(self, table_id: str, column_name: str) -> Optional[Dict[str, Any]]:
|
|
result = self.conn.execute(
|
|
"SELECT * FROM column_metadata WHERE table_id = ? AND column_name = ?",
|
|
[table_id, column_name],
|
|
).fetchone()
|
|
if not result:
|
|
return None
|
|
columns = [desc[0] for desc in self.conn.description]
|
|
return dict(zip(columns, result))
|
|
|
|
def list_for_table(self, table_id: str) -> List[Dict[str, Any]]:
|
|
results = self.conn.execute(
|
|
"SELECT * FROM column_metadata WHERE table_id = ? ORDER BY column_name",
|
|
[table_id],
|
|
).fetchall()
|
|
if not results:
|
|
return []
|
|
columns = [desc[0] for desc in self.conn.description]
|
|
return [dict(zip(columns, row)) for row in results]
|
|
|
|
def delete(self, table_id: str, column_name: str) -> bool:
|
|
existing = self.get(table_id, column_name)
|
|
if not existing:
|
|
return False
|
|
self.conn.execute(
|
|
"DELETE FROM column_metadata WHERE table_id = ? AND column_name = ?",
|
|
[table_id, column_name],
|
|
)
|
|
return True
|
|
|
|
def import_proposal(self, proposal_path) -> int:
|
|
"""Import a metadata proposal JSON file. Returns count of columns imported."""
|
|
path = Path(proposal_path)
|
|
data = json.loads(path.read_text())
|
|
count = 0
|
|
|
|
tables = data.get("tables", {})
|
|
for table_id, table_data in tables.items():
|
|
columns = table_data.get("columns", {})
|
|
for col_name, col_data in columns.items():
|
|
self.save(
|
|
table_id=table_id,
|
|
column_name=col_name,
|
|
basetype=col_data.get("basetype"),
|
|
description=col_data.get("description"),
|
|
confidence=col_data.get("confidence", "medium"),
|
|
source="ai_enrichment",
|
|
)
|
|
count += 1
|
|
|
|
return count
|
|
```
|
|
|
|
- [ ] **Step 4: Run tests to verify they pass**
|
|
|
|
Run: `pytest tests/test_column_metadata.py -v`
|
|
Expected: ALL PASS
|
|
|
|
- [ ] **Step 5: Commit**
|
|
|
|
```bash
|
|
git add src/repositories/column_metadata.py tests/test_column_metadata.py
|
|
git commit -m "feat: add ColumnMetadataRepository with CRUD and proposal import"
|
|
```
|
|
|
|
---
|
|
|
|
### Task 2: CLI Subcommands `da admin metadata`
|
|
|
|
**Files:**
|
|
- Modify: `cli/commands/admin.py` (add metadata subcommands)
|
|
- Test: `tests/test_cli.py`
|
|
|
|
- [ ] **Step 1: Write failing test**
|
|
|
|
Add to `tests/test_cli.py` in `TestCLIHelp`:
|
|
|
|
```python
|
|
def test_admin_metadata_help(self):
|
|
result = runner.invoke(app, ["admin", "metadata-show", "--help"])
|
|
assert result.exit_code == 0
|
|
```
|
|
|
|
- [ ] **Step 2: Run test to verify it fails**
|
|
|
|
Run: `pytest tests/test_cli.py::TestCLIHelp::test_admin_metadata_help -v`
|
|
Expected: FAIL — `No such command 'metadata-show'`
|
|
|
|
- [ ] **Step 3: Add metadata commands to admin.py**
|
|
|
|
Add to `cli/commands/admin.py`:
|
|
|
|
```python
|
|
@admin_app.command("metadata-show")
|
|
def metadata_show(
|
|
table_id: str = typer.Argument(..., help="Table ID"),
|
|
as_json: bool = typer.Option(False, "--json"),
|
|
):
|
|
"""Show column metadata for a table."""
|
|
resp = api_get(f"/api/admin/metadata/{table_id}")
|
|
if resp.status_code != 200:
|
|
typer.echo(f"Failed: {resp.json().get('detail', resp.text)}", err=True)
|
|
raise typer.Exit(1)
|
|
|
|
columns = resp.json().get("columns", [])
|
|
if as_json:
|
|
typer.echo(json.dumps(columns, indent=2))
|
|
else:
|
|
if not columns:
|
|
typer.echo(f"No metadata for table '{table_id}'")
|
|
return
|
|
typer.echo(f"\n Metadata for {table_id}:")
|
|
for c in columns:
|
|
desc = c.get("description") or "-"
|
|
typer.echo(f" {c['column_name']:30s} {c.get('basetype') or '?':12s} {desc}")
|
|
|
|
|
|
@admin_app.command("metadata-apply")
|
|
def metadata_apply(
|
|
proposal_path: str = typer.Argument(..., help="Path to proposal JSON file"),
|
|
push_to_source: bool = typer.Option(False, "--push-to-source", help="Push to Keboola Storage API"),
|
|
dry_run: bool = typer.Option(False, "--dry-run", help="Show changes without applying"),
|
|
):
|
|
"""Apply a metadata proposal (JSON) to DuckDB and optionally push to source."""
|
|
from pathlib import Path
|
|
|
|
path = Path(proposal_path)
|
|
if not path.exists():
|
|
typer.echo(f"File not found: {proposal_path}", err=True)
|
|
raise typer.Exit(1)
|
|
|
|
import json as json_mod
|
|
data = json_mod.loads(path.read_text())
|
|
tables = data.get("tables", {})
|
|
|
|
if dry_run:
|
|
for table_id, td in tables.items():
|
|
for col, cd in td.get("columns", {}).items():
|
|
typer.echo(f" {table_id}.{col}: {cd.get('basetype', '?')} — {cd.get('description', '-')}")
|
|
typer.echo(f"\nDry run: {sum(len(td.get('columns', {})) for td in tables.values())} columns would be applied")
|
|
return
|
|
|
|
from src.db import get_system_db
|
|
from src.repositories.column_metadata import ColumnMetadataRepository
|
|
|
|
conn = get_system_db()
|
|
try:
|
|
repo = ColumnMetadataRepository(conn)
|
|
count = repo.import_proposal(path)
|
|
typer.echo(f"Applied {count} column metadata entries to DuckDB")
|
|
finally:
|
|
conn.close()
|
|
|
|
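if push_to_source:
    # The push endpoint defined in Task 3 is per-table:
    # POST /api/admin/metadata/{table_id}/push (it reads metadata from the server DB, no request body).
    push_failed = False
    for table_id in tables:
        resp = api_post(f"/api/admin/metadata/{table_id}/push", json={})
        if resp.status_code == 200:
            typer.echo(f"Pushed metadata for {table_id} to source system")
        else:
            typer.echo(f"Push failed for {table_id}: {resp.json().get('detail', resp.text)}", err=True)
            push_failed = True
    if push_failed:
        raise typer.Exit(1)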
|
|
```
|
|
|
|
- [ ] **Step 4: Run tests to verify they pass**
|
|
|
|
Run: `pytest tests/test_cli.py::TestCLIHelp::test_admin_metadata_help -v`
|
|
Expected: PASS
|
|
|
|
- [ ] **Step 5: Commit**
|
|
|
|
```bash
|
|
git add cli/commands/admin.py tests/test_cli.py
|
|
git commit -m "feat: add da admin metadata-show and metadata-apply commands"
|
|
```
|
|
|
|
---
|
|
|
|
### Task 3: API Endpoints
|
|
|
|
**Files:**
|
|
- Create: `app/api/metadata.py`
|
|
- Modify: `app/main.py` (register router)
|
|
- Test: `tests/test_api.py`
|
|
|
|
- [ ] **Step 1: Write failing tests**
|
|
|
|
Add to `tests/test_api.py`:
|
|
|
|
```python
|
|
class TestMetadataAPI:
|
|
def test_get_metadata_empty(self, seeded_client):
|
|
client, admin_token, _ = seeded_client
|
|
resp = client.get("/api/admin/metadata/orders",
|
|
headers={"Authorization": f"Bearer {admin_token}"})
|
|
assert resp.status_code == 200
|
|
assert resp.json()["columns"] == []
|
|
|
|
def test_save_and_get_metadata(self, seeded_client):
|
|
client, admin_token, _ = seeded_client
|
|
resp = client.post(
|
|
"/api/admin/metadata/orders",
|
|
json={"columns": [
|
|
{"column_name": "id", "basetype": "STRING", "description": "Order ID"},
|
|
{"column_name": "total", "basetype": "NUMERIC", "description": "Total amount"},
|
|
]},
|
|
headers={"Authorization": f"Bearer {admin_token}"},
|
|
)
|
|
assert resp.status_code == 200
|
|
assert resp.json()["count"] == 2
|
|
|
|
resp = client.get("/api/admin/metadata/orders",
|
|
headers={"Authorization": f"Bearer {admin_token}"})
|
|
assert len(resp.json()["columns"]) == 2
|
|
|
|
def test_analyst_cannot_save_metadata(self, seeded_client):
|
|
client, _, analyst_token = seeded_client
|
|
resp = client.post(
|
|
"/api/admin/metadata/orders",
|
|
json={"columns": [{"column_name": "id", "basetype": "STRING"}]},
|
|
headers={"Authorization": f"Bearer {analyst_token}"},
|
|
)
|
|
assert resp.status_code == 403
|
|
```
|
|
|
|
- [ ] **Step 2: Run tests to verify they fail**
|
|
|
|
Run: `pytest tests/test_api.py::TestMetadataAPI -v`
|
|
Expected: FAIL — 404 on `/api/admin/metadata/orders`
|
|
|
|
- [ ] **Step 3: Implement API router**
|
|
|
|
Create `app/api/metadata.py`:
|
|
|
|
```python
|
|
"""Column metadata API endpoints."""
|
|
|
|
from typing import List, Optional
|
|
|
|
from fastapi import APIRouter, Depends, HTTPException
|
|
from pydantic import BaseModel
|
|
import duckdb
|
|
|
|
from app.auth.dependencies import get_current_user, require_admin, _get_db
|
|
from src.repositories.column_metadata import ColumnMetadataRepository
|
|
|
|
router = APIRouter(tags=["metadata"])
|
|
|
|
|
|
class ColumnMetadataItem(BaseModel):
|
|
column_name: str
|
|
basetype: Optional[str] = None
|
|
description: Optional[str] = None
|
|
confidence: str = "manual"
|
|
|
|
|
|
class ColumnMetadataSave(BaseModel):
|
|
columns: List[ColumnMetadataItem]
|
|
|
|
|
|
@router.get("/api/admin/metadata/{table_id}")
|
|
async def get_table_metadata(
|
|
table_id: str,
|
|
user: dict = Depends(get_current_user),
|
|
conn: duckdb.DuckDBPyConnection = Depends(_get_db),
|
|
):
|
|
repo = ColumnMetadataRepository(conn)
|
|
columns = repo.list_for_table(table_id)
|
|
return {"table_id": table_id, "columns": columns}
|
|
|
|
|
|
@router.post("/api/admin/metadata/{table_id}")
|
|
async def save_table_metadata(
|
|
table_id: str,
|
|
body: ColumnMetadataSave,
|
|
user: dict = Depends(require_admin),
|
|
conn: duckdb.DuckDBPyConnection = Depends(_get_db),
|
|
):
|
|
repo = ColumnMetadataRepository(conn)
|
|
for col in body.columns:
|
|
repo.save(
|
|
table_id=table_id,
|
|
column_name=col.column_name,
|
|
basetype=col.basetype,
|
|
description=col.description,
|
|
confidence=col.confidence,
|
|
source="api",
|
|
)
|
|
return {"status": "ok", "table_id": table_id, "count": len(body.columns)}
|
|
|
|
|
|
@router.post("/api/admin/metadata/{table_id}/push")
|
|
async def push_metadata_to_source(
|
|
table_id: str,
|
|
user: dict = Depends(require_admin),
|
|
conn: duckdb.DuckDBPyConnection = Depends(_get_db),
|
|
):
|
|
"""Push column metadata to the source system (Keboola only)."""
|
|
from src.repositories.table_registry import TableRegistryRepository
|
|
table_repo = TableRegistryRepository(conn)
|
|
table = table_repo.get(table_id)
|
|
|
|
if not table:
|
|
raise HTTPException(status_code=404, detail=f"Table not found: {table_id}")
|
|
if table.get("source_type") != "keboola":
|
|
raise HTTPException(status_code=400, detail="Push only supported for Keboola source tables")
|
|
|
|
meta_repo = ColumnMetadataRepository(conn)
|
|
columns = meta_repo.list_for_table(table_id)
|
|
if not columns:
|
|
raise HTTPException(status_code=400, detail="No metadata to push")
|
|
|
|
# Build Keboola API payload
|
|
import os
|
|
import httpx
|
|
|
|
stack_url = os.environ.get("KBC_STACK_URL", "")
|
|
token = os.environ.get("KBC_STORAGE_TOKEN", "")
|
|
if not stack_url or not token:
|
|
raise HTTPException(status_code=400, detail="KBC_STACK_URL and KBC_STORAGE_TOKEN must be set")
|
|
|
|
source_table = table.get("source_table", table_id)
|
|
columns_metadata = {}
|
|
for col in columns:
|
|
entries = []
|
|
if col.get("basetype"):
|
|
entries.append({"key": "KBC.datatype.basetype", "value": col["basetype"]})
|
|
if col.get("description"):
|
|
entries.append({"key": "KBC.description", "value": col["description"]})
|
|
if entries:
|
|
columns_metadata[col["column_name"]] = entries
|
|
|
|
try:
|
|
resp = httpx.post(
|
|
f"{stack_url}/v2/storage/tables/{source_table}/metadata",
|
|
headers={"X-StorageApi-Token": token},
|
|
json={"provider": "ai-metadata-enrichment", "columnsMetadata": columns_metadata},
|
|
timeout=30,
|
|
)
|
|
resp.raise_for_status()
|
|
return {"status": "pushed", "table_id": table_id, "columns": len(columns_metadata)}
|
|
except httpx.HTTPStatusError as e:
|
|
raise HTTPException(status_code=502, detail=f"Keboola API error: {e.response.text}")
|
|
```
|
|
|
|
Register in `app/main.py`:
|
|
|
|
```python
|
|
from app.api.metadata import router as metadata_router
|
|
# ... (add near other router imports)
|
|
|
|
# In create_app(), add before web_router:
|
|
app.include_router(metadata_router)
|
|
```
|
|
|
|
- [ ] **Step 4: Run tests to verify they pass**
|
|
|
|
Run: `pytest tests/test_api.py::TestMetadataAPI -v`
|
|
Expected: ALL PASS
|
|
|
|
- [ ] **Step 5: Commit**
|
|
|
|
```bash
|
|
git add app/api/metadata.py app/main.py tests/test_api.py
|
|
git commit -m "feat: add column metadata API with Keboola push support"
|
|
```
|
|
|
|
---
|
|
|
|
### Task 4: Final Integration
|
|
|
|
- [ ] **Step 1: Run full test suite**
|
|
|
|
Run: `pytest tests/ -v --timeout=60` (the `--timeout` flag is provided by the `pytest-timeout` plugin; install it or drop the flag)
|
|
Expected: ALL PASS
|
|
|
|
- [ ] **Step 2: Commit if any fixes needed**
|
|
|
|
```bash
|
|
git add -A
|
|
git commit -m "fix: address metadata writer integration issues"
|
|
```
|