diff --git a/.gitignore b/.gitignore index e1f359b..4a42bdc 100644 --- a/.gitignore +++ b/.gitignore @@ -117,7 +117,7 @@ docs/data_description.md .github/workflows/deploy.yml # Project-specific: Data directory -# Downloaded data from Keboola - never commit +# Downloaded source data - never commit data/ # Metadata tooling - entire folder diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md index 440c50d..c555d0a 100644 --- a/ARCHITECTURE.md +++ b/ARCHITECTURE.md @@ -10,7 +10,7 @@ Data Source (Keboola / CSV / BigQuery) | Data Broker Server | | | | src/data_sync.py | -| -> src/adapters/*.py (fetch data) | +| -> connectors/*.py (fetch data) | | -> src/parquet_manager.py (convert) | | | | /data/src_data/parquet/ (output) | @@ -37,9 +37,8 @@ Pulls data from configured source, converts to Parquet. | File | Role | |------|------| | `src/data_sync.py` | Orchestration + `DataSource` ABC (line 149) | -| `src/adapters/base.py` | Adapter interface | -| `src/adapters/keboola_adapter.py` | Keboola Storage adapter | -| `src/keboola_client.py` | Low-level Keboola API client | +| `connectors/keboola/adapter.py` | Keboola data source | +| `connectors/keboola/client.py` | Low-level Keboola API client | | `src/parquet_manager.py` | CSV -> typed Parquet conversion | | `src/config.py` | Reads `data_description.md` for table definitions | | `src/profiler.py` | Data profiling for catalog UI | @@ -129,7 +128,7 @@ inject_config() context processor ## Key Patterns -- **Adapter pattern**: Factory in `src/adapters/__init__.py`, ABC in `src/data_sync.py` +- **Connector pattern**: Dynamic connector registry in `src/data_sync.py`, `connectors/keboola/` for reference - **Atomic writes**: `tempfile.mkstemp()` + `os.fchmod()` + `os.replace()` for JSON state files - **User home writes**: `sudo install -o {user} -g {user}` for writing to analyst home dirs - **Config interpolation**: `${ENV_VAR}` in YAML resolved at load time, missing vars logged as warnings diff --git a/CLAUDE.md b/CLAUDE.md index 
fef725c..58912c3 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -33,11 +33,13 @@ Ask the user for: ``` ├── src/ # Core data sync engine -│ ├── adapters/ # Data source adapters (Keboola, CSV, etc.) │ ├── config.py # Configuration from data_description.md -│ ├── data_sync.py # Sync orchestration +│ ├── data_sync.py # Sync orchestration + DataSource ABC │ ├── parquet_manager.py # Parquet file management │ └── profiler.py # Data profiling +├── connectors/ # Data source connectors +│ ├── keboola/ # Keboola Storage connector +│ └── jira/ # Jira webhook connector ├── webapp/ # Flask web portal (login, dashboard, API) ├── server/ # Server deployment (systemd, scripts) ├── scripts/ # Utility scripts (sync, DuckDB setup) @@ -97,8 +99,8 @@ python -m src.data_sync ## Data Source Adapters -The platform supports pluggable data sources via `src/adapters/`: -- **Keboola** (`keboola`): Syncs from Keboola Storage API +The platform supports pluggable data sources via `connectors/`: +- **Keboola** (`keboola`): Syncs from Keboola Storage API (see `connectors/keboola/`) - **CSV** (`csv`): Import from local CSV files (planned) - **BigQuery** (`bigquery`): Query from Google BigQuery (planned) @@ -136,11 +138,11 @@ When reopening the project in Claude Code: 4. `inject_config()` context processor exposes `Config` to all Jinja templates 5. Templates use `{{ config.INSTANCE_NAME }}`, `{{ config.INSTANCE_SUBTITLE }}`, etc. 
-### Adapter Pattern -- Factory: `src/adapters/__init__.py` -> `create_data_source(adapter_type, **kwargs)` -- ABC: `DataSource` class in `src/data_sync.py` (lines 149-172) -- Keboola: `src/adapters/keboola_adapter.py` -> thin facade wrapping `LocalKeboolaSource` -- Core Keboola logic: `src/keboola_client.py` (788 lines, Keboola Storage API wrapper) +### Connector Pattern +- ABC: `DataSource` class in `src/data_sync.py` +- Registry: `create_data_source()` in `src/data_sync.py` auto-discovers connectors in `connectors/` +- Keboola: `connectors/keboola/adapter.py` -> `KeboolaDataSource` implementing `DataSource` +- Core Keboola logic: `connectors/keboola/client.py` (Keboola Storage API wrapper) ### Server Patterns - Atomic JSON writes: `tempfile.mkstemp()` + `os.fchmod(fd, 0o660)` + `os.replace()` diff --git a/README.md b/README.md index e236378..bbe8b7b 100644 --- a/README.md +++ b/README.md @@ -82,14 +82,17 @@ ai-data-analyst/ │ └── data_description.md.example # Data schema template │ ├── src/ # Server-side Python code -│ ├── adapters/ # Data source adapters -│ │ ├── base.py # Adapter interface (ABC) -│ │ └── keboola_adapter.py # Keboola Storage adapter -│ ├── data_sync.py # Orchestrates data pull from sources +│ ├── data_sync.py # Orchestrates data pull + DataSource ABC │ ├── parquet_manager.py # CSV to Parquet conversion │ ├── config.py # Configuration loader │ └── profiler.py # Data profiling for catalog │ +├── connectors/ # Data source connectors +│ ├── keboola/ # Keboola Storage connector +│ │ ├── adapter.py # KeboolaDataSource (implements DataSource) +│ │ └── client.py # Low-level Keboola API client +│ └── jira/ # Jira webhook connector +│ ├── webapp/ # Flask web application │ └── ... 
# User onboarding, settings, catalog │ @@ -124,7 +127,7 @@ ai-data-analyst/ | BigQuery | Planned | Google BigQuery adapter | | Snowflake | Planned | Snowflake adapter | -Adding a new adapter means implementing the `DataSource` interface in `src/adapters/` and setting `data_source.type` in `config/instance.yaml`. See `src/adapters/base.py` for the contract. +Adding a new data source means creating a connector module in `connectors/` that implements the `DataSource` interface from `src/data_sync.py`, and setting `data_source.type` in `config/instance.yaml`. See `connectors/keboola/` for a reference implementation. ## Using with Claude Code diff --git a/connectors/jira/README.md b/connectors/jira/README.md index 6e604d8..d662f48 100644 --- a/connectors/jira/README.md +++ b/connectors/jira/README.md @@ -73,7 +73,7 @@ Real-time sync of Jira support tickets for AI-powered analysis. ┌─────────────────────────────────────────────────────────────────────────────┐ │ ANALYST MACHINE │ │ │ -│ ~/keboola-analysis/ │ +│ ~/data-analysis/ │ │ └── server/ │ │ └── parquet/ │ │ └── jira/ # Synced Parquet + attachments │ @@ -540,7 +540,7 @@ Jira data is an **optional dataset** - not synced by default to save bandwidth. 
**Enable Jira sync:** ```bash # Edit local config (created on first sync_data.sh run) -nano ~/.config/keboola-analyst/sync.yaml +nano ~/.config/data-analyst/sync.yaml # Change: datasets: @@ -585,7 +585,7 @@ This is fast (only downloads files for one ticket) and keeps your local machine If you need frequent access to attachments, enable full sync: ```yaml -# ~/.config/keboola-analyst/sync.yaml +# ~/.config/data-analyst/sync.yaml datasets: jira: true jira_attachments: true # Syncs ~500MB+ of files diff --git a/dev_docs/desktop-app.md b/dev_docs/desktop-app.md index 6802b67..2f5f84a 100644 --- a/dev_docs/desktop-app.md +++ b/dev_docs/desktop-app.md @@ -32,18 +32,18 @@ The WebSocket gateway (`server/ws_gateway/`) runs as a separate systemd service ## Building ```bash -cd macos-app/KeboolaAnalyst -xcodebuild -scheme KeboolaAnalyst -configuration Debug build +cd macos-app/DataAnalyst +xcodebuild -scheme DataAnalyst -configuration Debug build ``` The built app is at: ``` -~/Library/Developer/Xcode/DerivedData/KeboolaAnalyst-*/Build/Products/Debug/KeboolaAnalyst.app +~/Library/Developer/Xcode/DerivedData/DataAnalyst-*/Build/Products/Debug/DataAnalyst.app ``` To run: ```bash -open ~/Library/Developer/Xcode/DerivedData/KeboolaAnalyst-*/Build/Products/Debug/KeboolaAnalyst.app +open ~/Library/Developer/Xcode/DerivedData/DataAnalyst-*/Build/Products/Debug/DataAnalyst.app ``` ## Authentication Flow @@ -52,7 +52,7 @@ open ~/Library/Developer/Xcode/DerivedData/KeboolaAnalyst-*/Build/Products/Debug 2. Browser opens `https://your-instance.example.com/desktop/link` 3. User authenticates via Google SSO (if not already logged in) 4. User clicks **Authorize Desktop App** -5. Webapp generates a JWT token (HS256, 30-day expiry) and redirects to `keboola-analyst://auth?token=eyJ...` +5. Webapp generates a JWT token (HS256, 30-day expiry) and redirects to `data-analyst://auth?token=eyJ...` 6. macOS app catches the custom URL scheme, stores the JWT in Keychain 7. 
App connects to WebSocket gateway, sends `{"type":"auth","token":"..."}` 8. Gateway validates JWT and confirms with `{"type":"auth_ok","username":"..."}` @@ -86,7 +86,7 @@ Client -> Server: {"type":"pong"} - **Persistence**: notifications stored in UserDefaults between launches - **Keychain**: JWT token stored securely in macOS Keychain - **Run scripts**: execute notification scripts on-demand via webapp API, results arrive as WS notifications -- **Logging**: `os.log` with subsystem `com.keboola.analyst`, category `WebSocket` -- view with `log show --predicate 'subsystem == "com.keboola.analyst"' --last 5m --info` +- **Logging**: `os.log` with subsystem `com.dataanalyst`, category `WebSocket` -- view with `log show --predicate 'subsystem == "com.dataanalyst"' --last 5m --info` ## Server Components @@ -161,12 +161,12 @@ location /ws/notifications { ## Project Structure ``` -macos-app/KeboolaAnalyst/ - KeboolaAnalyst.xcodeproj/ - KeboolaAnalyst/ +macos-app/DataAnalyst/ + DataAnalyst.xcodeproj/ + DataAnalyst/ App/ - KeboolaAnalystApp.swift # @main, MenuBarExtra - AppDelegate.swift # URL scheme handler (keboola-analyst://) + DataAnalystApp.swift # @main, MenuBarExtra + AppDelegate.swift # URL scheme handler (data-analyst://) Core/ Config.swift # URLs, timeouts, keychain names KeychainService.swift # JWT storage in Keychain @@ -182,7 +182,7 @@ macos-app/KeboolaAnalyst/ NotificationDetail.swift # Full view with chart image SettingsView.swift # Connection status, sign out Info.plist # URL scheme registration - KeboolaAnalyst.entitlements # Network client permission + DataAnalyst.entitlements # Network client permission ``` ## Troubleshooting @@ -208,7 +208,7 @@ sudo -u deploy curl -s --unix-socket /run/ws-gateway/ws.sock http://localhost/he ``` If connections is 0, restart the app. 
Check app logs: ```bash -/usr/bin/log show --predicate 'subsystem == "com.keboola.analyst"' --last 5m --info +/usr/bin/log show --predicate 'subsystem == "com.dataanalyst"' --last 5m --info ``` ### Script runs but no notification appears diff --git a/docs/DATA_SOURCES.md b/docs/DATA_SOURCES.md index e0171c2..0e663c6 100644 --- a/docs/DATA_SOURCES.md +++ b/docs/DATA_SOURCES.md @@ -51,12 +51,12 @@ tables: sync_strategy: "full_refresh" ``` -## Writing a Custom Adapter +## Writing a Custom Connector -Create a new file in `src/adapters/`: +Create a new connector module in `connectors/{name}/adapter.py`: ```python -from ..data_sync import DataSource +from src.data_sync import DataSource class MyDataSource(DataSource): def sync_table(self, table_config, sync_state): @@ -65,9 +65,6 @@ class MyDataSource(DataSource): pass ``` -Register in `src/adapters/__init__.py`: -```python -if adapter_type == "my_source": - from .my_adapter import MyDataSource - return MyDataSource(**kwargs) -``` +The `create_data_source()` function in `src/data_sync.py` auto-discovers connectors from the `connectors/` directory. Set `data_source.type` in `config/instance.yaml` to match the connector directory name (e.g., `keboola` for `connectors/keboola/`). + +See `connectors/keboola/` for a complete reference implementation. diff --git a/server/deploy.sh b/server/deploy.sh index 65b8843..2b5c424 100755 --- a/server/deploy.sh +++ b/server/deploy.sh @@ -267,7 +267,7 @@ if [[ -f "${REPO_DIR}/server/limits-users.conf" ]]; then fi # Create data sync .env file from environment variables (passed from GitHub Actions) -KEBOOLA_ENV_FILE="${REPO_DIR}/.env" +SYNC_ENV_FILE="${REPO_DIR}/.env" if [[ -n "${KEBOOLA_STORAGE_TOKEN:-}" ]]; then log "Creating data sync .env file..." 
{ @@ -310,12 +310,12 @@ if [[ -n "${KEBOOLA_STORAGE_TOKEN:-}" ]]; then if [[ -n "${ANTHROPIC_API_KEY:-}" ]]; then echo "ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY}" fi - } | sudo /usr/bin/tee "$KEBOOLA_ENV_FILE" > /dev/null - sudo /usr/bin/chown root:data-ops "$KEBOOLA_ENV_FILE" - sudo /usr/bin/chmod 640 "$KEBOOLA_ENV_FILE" + } | sudo /usr/bin/tee "$SYNC_ENV_FILE" > /dev/null + sudo /usr/bin/chown root:data-ops "$SYNC_ENV_FILE" + sudo /usr/bin/chmod 640 "$SYNC_ENV_FILE" log " Data sync .env created with secure permissions (640)" else - log " Skipping data sync .env creation (no KEBOOLA_STORAGE_TOKEN provided)" + log " Skipping data sync .env creation (no sync credentials provided)" fi # Set correct permissions @@ -325,8 +325,8 @@ sudo /usr/bin/chmod -R 770 "$APP_DIR" # owner+group rwx, others none sudo /usr/bin/chmod -R g+s "$APP_DIR" # setgid for new files # Restore .env permissions (may have been overwritten by chmod -R) -if [[ -f "$KEBOOLA_ENV_FILE" ]]; then - sudo /usr/bin/chmod 640 "$KEBOOLA_ENV_FILE" +if [[ -f "$SYNC_ENV_FILE" ]]; then + sudo /usr/bin/chmod 640 "$SYNC_ENV_FILE" fi # Update and restart webapp if running diff --git a/server/notify-bot.service b/server/notify-bot.service index 199b345..99df2be 100644 --- a/server/notify-bot.service +++ b/server/notify-bot.service @@ -1,5 +1,5 @@ [Unit] -Description=Keboola Data Analyst Telegram Notification Bot +Description=Data Analyst Telegram Notification Bot After=network-online.target Wants=network-online.target @@ -12,7 +12,7 @@ ExecStart=/opt/data-analyst/.venv/bin/python -m server.telegram_bot.bot Restart=always RestartSec=10 -# Environment (webapp .env + Keboola .env with bot token) +# Environment (webapp .env + sync .env with bot token) EnvironmentFile=/opt/data-analyst/.env EnvironmentFile=/opt/data-analyst/repo/.env diff --git a/server/ws-gateway.service b/server/ws-gateway.service index f8ead1f..077d96a 100644 --- a/server/ws-gateway.service +++ b/server/ws-gateway.service @@ -1,5 +1,5 @@ [Unit] 
-Description=WebSocket Gateway for Keboola Data Analyst +Description=WebSocket Gateway for Data Analyst After=network.target [Service]