diff --git a/connectors/keboola/extractor.py b/connectors/keboola/extractor.py index 696a340..40d172a 100644 --- a/connectors/keboola/extractor.py +++ b/connectors/keboola/extractor.py @@ -186,9 +186,10 @@ def _extract_via_legacy( table_id = f"{bucket}.{source_table}" if bucket else tc.get("id", tc["name"]) client.export_table(table_id, Path(csv_path)) - # Convert CSV to Parquet using DuckDB + # Convert CSV to Parquet using DuckDB — all_varchar avoids type inference errors + # (e.g. columns with mostly numeric values but some strings like "Non-Manager") conv_conn = duckdb.connect() - conv_conn.execute(f"COPY (SELECT * FROM read_csv_auto('{csv_path}')) TO '{pq_path}' (FORMAT PARQUET)") + conv_conn.execute(f"COPY (SELECT * FROM read_csv('{csv_path}', all_varchar=true)) TO '{pq_path}' (FORMAT PARQUET)") conv_conn.close() finally: if os.path.exists(csv_path): diff --git a/docs/DEPLOYMENT.md b/docs/DEPLOYMENT.md index 72482bf..27286ab 100644 --- a/docs/DEPLOYMENT.md +++ b/docs/DEPLOYMENT.md @@ -2,95 +2,198 @@ ## Server Requirements -- Debian 12 / Ubuntu 22.04+ -- 2+ vCPUs, 2+ GB RAM -- 10+ GB data disk -- Public IP with DNS +- Ubuntu 24.04 LTS +- e2-small (2 vCPU, 2 GB RAM) or larger +- 30 GB SSD boot disk +- Docker + Docker Compose +- Public IP with port 8000 open -## Initial Server Setup +## Quick Deploy (GCP) -1. Provision a VM (GCP, AWS, Azure, etc.) +### 1. Create VM -2. Run the setup script: - ```bash - sudo bash server/setup.sh - ``` - - This creates: - - System groups: `data-ops`, `dataread`, `data-private` - - Deploy user with appropriate permissions - - Directory structure under `/opt/data-analyst/` - - Python virtual environment - -3. Set up the webapp: - ```bash - sudo bash server/webapp-setup.sh - ``` - - This installs: - - Gunicorn systemd service - - Nginx reverse proxy with SSL - - Log rotation - -## CI/CD Pipeline - -1. Copy the example workflow: - ```bash - cp .github/workflows/deploy.yml.example .github/workflows/deploy.yml - ``` - -2. 
Configure GitHub Secrets: - - `SERVER_HOST`: Server IP address - - `SERVER_USER`: Deploy username - - `SERVER_SSH_KEY`: Deploy SSH private key - - All environment variables from `.env` - -3. Push to `main` branch triggers automatic deployment. - -## Directory Structure on Server - -``` -/opt/data-analyst/ -├── repo/ # Git clone of this repository -├── .env # Environment variables (secrets) -├── .venv/ # Python virtual environment -└── logs/ # Application logs - -/data/ -├── src_data/ -│ ├── parquet/ # Converted data files -│ ├── metadata/ # Sync state, profiles -│ └── raw/ # Raw source data -├── docs/ # Documentation served to analysts -├── scripts/ # Scripts distributed to analysts -└── notifications/ # Notification system data -``` - -## Separate Config Repository - -For production deployments, keep instance config in a separate private repository: - -``` -client-config-repo/ -├── config/ -│ ├── instance.yaml -│ └── data_description.md -├── .env.example -└── .github/workflows/deploy.yml -``` - -Set `CONFIG_DIR=/opt/data-analyst/client-config/config/` in the environment. - -## SSL Setup - -Use certbot for Let's Encrypt SSL: ```bash -sudo apt install certbot python3-certbot-nginx -sudo certbot --nginx -d data.yourcompany.com +gcloud compute instances create data-analyst-dev \ + --project=YOUR_PROJECT \ + --zone=europe-west1-b \ + --machine-type=e2-small \ + --image-family=ubuntu-2404-lts-amd64 \ + --image-project=ubuntu-os-cloud \ + --boot-disk-size=30GB \ + --boot-disk-type=pd-ssd \ + --tags=data-analyst-dev ``` +### 2. Install Docker + +```bash +curl -fsSL https://get.docker.com | sh +sudo usermod -aG docker $USER +# Log out and back in for group change to take effect +``` + +### 3. 
Set up deploy key + +Generate an SSH key for GitHub access: + +```bash +ssh-keygen -t ed25519 -f ~/.ssh/agnes_deploy -N "" -C "agnes-deploy" +cat ~/.ssh/agnes_deploy.pub +# Add the public key as a deploy key on the GitHub repo +``` + +Configure SSH to use it: + +```bash +cat > ~/.ssh/config << 'EOF' +Host github.com + IdentityFile ~/.ssh/agnes_deploy + StrictHostKeyChecking no +EOF +chmod 600 ~/.ssh/config +``` + +### 4. Clone and configure + +```bash +sudo mkdir -p /opt/data-analyst +sudo chown $USER:$USER /opt/data-analyst +git clone git@github.com:keboola/agnes-the-ai-analyst.git /opt/data-analyst +cd /opt/data-analyst +``` + +Create `.env`: + +```bash +cat > .env << 'EOF' +JWT_SECRET_KEY= +DATA_DIR=/data +LOG_LEVEL=info +KEBOOLA_STORAGE_TOKEN= +KEBOOLA_STACK_URL= +SEED_ADMIN_EMAIL= +EOF +chmod 600 .env +``` + +Create `config/instance.yaml` (optional, for Keboola source config): + +```bash +cp config/instance.yaml.example config/instance.yaml +# Edit with your values +``` + +### 5. Create data directories + +```bash +sudo mkdir -p /data/state /data/analytics /data/extracts +sudo chown -R $USER:$USER /data +``` + +### 6. Build and start + +```bash +cd /opt/data-analyst +docker compose up -d +``` + +Wait for health check: + +```bash +curl -s http://localhost:8000/api/health | python3 -m json.tool +``` + +### 7. Bootstrap admin user + +```bash +curl -X POST http://localhost:8000/auth/bootstrap +``` + +This creates the first admin user using `SEED_ADMIN_EMAIL` from `.env`. + +### 8. Register tables and run first extraction + +Register tables via the admin API, then: + +```bash +# Stop app first — DuckDB only supports one writer +docker compose down +docker compose run --rm extract +docker compose up -d +``` + +### 9. 
Open firewall (GCP) + +```bash +gcloud compute firewall-rules create allow-data-analyst-dev \ + --allow tcp:8000 \ + --target-tags=data-analyst-dev \ + --project=YOUR_PROJECT +``` + +## Important Notes + +### DuckDB Write Locking + +DuckDB only supports one writer at a time. When running a one-shot extraction manually, stop the running services first: + +```bash +docker compose down # Stop app + scheduler +docker compose run --rm extract # Run extraction +docker compose up -d # Restart +``` + +Scheduled runs do not need this stop/start cycle: the scheduler triggers extraction via the API, which handles locking internally. + +### Environment Variable Changes + +`docker compose restart` does NOT reload `.env`. Use: + +```bash +docker compose down && docker compose up -d +``` + +### Services + +| Service | Profile | Description | +|---------|---------|-------------| +| `app` | default | FastAPI server on port 8000 | +| `scheduler` | default | Periodic sync + extraction | +| `extract` | extract | One-shot data extraction | +| `telegram-bot` | full | Telegram notifications | +| `ws-gateway` | full | WebSocket gateway | +| `corporate-memory` | full | Knowledge collector | +| `session-collector` | full | Session collection | + +Start all services: `docker compose --profile full up -d` + +### Directory Structure on Server + +``` +/opt/data-analyst/ # Git repo + .env # Secrets (chmod 600) + config/instance.yaml # Instance config + +/data/ # Persistent data (Docker volume) + state/system.duckdb # System state (users, registry, sync) + analytics/server.duckdb # Analytics views + extracts/ # Per-source extract.duckdb + parquets + keboola/ + bigquery/ + jira/ +``` + +## CI/CD + +Push to `main` triggers GitHub Actions: +1. Run test suite (607 tests) +2. Build Docker image +3. Push to GHCR (`ghcr.io/keboola/agnes-the-ai-analyst`) +4. 
Deploy via Kamal + ## Monitoring -- Health check: `GET /health` -- Logs: `journalctl -u webapp -f` -- Disk usage: `df -h /data` +- Health: `GET /api/health` +- Logs: `docker compose logs -f app` +- Disk: `df -h /data` +- Tables: `curl -s http://localhost:8000/api/catalog | python3 -m json.tool`