From 5ee12d78e7e20d9b0ede2e358b88b14ab3af710d Mon Sep 17 00:00:00 2001 From: ZdenekSrotyr Date: Tue, 31 Mar 2026 19:17:44 +0200 Subject: [PATCH] =?UTF-8?q?refactor:=20final=20cleanup=20=E2=80=94=20delet?= =?UTF-8?q?e=20legacy=20auth,=20clean=20deps,=20fix=20hash,=20migrate=20to?= =?UTF-8?q?=20uv?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Delete root auth/ directory (legacy Flask providers, orphaned) - Clean requirements.txt: remove Flask, gunicorn, authlib, sendgrid, anthropic, openai, argon2-cffi (9 unused deps) - Fix hash computation in orchestrator: MD5 of parquet mtime+size (CLI sync now skips unchanged tables correctly) - Migrate pip → uv in CLAUDE.md, scripts/init.sh, pyproject.toml - Sync pyproject.toml dependencies with requirements.txt 578 tests passing. --- CLAUDE.md | 14 +- auth/__init__.py | 111 -------------- auth/desktop/__init__.py | 0 auth/desktop/provider.py | 60 -------- auth/email/__init__.py | 0 auth/email/provider.py | 314 -------------------------------------- auth/google/__init__.py | 0 auth/google/provider.py | 157 ------------------- auth/password/__init__.py | 0 auth/password/provider.py | 59 ------- pyproject.toml | 30 ++-- requirements.txt | 85 ++++------- scripts/init.sh | 13 +- src/orchestrator.py | 18 ++- 14 files changed, 77 insertions(+), 784 deletions(-) delete mode 100644 auth/__init__.py delete mode 100644 auth/desktop/__init__.py delete mode 100644 auth/desktop/provider.py delete mode 100644 auth/email/__init__.py delete mode 100644 auth/email/provider.py delete mode 100644 auth/google/__init__.py delete mode 100644 auth/google/provider.py delete mode 100644 auth/password/__init__.py delete mode 100644 auth/password/provider.py diff --git a/CLAUDE.md b/CLAUDE.md index b4cb9b3..0548307 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -47,7 +47,7 @@ docker compose --profile full up # Include telegram bot │ ├── bigquery/ # BigQuery: extractor.py (remote-only via DuckDB BQ extension) │ └── jira/ # Jira: webhook + incremental parquet → extract.duckdb ├── cli/ # CLI tool (`da sync`, `da query`, `da admin`) -├── auth/ # Authentication providers (google, email, password, desktop) +├── app/auth/ # Authentication (FastAPI-based providers) ├── services/ # Standalone services (scheduler, telegram_bot, ws_gateway, etc.) ├── server/ # Legacy deployment infrastructure ├── scripts/ # Utility + migration scripts @@ -105,7 +105,7 @@ Table definitions: DuckDB `table_registry` table in `system.duckdb`. ```bash # Setup python3 -m venv .venv && source .venv/bin/activate -pip install -r requirements.txt +uv pip install -r requirements.txt # Run FastAPI locally uvicorn app.main:app --reload @@ -128,12 +128,10 @@ Must create `_meta` table with columns: table_name, description, rows, size_byte Orchestrator ATTACHes it automatically. ### Authentication -Pluggable auth providers in `auth/`: -- **Google** (`google`): OAuth via Google -- **Email** (`email`): Email magic link (itsdangerous token) -- **Password** (`password`): Username/password -- **Desktop** (`desktop`): JWT for API -- New provider = `auth//provider.py` implementing `AuthProvider` +Auth providers in `app/auth/` (FastAPI-based): +- **Google**: OAuth via Google +- **Email**: Email magic link (itsdangerous token) +- **Desktop**: JWT for API ## Key Implementation Details diff --git a/auth/__init__.py b/auth/__init__.py deleted file mode 100644 index 4d8583d..0000000 --- a/auth/__init__.py +++ /dev/null @@ -1,111 +0,0 @@ -""" -Pluggable authentication provider system. - -Each auth provider lives in auth//provider.py and implements AuthProvider. -Providers are auto-discovered and registered with the Flask app. - -To add a new provider (e.g., Okta): -1. Create auth/okta/provider.py -2. Implement AuthProvider subclass -3. Export `provider` instance at module level -4. That's it - no changes to core code needed. -""" - -import importlib -import logging -from abc import ABC, abstractmethod -from pathlib import Path -from typing import Optional - -from flask import Blueprint - -logger = logging.getLogger(__name__) - - -class AuthProvider(ABC): - """Base class for authentication providers.""" - - @abstractmethod - def get_name(self) -> str: - """Internal name (e.g., 'google', 'password').""" - - @abstractmethod - def get_blueprint(self) -> Blueprint: - """Flask blueprint with auth routes.""" - - @abstractmethod - def get_login_button(self) -> dict: - """Login button definition for the login page. - - Returns dict with: - text: str - Button label (e.g., "Sign in with Google") - url: str - Route URL (e.g., "/login/google") - icon_html: str - SVG or HTML for the icon - subtitle: str - Optional help text below button - order: int - Sort order (lower = higher on page) - css_class: str - Optional CSS class for the button (e.g., "btn-google") - visible: bool - Whether to show on login page (default True) - """ - - def is_available(self) -> bool: - """Check if provider is configured and ready. - Override to check env vars, API keys, etc. - Returns False to skip this provider.""" - return True - - def get_display_name(self) -> str: - """Human-readable name for UI.""" - return self.get_name().title() - - def init_app(self, app) -> None: - """Optional: initialize provider with Flask app (e.g., for OAuth setup).""" - pass - - -def discover_providers() -> list[AuthProvider]: - """Auto-discover auth providers from auth/*/provider.py. - - Each provider module must export a `provider` instance of AuthProvider. - Providers are sorted by login button order. - Only available providers (is_available() == True) are returned. - Providers listed in Config.AUTH_DISABLED_PROVIDERS are skipped. - """ - from app.instance_config import get_value - - disabled_raw = get_value("auth", "disabled_providers", default=[]) - disabled = [name.lower() for name in (disabled_raw or [])] - providers = [] - auth_dir = Path(__file__).parent - - for subdir in sorted(auth_dir.iterdir()): - if not subdir.is_dir() or subdir.name.startswith("_"): - continue - provider_file = subdir / "provider.py" - if not provider_file.exists(): - continue - - try: - mod = importlib.import_module(f"auth.{subdir.name}.provider") - provider_instance = getattr(mod, "provider", None) - if provider_instance and isinstance(provider_instance, AuthProvider): - if provider_instance.get_name().lower() in disabled: - logger.info( - f"Auth provider disabled by config: {provider_instance.get_name()}" - ) - elif provider_instance.is_available(): - providers.append(provider_instance) - logger.info(f"Auth provider loaded: {provider_instance.get_name()}") - else: - logger.debug( - f"Auth provider skipped (not available): {subdir.name}" - ) - else: - logger.warning( - f"Auth provider {subdir.name} has no 'provider' instance" - ) - except Exception as e: - logger.warning(f"Failed to load auth provider {subdir.name}: {e}") - - # Sort by login button order - providers.sort(key=lambda p: p.get_login_button().get("order", 50)) - return providers diff --git a/auth/desktop/__init__.py b/auth/desktop/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/auth/desktop/provider.py b/auth/desktop/provider.py deleted file mode 100644 index e1eab36..0000000 --- a/auth/desktop/provider.py +++ /dev/null @@ -1,60 +0,0 @@ -""" -Desktop JWT authentication provider. - -Desktop JWT authentication (Flask blueprint). -This is NOT a login provider (no login button) - it provides -JWT-based API authentication for the native desktop application. -""" - -import logging - -from flask import Blueprint - -import os - -from auth import AuthProvider - - -class _Config: - DESKTOP_JWT_SECRET = os.environ.get("DESKTOP_JWT_SECRET", "") - - -Config = _Config - -logger = logging.getLogger(__name__) - - -class DesktopAuthProvider(AuthProvider): - """Desktop app JWT authentication provider.""" - - def get_name(self) -> str: - return "desktop" - - def get_display_name(self) -> str: - return "Desktop App" - - def get_blueprint(self) -> Blueprint: - # Legacy Flask blueprint — removed with webapp/ - return Blueprint("desktop_auth", __name__) - - def get_login_button(self) -> dict: - return { - "text": "", - "url": "", - "icon_html": "", - "subtitle": "", - "order": 100, - "css_class": "", - "visible": False, - } - - def is_available(self) -> bool: - return bool(Config.DESKTOP_JWT_SECRET) - - def init_app(self, app) -> None: - """No additional initialization needed.""" - pass - - -# Module-level provider instance for auto-discovery -provider = DesktopAuthProvider() diff --git a/auth/email/__init__.py b/auth/email/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/auth/email/provider.py b/auth/email/provider.py deleted file mode 100644 index 84393e2..0000000 --- a/auth/email/provider.py +++ /dev/null @@ -1,314 +0,0 @@ -""" -Email magic link authentication provider. - -Users enter their email, receive a magic link, click it and they're logged in. -No passwords needed. Domain restriction ensures only allowed users can access. - -Email delivery modes: -1. SMTP relay (recommended) - configure SMTP_HOST, SMTP_PORT, etc. in .env -2. Console mode (development) - link printed to server log, shown in browser -""" - -import logging -import smtplib -import time -from email.mime.multipart import MIMEMultipart -from email.mime.text import MIMEText - -from flask import ( - Blueprint, - current_app, - flash, - redirect, - render_template, - request, - session, - url_for, -) -from itsdangerous import BadSignature, SignatureExpired, URLSafeTimedSerializer - -import os - -from auth import AuthProvider -from app.instance_config import get_allowed_domains, get_value - -_ALLOWED_DOMAINS = get_allowed_domains() -_ALLOWED_EMAILS = [ - e.strip().lower() - for e in os.environ.get("ALLOWED_EMAILS", "").split(",") - if e.strip() -] - - -def validate_email_domain(email: str) -> bool: - if not email: - return False - email_lower = email.lower() - if email_lower in _ALLOWED_EMAILS: - return True - domain = email_lower.split("@")[-1] - return domain in _ALLOWED_DOMAINS - - -class _Config: - SECRET_KEY = os.environ.get("WEBAPP_SECRET_KEY", "dev-secret-key-change-me") - ALLOWED_DOMAINS = _ALLOWED_DOMAINS - SMTP_HOST = os.environ.get("SMTP_HOST", "") - SMTP_PORT = int(os.environ.get("SMTP_PORT", "587")) - SMTP_USER = os.environ.get("SMTP_USER", "") - SMTP_PASSWORD = os.environ.get("SMTP_PASSWORD", "") - SMTP_FROM = os.environ.get("SMTP_FROM", - os.environ.get("SMTP_USER", - get_value("email", "from_address", default="noreply@example.com"))) - SMTP_USE_TLS = os.environ.get("SMTP_USE_TLS", "true").lower() == "true" - INSTANCE_NAME = get_value("instance", "name", default="AI Data Analyst") - - -Config = _Config - -logger = logging.getLogger(__name__) - -email_bp = Blueprint("email_auth", __name__) - -# SVG envelope icon for the login button -_EMAIL_ICON_HTML = ( - '' - '' - '' - "" -) - - -def _get_serializer() -> URLSafeTimedSerializer: - """Create token serializer using the app secret key.""" - return URLSafeTimedSerializer(Config.SECRET_KEY, salt="email-magic-link") - - -def _generate_magic_token(email: str) -> str: - """Generate a signed, time-limited token containing the email.""" - s = _get_serializer() - return s.dumps({"email": email.lower(), "t": int(time.time())}) - - -def _verify_magic_token(token: str, max_age_seconds: int = 900) -> str | None: - """Verify magic link token. Returns email if valid, None otherwise. - - Args: - token: The signed token from the magic link URL. - max_age_seconds: Token validity period (default 15 minutes). - - Returns: - Email address if token is valid, None otherwise. - """ - s = _get_serializer() - try: - data = s.loads(token, max_age=max_age_seconds) - return data.get("email") - except SignatureExpired: - logger.warning("Magic link token expired") - return None - except BadSignature: - logger.warning("Invalid magic link token") - return None - - -def _send_magic_email(email: str, magic_url: str) -> bool: - """Send magic link email via SMTP relay. - - Returns True if sent successfully, False otherwise. - """ - smtp_host = Config.SMTP_HOST - if not smtp_host: - return False - - msg = MIMEMultipart("alternative") - msg["Subject"] = f"Sign in to {Config.INSTANCE_NAME}" - msg["From"] = Config.SMTP_FROM - msg["To"] = email - - text_body = ( - f"Sign in to {Config.INSTANCE_NAME}\n\n" - f"Click the link below to sign in:\n{magic_url}\n\n" - f"This link expires in 15 minutes.\n" - f"If you didn't request this, ignore this email." - ) - - html_body = f""" - - -

Sign in to {Config.INSTANCE_NAME}

-

Click the button below to sign in:

-

- - Sign In - -

-

- This link expires in 15 minutes.
- If you didn't request this, ignore this email. -

-
-

- Or copy and paste this URL into your browser:
- {magic_url} -

- -""" - - msg.attach(MIMEText(text_body, "plain")) - msg.attach(MIMEText(html_body, "html")) - - try: - smtp_port = Config.SMTP_PORT - use_tls = Config.SMTP_USE_TLS - - if smtp_port == 465: - server = smtplib.SMTP_SSL(smtp_host, smtp_port, timeout=10) - else: - server = smtplib.SMTP(smtp_host, smtp_port, timeout=10) - if use_tls: - server.starttls() - - smtp_user = Config.SMTP_USER - smtp_password = Config.SMTP_PASSWORD - if smtp_user and smtp_password: - server.login(smtp_user, smtp_password) - - server.sendmail(Config.SMTP_FROM, [email], msg.as_string()) - server.quit() - logger.info("Magic link email sent to %s via SMTP", email) - return True - - except Exception as e: - logger.error("Failed to send magic link email to %s: %s", email, e) - return False - - -# --- Routes --- - - -@email_bp.route("/login/email") -def login_email_form(): - """Show email input form.""" - return render_template( - "login_magic_link.html", - allowed_domains=Config.ALLOWED_DOMAINS, - ) - - -@email_bp.route("/login/email/send", methods=["POST"]) -def send_magic_link(): - """Validate email domain and send magic link.""" - email = request.form.get("email", "").strip().lower() - - if not email: - flash("Please enter your email address.", "error") - return redirect(url_for("email_auth.login_email_form")) - - if not validate_email_domain(email): - domains_str = ", ".join(f"@{d}" for d in Config.ALLOWED_DOMAINS) - flash( - f"Only {domains_str} email addresses are allowed.", - "error", - ) - return redirect(url_for("email_auth.login_email_form")) - - # Generate magic link - token = _generate_magic_token(email) - magic_url = url_for("email_auth.verify_magic_link", token=token, _external=True) - - # Try SMTP first, fall back to console mode - smtp_sent = _send_magic_email(email, magic_url) - - if smtp_sent: - flash("Check your email for the sign-in link.", "info") - return render_template( - "login_magic_link_sent.html", - email=email, - console_mode=False, - ) - else: - # Console/development mode - show link directly - logger.info("MAGIC LINK for %s: %s", email, magic_url) - return render_template( - "login_magic_link_sent.html", - email=email, - magic_url=magic_url, - console_mode=True, - ) - - -@email_bp.route("/login/email/verify/") -def verify_magic_link(token: str): - """Verify magic link token and log user in.""" - email = _verify_magic_token(token) - - if not email: - flash("This sign-in link has expired or is invalid. Please try again.", "error") - return redirect(url_for("email_auth.login_email_form")) - - # Double-check domain (in case config changed since token was issued) - if not validate_email_domain(email): - flash("Your email is no longer authorized.", "error") - return redirect(url_for("auth.login")) - - # Set session (shared contract across all auth providers) - name = email.split("@")[0].replace(".", " ").title() - session["user"] = { - "email": email, - "name": name, - "picture": "", - } - - logger.info("User logged in via magic link: %s", email) - return redirect(url_for("dashboard")) - - -# --- Provider class --- - - -class EmailAuthProvider(AuthProvider): - """Email magic link authentication provider.""" - - def get_name(self) -> str: - return "email" - - def get_display_name(self) -> str: - return "Email" - - def get_blueprint(self) -> Blueprint: - return email_bp - - def get_login_button(self) -> dict: - domains = Config.ALLOWED_DOMAINS - if len(domains) > 1: - domain_str = ", ".join(f"@{d}" for d in domains) - elif domains: - domain_str = f"@{domains[0]}" - else: - domain_str = "" - return { - "text": "Sign in with Email", - "url": "/login/email", - "icon_html": _EMAIL_ICON_HTML, - "subtitle": f'For {domain_str} email addresses.' if domain_str else "", - "order": 20, - "css_class": "btn-email", - "visible": True, - } - - def is_available(self) -> bool: - """Available when at least one allowed domain is configured.""" - return len(Config.ALLOWED_DOMAINS) > 0 - - def init_app(self, app) -> None: - """No additional initialization needed.""" - pass - - -# Module-level provider instance for auto-discovery -provider = EmailAuthProvider() diff --git a/auth/google/__init__.py b/auth/google/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/auth/google/provider.py b/auth/google/provider.py deleted file mode 100644 index 79e1d1b..0000000 --- a/auth/google/provider.py +++ /dev/null @@ -1,157 +0,0 @@ -""" -Google OAuth authentication provider. - -Handles Google Sign-In flow with domain validation. -Google OAuth flow with domain validation (Flask blueprint). -""" - -import logging - -from authlib.integrations.flask_client import OAuth -from flask import Blueprint, flash, redirect, session, url_for - -import os - -from auth import AuthProvider -from app.instance_config import get_allowed_domains - -_ALLOWED_DOMAINS = get_allowed_domains() -_ALLOWED_EMAILS = [ - e.strip().lower() - for e in os.environ.get("ALLOWED_EMAILS", "").split(",") - if e.strip() -] - - -def validate_email_domain(email: str) -> bool: - if not email: - return False - email_lower = email.lower() - if email_lower in _ALLOWED_EMAILS: - return True - domain = email_lower.split("@")[-1] - return domain in _ALLOWED_DOMAINS - - -class _Config: - ALLOWED_DOMAINS = _ALLOWED_DOMAINS - GOOGLE_CLIENT_ID = os.environ.get("GOOGLE_CLIENT_ID", "") - GOOGLE_CLIENT_SECRET = os.environ.get("GOOGLE_CLIENT_SECRET", "") - - -Config = _Config - -logger = logging.getLogger(__name__) - -google_bp = Blueprint("google_auth", __name__) -oauth = OAuth() - -# Google SVG icon for the login button -_GOOGLE_ICON_HTML = ( - '' - '" - '" - '' - '' - "" -) - - -@google_bp.route("/login/google") -def login_google(): - """Initiate Google OAuth flow.""" - redirect_uri = url_for("google_auth.authorize", _external=True) - return oauth.google.authorize_redirect(redirect_uri) - - -@google_bp.route("/authorize") -def authorize(): - """Handle OAuth callback from Google.""" - try: - token = oauth.google.authorize_access_token() - userinfo = token.get("userinfo") - - if not userinfo: - logger.warning("No userinfo in OAuth response") - flash("Failed to get user information from Google.", "error") - return redirect(url_for("auth.login")) - - email = userinfo.get("email", "") - name = userinfo.get("name", "") - - # Validate domain - if not validate_email_domain(email): - logger.warning(f"Login attempt from non-allowed domain: {email}") - domains_str = ", ".join(f"@{d}" for d in Config.ALLOWED_DOMAINS) - flash( - f"Only {domains_str} email addresses are allowed.", "error" - ) - return redirect(url_for("auth.login")) - - # Store user info in session (shared contract across all providers) - session["user"] = { - "email": email, - "name": name, - "picture": userinfo.get("picture", ""), - } - - logger.info(f"User logged in via Google: {email}") - return redirect(url_for("dashboard")) - - except Exception as e: - logger.exception(f"OAuth error: {e}") - flash("Authentication failed. Please try again.", "error") - return redirect(url_for("auth.login")) - - -class GoogleAuthProvider(AuthProvider): - """Google OAuth authentication provider.""" - - def get_name(self) -> str: - return "google" - - def get_display_name(self) -> str: - return "Google" - - def get_blueprint(self) -> Blueprint: - return google_bp - - def get_login_button(self) -> dict: - domains = Config.ALLOWED_DOMAINS - if len(domains) > 1: - domain_str = ", ".join(f"@{d}" for d in domains) - else: - domain_str = f"@{domains[0]}" if domains else "" - return { - "text": "Sign in with Google", - "url": "/login/google", - "icon_html": _GOOGLE_ICON_HTML, - "subtitle": f'For {domain_str} email addresses.' if domain_str else "", - "order": 10, - "css_class": "btn-google", - "visible": True, - } - - def is_available(self) -> bool: - return bool(Config.GOOGLE_CLIENT_ID) - - def init_app(self, app) -> None: - """Initialize OAuth with the Flask app.""" - oauth.init_app(app) - oauth.register( - name="google", - client_id=Config.GOOGLE_CLIENT_ID, - client_secret=Config.GOOGLE_CLIENT_SECRET, - server_metadata_url="https://accounts.google.com/.well-known/openid-configuration", - client_kwargs={ - "scope": "openid email profile", - }, - ) - - -# Module-level provider instance for auto-discovery -provider = GoogleAuthProvider() diff --git a/auth/password/__init__.py b/auth/password/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/auth/password/provider.py b/auth/password/provider.py deleted file mode 100644 index 81d99f4..0000000 --- a/auth/password/provider.py +++ /dev/null @@ -1,59 +0,0 @@ -""" -Email/password authentication provider. - -Email/password authentication (Flask blueprint). -Available only when SENDGRID_API_KEY is configured. -""" - -import logging - -from flask import Blueprint - -import os - -from auth import AuthProvider - - -class _Config: - SENDGRID_API_KEY = os.environ.get("SENDGRID_API_KEY", "") - - -Config = _Config - -logger = logging.getLogger(__name__) - - -class PasswordAuthProvider(AuthProvider): - """Email/password authentication provider for external users.""" - - def get_name(self) -> str: - return "password" - - def get_display_name(self) -> str: - return "Email" - - def get_blueprint(self) -> Blueprint: - # Legacy Flask blueprint — removed with webapp/ - return Blueprint("password_auth", __name__) - - def get_login_button(self) -> dict: - return { - "text": "Sign in with Email", - "url": "/login/email", - "icon_html": "", - "subtitle": "For external users (investors, partners).", - "order": 20, - "css_class": "btn-secondary", - "visible": True, - } - - def is_available(self) -> bool: - return bool(Config.SENDGRID_API_KEY) - - def init_app(self, app) -> None: - """No additional initialization needed.""" - pass - - -# Module-level provider instance for auto-discovery -provider = PasswordAuthProvider() diff --git a/pyproject.toml b/pyproject.toml index ff8bd3a..d353dfa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,33 +6,43 @@ requires-python = ">=3.9" license = "MIT" dependencies = [ + # Core database "duckdb>=0.9.0", + # Web framework (FastAPI) "fastapi>=0.115.0", "uvicorn[standard]>=0.32.0", "python-multipart>=0.0.9", "jinja2>=3.1.0", + # Authentication "PyJWT>=2.8.0", + "itsdangerous>=2.1.0", + # HTTP client "httpx>=0.27.0", + # CLI "typer>=0.12.0", "rich>=13.0.0", + # Configuration "python-dotenv>=1.0.0", "pyyaml>=6.0", + # Data processing + "pandas>=2.0.0", + "pyarrow>=12.0.0", + "pytz>=2024.1", + # Data source connectors + "kbcstorage>=0.9.0", + "google-cloud-bigquery>=3.0.0", + "google-cloud-bigquery-storage>=2.0.0", + # Profiler visualizations + "matplotlib>=3.8.0", + "numpy>=1.24.0", + # Sample data generation + "faker>=24.0.0", ] [project.scripts] da = "cli.main:app" [project.optional-dependencies] -connectors = [ - "kbcstorage>=0.9.0", - "google-cloud-bigquery>=3.0.0", - "google-cloud-bigquery-storage>=2.0.0", - "pandas>=2.0.0", - "pyarrow>=12.0.0", -] -telegram = [ - "aiohttp>=3.9.0", -] dev = [ "pytest>=7.0.0", "pytest-mock>=3.0.0", diff --git a/requirements.txt b/requirements.txt index 945407d..0111ebe 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,69 +1,40 @@ -# Data source adapters (install only what you need) -kbcstorage>=0.9.0 # For Keboola adapter -google-cloud-bigquery>=3.0.0 # For BigQuery adapter -google-cloud-bigquery-storage>=2.0.0 # For BigQuery adapter (fast Arrow transfer) - -# Data processing -# pandas - core tabular data processing library -# pyarrow - Parquet format support and fast operations -# pytz - timezone support required by DuckDB for reading timezone-aware Parquet columns -pandas>=2.0.0 -pyarrow>=12.0.0 -pytz>=2024.1 - -# Analytical database -# DuckDB - in-process SQL OLAP database for analytical queries +# Core database duckdb>=0.9.0 -# Configuration -# python-dotenv - loading environment variables from .env files -# pyyaml - parsing YAML configuration from data_description.md -python-dotenv>=1.0.0 -pyyaml>=6.0 - -# Progress tracking and logging -# tqdm - progress bars for long-running operations (download, sync) -tqdm>=4.65.0 - -# Web application (Google SSO portal) -# flask - web framework for self-service portal (legacy, being replaced by FastAPI) -# authlib - OAuth 2.0 / OpenID Connect library for Google SSO -# gunicorn - WSGI server for production deployment -flask>=3.0.0 -authlib>=1.3.0 -gunicorn>=21.0.0 - -# FastAPI - new unified web framework (API + web UI) +# Web framework (FastAPI) fastapi>=0.115.0 uvicorn[standard]>=0.32.0 python-multipart>=0.0.9 jinja2>=3.1.0 -# Telegram notification bot -# httpx - async HTTP client for Telegram API and unix socket communication -# aiohttp - async HTTP server for bot's internal send API +# Authentication +PyJWT>=2.8.0 +itsdangerous>=2.1.0 + +# HTTP client httpx>=0.27.0 -aiohttp>=3.9.0 + +# CLI +typer>=0.12.0 +rich>=13.0.0 + +# Configuration +python-dotenv>=1.0.0 +pyyaml>=6.0 + +# Data processing +pandas>=2.0.0 +pyarrow>=12.0.0 +pytz>=2024.1 + +# Data source connectors +kbcstorage>=0.9.0 +google-cloud-bigquery>=3.0.0 +google-cloud-bigquery-storage>=2.0.0 + +# Profiler visualizations matplotlib>=3.8.0 numpy>=1.24.0 -# Desktop app authentication -# PyJWT - JWT token creation and validation for desktop app auth -PyJWT>=2.8.0 - -# Password authentication for external users -# argon2-cffi - modern password hashing algorithm (Argon2id) -# sendgrid - email service for setup/reset links -argon2-cffi>=23.1.0 -sendgrid>=6.11.0 - -# Corporate Memory knowledge extraction -# anthropic - Claude API client for HAIKU-based knowledge extraction -anthropic>=0.39.0 - -# OpenAI-compatible API client for LLM proxy routing (LiteLLM, OpenRouter, etc.) -openai>=1.0.0 - -# Sample data generation (development/testing) -# faker - realistic synthetic data for demo datasets +# Sample data generation faker>=24.0.0 diff --git a/scripts/init.sh b/scripts/init.sh index 89ce59c..2fec349 100755 --- a/scripts/init.sh +++ b/scripts/init.sh @@ -39,16 +39,19 @@ echo "3️⃣ Activating virtual environment..." source .venv/bin/activate echo " ✅ Virtual environment activated" -# Upgrade pip +# Install uv if not available echo "" -echo "4️⃣ Upgrading pip..." -pip install --upgrade pip --quiet -echo " ✅ pip upgraded" +echo "4️⃣ Checking uv..." +if ! command -v uv &> /dev/null; then + echo " Installing uv..." + curl -LsSf https://astral.sh/uv/install.sh | sh +fi +echo " ✅ uv available" # Install dependencies echo "" echo "5️⃣ Installing dependencies from requirements.txt..." -pip install -r requirements.txt --quiet +uv pip install -r requirements.txt --quiet echo " ✅ Dependencies installed" # Create folders diff --git a/src/orchestrator.py b/src/orchestrator.py index 62d903e..6ef5118 100644 --- a/src/orchestrator.py +++ b/src/orchestrator.py @@ -1,5 +1,6 @@ """Sync orchestrator — ATTACHes extract.duckdb files into master analytics.duckdb.""" +import hashlib import logging import os import threading @@ -144,28 +145,39 @@ class SyncOrchestrator: tables.append(table_name) # Update sync_state in system DB - self._update_sync_state(meta_rows) + self._update_sync_state(meta_rows, source_name) except Exception as e: logger.error("Failed to attach %s: %s", source_name, e) return tables - def _update_sync_state(self, meta_rows: list) -> None: + def _update_sync_state(self, meta_rows: list, source_name: str) -> None: """Update sync_state table in system.duckdb from _meta entries.""" try: from src.db import get_system_db from src.repositories.sync_state import SyncStateRepository + extracts_dir = _get_extracts_dir() sys_conn = get_system_db() try: repo = SyncStateRepository(sys_conn) for table_name, rows, size_bytes, query_mode in meta_rows: + # Compute hash from parquet file stats (fast, no file read) + pq_path = extracts_dir / source_name / "data" / f"{table_name}.parquet" + if pq_path.exists(): + stat = pq_path.stat() + file_hash = hashlib.md5( + f"{stat.st_mtime_ns}:{stat.st_size}".encode() + ).hexdigest()[:12] + else: + file_hash = "" + repo.update_sync( table_id=table_name, rows=rows or 0, file_size_bytes=size_bytes or 0, - hash="", # TODO: compute from parquet file + hash=file_hash, ) finally: sys_conn.close()