refactor: final cleanup — delete legacy auth, clean deps, fix hash, migrate to uv

- Delete root auth/ directory (legacy Flask providers, orphaned)
- Clean requirements.txt: remove Flask, gunicorn, authlib, sendgrid,
  anthropic, openai, argon2-cffi, tqdm, aiohttp (9 unused deps)
- Fix hash computation in orchestrator: MD5 of parquet mtime+size
  (CLI sync now skips unchanged tables correctly)
- Migrate pip → uv in CLAUDE.md, scripts/init.sh, pyproject.toml
- Sync pyproject.toml dependencies with requirements.txt

578 tests passing.
This commit is contained in:
ZdenekSrotyr 2026-03-31 19:17:44 +02:00
parent 2b7348a773
commit 5ee12d78e7
14 changed files with 77 additions and 784 deletions

View file

@ -47,7 +47,7 @@ docker compose --profile full up # Include telegram bot
│ ├── bigquery/ # BigQuery: extractor.py (remote-only via DuckDB BQ extension)
│ └── jira/ # Jira: webhook + incremental parquet → extract.duckdb
├── cli/ # CLI tool (`da sync`, `da query`, `da admin`)
├── auth/ # Authentication providers (google, email, password, desktop)
├── app/auth/ # Authentication (FastAPI-based providers)
├── services/ # Standalone services (scheduler, telegram_bot, ws_gateway, etc.)
├── server/ # Legacy deployment infrastructure
├── scripts/ # Utility + migration scripts
@ -105,7 +105,7 @@ Table definitions: DuckDB `table_registry` table in `system.duckdb`.
```bash
# Setup
python3 -m venv .venv && source .venv/bin/activate
pip install -r requirements.txt
uv pip install -r requirements.txt
# Run FastAPI locally
uvicorn app.main:app --reload
@ -128,12 +128,10 @@ Must create `_meta` table with columns: table_name, description, rows, size_byte
Orchestrator ATTACHes it automatically.
### Authentication
Pluggable auth providers in `auth/`:
- **Google** (`google`): OAuth via Google
- **Email** (`email`): Email magic link (itsdangerous token)
- **Password** (`password`): Username/password
- **Desktop** (`desktop`): JWT for API
- New provider = `auth/<name>/provider.py` implementing `AuthProvider`
Auth providers in `app/auth/` (FastAPI-based):
- **Google**: OAuth via Google
- **Email**: Email magic link (itsdangerous token)
- **Desktop**: JWT for API
## Key Implementation Details

View file

@ -1,111 +0,0 @@
"""
Pluggable authentication provider system.
Each auth provider lives in auth/<name>/provider.py and implements AuthProvider.
Providers are auto-discovered and registered with the Flask app.
To add a new provider (e.g., Okta):
1. Create auth/okta/provider.py
2. Implement AuthProvider subclass
3. Export `provider` instance at module level
4. That's it - no changes to core code needed.
"""
import importlib
import logging
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Optional
from flask import Blueprint
logger = logging.getLogger(__name__)
class AuthProvider(ABC):
    """Abstract contract for a pluggable authentication provider.

    Concrete providers must supply a name, a Flask blueprint and a
    login-button description. The remaining hooks ship with defaults.
    """

    @abstractmethod
    def get_name(self) -> str:
        """Return the internal identifier (e.g., 'google', 'password')."""

    @abstractmethod
    def get_blueprint(self) -> Blueprint:
        """Return the Flask blueprint that carries the auth routes."""

    @abstractmethod
    def get_login_button(self) -> dict:
        """Describe the button rendered for this provider on the login page.

        The returned dict is expected to contain:
            text: str - button label (e.g., "Sign in with Google")
            url: str - route URL (e.g., "/login/google")
            icon_html: str - SVG or HTML used as the icon
            subtitle: str - optional help text shown below the button
            order: int - sort order (lower = higher on the page)
            css_class: str - optional CSS class (e.g., "btn-google")
            visible: bool - whether to show the button (default True)
        """

    def is_available(self) -> bool:
        """Report whether the provider is configured and usable.

        Subclasses override this to check env vars, API keys, etc.
        Returning False makes discovery skip the provider.
        """
        return True

    def get_display_name(self) -> str:
        """Return a human-readable name for the UI (title-cased internal name)."""
        internal_name = self.get_name()
        return internal_name.title()

    def init_app(self, app) -> None:
        """Optional hook to initialise the provider with the Flask app (e.g., OAuth setup)."""
        return None
def discover_providers() -> list[AuthProvider]:
    """Import every auth/*/provider.py module and collect its provider.

    A provider module must expose a module-level ``provider`` object that is
    an ``AuthProvider`` instance. A provider is dropped when its name appears
    in the instance-config value ``auth.disabled_providers`` or when
    ``is_available()`` returns False. Import or inspection failures are logged
    as warnings and do not abort discovery.

    Returns:
        The usable providers, ordered by the ``order`` key of their login
        button (50 when the key is absent).
    """
    from app.instance_config import get_value

    configured = get_value("auth", "disabled_providers", default=[])
    disabled = [entry.lower() for entry in (configured or [])]

    loaded = []
    package_dir = Path(__file__).parent
    for subdir in sorted(package_dir.iterdir()):
        # Only plain sub-packages count; "_"-prefixed entries are skipped.
        if not subdir.is_dir() or subdir.name.startswith("_"):
            continue
        if not (subdir / "provider.py").exists():
            continue

        try:
            module = importlib.import_module(f"auth.{subdir.name}.provider")
            provider_instance = getattr(module, "provider", None)

            if not (provider_instance and isinstance(provider_instance, AuthProvider)):
                logger.warning(
                    f"Auth provider {subdir.name} has no 'provider' instance"
                )
                continue

            if provider_instance.get_name().lower() in disabled:
                logger.info(
                    f"Auth provider disabled by config: {provider_instance.get_name()}"
                )
            elif provider_instance.is_available():
                loaded.append(provider_instance)
                logger.info(f"Auth provider loaded: {provider_instance.get_name()}")
            else:
                logger.debug(
                    f"Auth provider skipped (not available): {subdir.name}"
                )
        except Exception as e:
            logger.warning(f"Failed to load auth provider {subdir.name}: {e}")

    # Order by the position each provider requests for its login button.
    return sorted(loaded, key=lambda p: p.get_login_button().get("order", 50))

View file

@ -1,60 +0,0 @@
"""
Desktop JWT authentication provider.
Desktop JWT authentication (Flask blueprint).
This is NOT a login provider (no login button) - it provides
JWT-based API authentication for the native desktop application.
"""
import logging
from flask import Blueprint
import os
from auth import AuthProvider
class _Config:
    """Settings for the desktop provider, read from the environment at import time."""

    # Value of the DESKTOP_JWT_SECRET environment variable ("" when unset).
    DESKTOP_JWT_SECRET = os.getenv("DESKTOP_JWT_SECRET", "")


# Alias under which the rest of the module reads the settings.
Config = _Config
logger = logging.getLogger(__name__)
class DesktopAuthProvider(AuthProvider):
    """Provider entry for desktop-app JWT authentication.

    It contributes no visible login button and an empty blueprint.
    """

    def get_name(self) -> str:
        """Return the internal identifier of this provider."""
        return "desktop"

    def get_display_name(self) -> str:
        """Return the UI label of this provider."""
        return "Desktop App"

    def get_blueprint(self) -> Blueprint:
        """Return an empty placeholder blueprint."""
        # Legacy Flask blueprint — removed with webapp/
        return Blueprint("desktop_auth", __name__)

    def get_login_button(self) -> dict:
        """Return a hidden, empty button definition."""
        return dict(
            text="",
            url="",
            icon_html="",
            subtitle="",
            order=100,
            css_class="",
            visible=False,
        )

    def is_available(self) -> bool:
        """Report availability: True only when DESKTOP_JWT_SECRET is non-empty."""
        return True if Config.DESKTOP_JWT_SECRET else False

    def init_app(self, app) -> None:
        """No additional initialization needed."""
        return None
# Module-level provider instance for auto-discovery
provider = DesktopAuthProvider()

View file

View file

@ -1,314 +0,0 @@
"""
Email magic link authentication provider.
Users enter their email, receive a magic link, click it and they're logged in.
No passwords needed. Domain restriction ensures only allowed users can access.
Email delivery modes:
1. SMTP relay (recommended) - configure SMTP_HOST, SMTP_PORT, etc. in .env
2. Console mode (development) - link printed to server log, shown in browser
"""
import logging
import smtplib
import time
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from flask import (
Blueprint,
current_app,
flash,
redirect,
render_template,
request,
session,
url_for,
)
from itsdangerous import BadSignature, SignatureExpired, URLSafeTimedSerializer
import os
from auth import AuthProvider
from app.instance_config import get_allowed_domains, get_value
_ALLOWED_DOMAINS = get_allowed_domains()
_ALLOWED_EMAILS = [
e.strip().lower()
for e in os.environ.get("ALLOWED_EMAILS", "").split(",")
if e.strip()
]
def validate_email_domain(email: str) -> bool:
    """Decide whether *email* may sign in.

    An address is accepted when it is listed verbatim (case-insensitively) in
    ``_ALLOWED_EMAILS``, or when it has the shape ``local@domain`` and the
    domain is in ``_ALLOWED_DOMAINS``.

    Args:
        email: Address to check; empty or None-like values are rejected.

    Returns:
        True if the address is allowed, False otherwise.
    """
    if not email:
        return False
    email_lower = email.lower()
    if email_lower in _ALLOWED_EMAILS:
        return True
    # Require a real "local@domain" shape. A bare "example.com" or
    # "@example.com" must not be treated as an address in that domain.
    local_part, at_sign, domain = email_lower.rpartition("@")
    if not (local_part and at_sign and domain):
        return False
    return domain in _ALLOWED_DOMAINS
class _Config:
    """Settings for the email magic-link provider, resolved once at import time."""

    # Key used to sign magic-link tokens.
    # NOTE(review): the fallback is a fixed, publicly visible string — confirm
    # WEBAPP_SECRET_KEY is always set outside development.
    SECRET_KEY = os.getenv("WEBAPP_SECRET_KEY", "dev-secret-key-change-me")
    ALLOWED_DOMAINS = _ALLOWED_DOMAINS

    # SMTP relay settings; an empty SMTP_HOST makes the sender return False.
    SMTP_HOST = os.getenv("SMTP_HOST", "")
    SMTP_PORT = int(os.getenv("SMTP_PORT", "587"))
    SMTP_USER = os.getenv("SMTP_USER", "")
    SMTP_PASSWORD = os.getenv("SMTP_PASSWORD", "")
    # Sender address: SMTP_FROM, else SMTP_USER, else instance config, else a placeholder.
    SMTP_FROM = os.getenv(
        "SMTP_FROM",
        os.getenv(
            "SMTP_USER",
            get_value("email", "from_address", default="noreply@example.com"),
        ),
    )
    SMTP_USE_TLS = os.getenv("SMTP_USE_TLS", "true").lower() == "true"

    # Name used in the email subject and body.
    INSTANCE_NAME = get_value("instance", "name", default="AI Data Analyst")


# Alias under which the rest of the module reads the settings.
Config = _Config
logger = logging.getLogger(__name__)
email_bp = Blueprint("email_auth", __name__)
# SVG envelope icon for the login button
_EMAIL_ICON_HTML = (
'<svg width="24" height="24" viewBox="0 0 24 24" fill="none" '
'stroke="currentColor" stroke-width="2" stroke-linecap="round" '
'stroke-linejoin="round">'
'<rect x="2" y="4" width="20" height="16" rx="2"/>'
'<path d="m22 7-8.97 5.7a1.94 1.94 0 0 1-2.06 0L2 7"/>'
"</svg>"
)
def _get_serializer() -> URLSafeTimedSerializer:
    """Build the timed serializer for magic-link tokens from the configured secret key."""
    signing_key = Config.SECRET_KEY
    return URLSafeTimedSerializer(signing_key, salt="email-magic-link")
def _generate_magic_token(email: str) -> str:
    """Return a signed token carrying the lower-cased email and the current Unix time."""
    serializer = _get_serializer()
    payload = {"email": email.lower(), "t": int(time.time())}
    return serializer.dumps(payload)
def _verify_magic_token(token: str, max_age_seconds: int = 900) -> "str | None":
    """Verify a magic-link token and extract the email it carries.

    The return annotation is a string on purpose: an unquoted ``str | None``
    is evaluated when the function is defined and raises ``TypeError`` on
    Python 3.9.

    Args:
        token: The signed token from the magic link URL.
        max_age_seconds: Token validity period (default 15 minutes).

    Returns:
        The email stored in the token, or None when the token is expired or
        its signature is invalid.
    """
    s = _get_serializer()
    try:
        data = s.loads(token, max_age=max_age_seconds)
        return data.get("email")
    except SignatureExpired:
        # Must be caught before BadSignature, which is its base class.
        logger.warning("Magic link token expired")
        return None
    except BadSignature:
        logger.warning("Invalid magic link token")
        return None
def _send_magic_email(email: str, magic_url: str) -> bool:
    """Send the magic-link email through the configured SMTP relay.

    Args:
        email: Recipient address.
        magic_url: Absolute sign-in URL embedded in the message.

    Returns:
        True if the message was handed to the SMTP server. False when no SMTP
        host is configured or when any step of the delivery fails (the failure
        is logged, never raised).
    """
    smtp_host = Config.SMTP_HOST
    if not smtp_host:
        return False

    msg = MIMEMultipart("alternative")
    msg["Subject"] = f"Sign in to {Config.INSTANCE_NAME}"
    msg["From"] = Config.SMTP_FROM
    msg["To"] = email

    text_body = (
        f"Sign in to {Config.INSTANCE_NAME}\n\n"
        f"Click the link below to sign in:\n{magic_url}\n\n"
        f"This link expires in 15 minutes.\n"
        f"If you didn't request this, ignore this email."
    )
    html_body = f"""<!DOCTYPE html>
<html>
<body style="font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif; max-width: 480px; margin: 0 auto; padding: 20px;">
<h2 style="color: #1a1a2e;">Sign in to {Config.INSTANCE_NAME}</h2>
<p>Click the button below to sign in:</p>
<p style="text-align: center; margin: 30px 0;">
<a href="{magic_url}"
style="background: #4361ee; color: white; padding: 12px 32px;
text-decoration: none; border-radius: 6px; font-weight: 500;">
Sign In
</a>
</p>
<p style="color: #666; font-size: 14px;">
This link expires in 15 minutes.<br>
If you didn't request this, ignore this email.
</p>
<hr style="border: none; border-top: 1px solid #eee; margin: 20px 0;">
<p style="color: #999; font-size: 12px;">
Or copy and paste this URL into your browser:<br>
<code style="word-break: break-all;">{magic_url}</code>
</p>
</body>
</html>"""
    msg.attach(MIMEText(text_body, "plain"))
    msg.attach(MIMEText(html_body, "html"))

    try:
        smtp_port = Config.SMTP_PORT
        use_tls = Config.SMTP_USE_TLS
        # Port 465 uses SMTP_SSL; any other port may upgrade via STARTTLS.
        implicit_tls = smtp_port == 465
        if implicit_tls:
            connection = smtplib.SMTP_SSL(smtp_host, smtp_port, timeout=10)
        else:
            connection = smtplib.SMTP(smtp_host, smtp_port, timeout=10)

        # The context manager issues QUIT and closes the socket even when
        # starttls/login/sendmail raises, so a failed send cannot leak it.
        with connection as server:
            if not implicit_tls and use_tls:
                server.starttls()
            smtp_user = Config.SMTP_USER
            smtp_password = Config.SMTP_PASSWORD
            if smtp_user and smtp_password:
                server.login(smtp_user, smtp_password)
            server.sendmail(Config.SMTP_FROM, [email], msg.as_string())

        logger.info("Magic link email sent to %s via SMTP", email)
        return True
    except Exception as e:
        logger.error("Failed to send magic link email to %s: %s", email, e)
        return False
# --- Routes ---
@email_bp.route("/login/email")
def login_email_form():
    """Render the email entry form, passing the allowed domains to the template."""
    context = {"allowed_domains": Config.ALLOWED_DOMAINS}
    return render_template("login_magic_link.html", **context)
@email_bp.route("/login/email/send", methods=["POST"])
def send_magic_link():
    """Validate the submitted email and deliver a magic sign-in link.

    Flow:
        1. Reject empty or non-allowed addresses with a flash message.
        2. Generate a signed token and the absolute verification URL.
        3. Try SMTP delivery. On success, show the "sent" page.
        4. If SMTP is configured but delivery failed, report an error and
           never reveal the link: showing it would let anyone sign in as any
           allowed address whenever the mail relay is down.
        5. Only when no SMTP host is configured (console/development mode),
           log the link and show it in the browser.
    """
    email = request.form.get("email", "").strip().lower()

    if not email:
        flash("Please enter your email address.", "error")
        return redirect(url_for("email_auth.login_email_form"))

    if not validate_email_domain(email):
        domains_str = ", ".join(f"@{d}" for d in Config.ALLOWED_DOMAINS)
        flash(
            f"Only {domains_str} email addresses are allowed.",
            "error",
        )
        return redirect(url_for("email_auth.login_email_form"))

    # Generate magic link
    token = _generate_magic_token(email)
    magic_url = url_for("email_auth.verify_magic_link", token=token, _external=True)

    if _send_magic_email(email, magic_url):
        flash("Check your email for the sign-in link.", "info")
        return render_template(
            "login_magic_link_sent.html",
            email=email,
            console_mode=False,
        )

    if Config.SMTP_HOST:
        # A relay is configured, so this is a delivery failure, not dev mode.
        # The sender already logged the cause; keep the link secret.
        flash("We could not send the sign-in email. Please try again later.", "error")
        return redirect(url_for("email_auth.login_email_form"))

    # Console/development mode (no SMTP host configured) - show link directly
    logger.info("MAGIC LINK for %s: %s", email, magic_url)
    return render_template(
        "login_magic_link_sent.html",
        email=email,
        magic_url=magic_url,
        console_mode=True,
    )
@email_bp.route("/login/email/verify/<token>")
def verify_magic_link(token: str):
    """Consume a magic-link token and, if it is valid, start the user's session.

    Invalid or expired tokens send the user back to the email form. A valid
    token for an address that is no longer allowed sends them to the login page.
    """
    email = _verify_magic_token(token)
    if not email:
        flash("This sign-in link has expired or is invalid. Please try again.", "error")
        return redirect(url_for("email_auth.login_email_form"))

    # The allow-list may have changed after the token was issued, so re-check.
    if not validate_email_domain(email):
        flash("Your email is no longer authorized.", "error")
        return redirect(url_for("auth.login"))

    # Derive a display name from the local part: "jane.doe" -> "Jane Doe".
    local_part = email.split("@")[0]
    display_name = local_part.replace(".", " ").title()

    # Session payload uses the same keys as the other auth providers.
    session["user"] = dict(email=email, name=display_name, picture="")

    logger.info("User logged in via magic link: %s", email)
    return redirect(url_for("dashboard"))
# --- Provider class ---
class EmailAuthProvider(AuthProvider):
    """Provider entry for email magic-link sign-in."""

    def get_name(self) -> str:
        """Return the internal identifier of this provider."""
        return "email"

    def get_display_name(self) -> str:
        """Return the UI label of this provider."""
        return "Email"

    def get_blueprint(self) -> Blueprint:
        """Return the blueprint holding the magic-link routes."""
        return email_bp

    def get_login_button(self) -> dict:
        """Describe the login button; the subtitle lists the allowed domains, if any."""
        domains = Config.ALLOWED_DOMAINS
        if len(domains) > 1:
            domain_str = ", ".join(f"@{d}" for d in domains)
        else:
            domain_str = f"@{domains[0]}" if domains else ""

        if domain_str:
            subtitle = f'For <strong>{domain_str}</strong> email addresses.'
        else:
            subtitle = ""

        return dict(
            text="Sign in with Email",
            url="/login/email",
            icon_html=_EMAIL_ICON_HTML,
            subtitle=subtitle,
            order=20,
            css_class="btn-email",
            visible=True,
        )

    def is_available(self) -> bool:
        """Report availability: True when at least one allowed domain is configured."""
        return len(Config.ALLOWED_DOMAINS) >= 1

    def init_app(self, app) -> None:
        """No additional initialization needed."""
        return None
# Module-level provider instance for auto-discovery
provider = EmailAuthProvider()

View file

@ -1,157 +0,0 @@
"""
Google OAuth authentication provider.
Handles Google Sign-In flow with domain validation.
Google OAuth flow with domain validation (Flask blueprint).
"""
import logging
from authlib.integrations.flask_client import OAuth
from flask import Blueprint, flash, redirect, session, url_for
import os
from auth import AuthProvider
from app.instance_config import get_allowed_domains
_ALLOWED_DOMAINS = get_allowed_domains()
_ALLOWED_EMAILS = [
e.strip().lower()
for e in os.environ.get("ALLOWED_EMAILS", "").split(",")
if e.strip()
]
def validate_email_domain(email: str) -> bool:
    """Decide whether *email* may sign in.

    An address is accepted when it is listed verbatim (case-insensitively) in
    ``_ALLOWED_EMAILS``, or when it has the shape ``local@domain`` and the
    domain is in ``_ALLOWED_DOMAINS``.

    Args:
        email: Address to check; empty or None-like values are rejected.

    Returns:
        True if the address is allowed, False otherwise.
    """
    if not email:
        return False
    email_lower = email.lower()
    if email_lower in _ALLOWED_EMAILS:
        return True
    # Require a real "local@domain" shape. A bare "example.com" or
    # "@example.com" must not be treated as an address in that domain.
    local_part, at_sign, domain = email_lower.rpartition("@")
    if not (local_part and at_sign and domain):
        return False
    return domain in _ALLOWED_DOMAINS
class _Config:
    """Settings for the Google provider, resolved once at import time."""

    ALLOWED_DOMAINS = _ALLOWED_DOMAINS
    # OAuth client credentials from the environment ("" when unset).
    GOOGLE_CLIENT_ID = os.getenv("GOOGLE_CLIENT_ID", "")
    GOOGLE_CLIENT_SECRET = os.getenv("GOOGLE_CLIENT_SECRET", "")


# Alias under which the rest of the module reads the settings.
Config = _Config
logger = logging.getLogger(__name__)
google_bp = Blueprint("google_auth", __name__)
oauth = OAuth()
# Google SVG icon for the login button
_GOOGLE_ICON_HTML = (
'<svg class="google-icon" viewBox="0 0 24 24" width="24" height="24">'
'<path fill="#4285F4" d="M22.56 12.25c0-.78-.07-1.53-.2-2.25H12v4.26h5.92c-.26 '
"1.37-1.04 2.53-2.21 3.31v2.77h3.57c2.08-1.92 3.28-4.74 3.28-8.09z\"/>"
'<path fill="#34A853" d="M12 23c2.97 0 5.46-.98 7.28-2.66l-3.57-2.77c-.98.66-2.23 '
"1.06-3.71 1.06-2.86 0-5.29-1.93-6.16-4.53H2.18v2.84C3.99 20.53 7.7 23 12 23z\"/>"
'<path fill="#FBBC05" d="M5.84 14.09c-.22-.66-.35-1.36-.35-2.09s.13-1.43.35-2.09V7.07'
'H2.18C1.43 8.55 1 10.22 1 12s.43 3.45 1.18 4.93l2.85-2.22.81-.62z"/>'
'<path fill="#EA4335" d="M12 5.38c1.62 0 3.06.56 4.21 1.64l3.15-3.15C17.45 2.09 '
'14.97 1 12 1 7.7 1 3.99 3.47 2.18 7.07l3.66 2.84c.87-2.6 3.3-4.53 6.16-4.53z"/>'
"</svg>"
)
@google_bp.route("/login/google")
def login_google():
    """Start the Google OAuth flow by redirecting to Google's authorization page."""
    callback_url = url_for("google_auth.authorize", _external=True)
    google_client = oauth.google
    return google_client.authorize_redirect(callback_url)
@google_bp.route("/authorize")
def authorize():
    """Handle the OAuth callback and sign the user in when the email is allowed.

    Every failure path (missing userinfo, disallowed address, any exception
    during the exchange) flashes an error and redirects to the login page.
    """
    try:
        token = oauth.google.authorize_access_token()
        userinfo = token.get("userinfo")

        if not userinfo:
            logger.warning("No userinfo in OAuth response")
            flash("Failed to get user information from Google.", "error")
            return redirect(url_for("auth.login"))

        email = userinfo.get("email", "")
        name = userinfo.get("name", "")

        if validate_email_domain(email):
            # Session payload uses the same keys as the other auth providers.
            session["user"] = dict(
                email=email,
                name=name,
                picture=userinfo.get("picture", ""),
            )
            logger.info(f"User logged in via Google: {email}")
            return redirect(url_for("dashboard"))

        # Address is neither allow-listed nor in an allowed domain.
        logger.warning(f"Login attempt from non-allowed domain: {email}")
        domains_str = ", ".join(f"@{d}" for d in Config.ALLOWED_DOMAINS)
        flash(
            f"Only {domains_str} email addresses are allowed.", "error"
        )
        return redirect(url_for("auth.login"))

    except Exception as e:
        logger.exception(f"OAuth error: {e}")
        flash("Authentication failed. Please try again.", "error")
        return redirect(url_for("auth.login"))
class GoogleAuthProvider(AuthProvider):
    """Provider entry for Google OAuth sign-in."""

    def get_name(self) -> str:
        """Return the internal identifier of this provider."""
        return "google"

    def get_display_name(self) -> str:
        """Return the UI label of this provider."""
        return "Google"

    def get_blueprint(self) -> Blueprint:
        """Return the blueprint holding the Google login and callback routes."""
        return google_bp

    def get_login_button(self) -> dict:
        """Describe the login button; the subtitle lists the allowed domains, if any."""
        domains = Config.ALLOWED_DOMAINS
        if len(domains) > 1:
            domain_str = ", ".join(f"@{d}" for d in domains)
        elif domains:
            domain_str = f"@{domains[0]}"
        else:
            domain_str = ""

        if domain_str:
            subtitle = f'For <strong>{domain_str}</strong> email addresses.'
        else:
            subtitle = ""

        return dict(
            text="Sign in with Google",
            url="/login/google",
            icon_html=_GOOGLE_ICON_HTML,
            subtitle=subtitle,
            order=10,
            css_class="btn-google",
            visible=True,
        )

    def is_available(self) -> bool:
        """Report availability: True only when GOOGLE_CLIENT_ID is non-empty."""
        return True if Config.GOOGLE_CLIENT_ID else False

    def init_app(self, app) -> None:
        """Bind the OAuth registry to the Flask app and register the Google client."""
        oauth.init_app(app)
        registration = dict(
            name="google",
            client_id=Config.GOOGLE_CLIENT_ID,
            client_secret=Config.GOOGLE_CLIENT_SECRET,
            server_metadata_url="https://accounts.google.com/.well-known/openid-configuration",
            client_kwargs={
                "scope": "openid email profile",
            },
        )
        oauth.register(**registration)
# Module-level provider instance for auto-discovery
provider = GoogleAuthProvider()

View file

@ -1,59 +0,0 @@
"""
Email/password authentication provider.
Email/password authentication (Flask blueprint).
Available only when SENDGRID_API_KEY is configured.
"""
import logging
from flask import Blueprint
import os
from auth import AuthProvider
class _Config:
    """Settings for the password provider, read from the environment at import time."""

    # Value of the SENDGRID_API_KEY environment variable ("" when unset).
    SENDGRID_API_KEY = os.getenv("SENDGRID_API_KEY", "")


# Alias under which the rest of the module reads the settings.
Config = _Config
logger = logging.getLogger(__name__)
class PasswordAuthProvider(AuthProvider):
    """Provider entry for email/password sign-in of external users."""

    def get_name(self) -> str:
        """Return the internal identifier of this provider."""
        return "password"

    def get_display_name(self) -> str:
        """Return the UI label of this provider."""
        return "Email"

    def get_blueprint(self) -> Blueprint:
        """Return an empty placeholder blueprint."""
        # Legacy Flask blueprint — removed with webapp/
        return Blueprint("password_auth", __name__)

    def get_login_button(self) -> dict:
        """Describe the login button shown for this provider."""
        return dict(
            text="Sign in with Email",
            url="/login/email",
            icon_html="",
            subtitle="For external users (investors, partners).",
            order=20,
            css_class="btn-secondary",
            visible=True,
        )

    def is_available(self) -> bool:
        """Report availability: True only when SENDGRID_API_KEY is non-empty."""
        return True if Config.SENDGRID_API_KEY else False

    def init_app(self, app) -> None:
        """No additional initialization needed."""
        return None
# Module-level provider instance for auto-discovery
provider = PasswordAuthProvider()

View file

@ -6,33 +6,43 @@ requires-python = ">=3.9"
license = "MIT"
dependencies = [
# Core database
"duckdb>=0.9.0",
# Web framework (FastAPI)
"fastapi>=0.115.0",
"uvicorn[standard]>=0.32.0",
"python-multipart>=0.0.9",
"jinja2>=3.1.0",
# Authentication
"PyJWT>=2.8.0",
"itsdangerous>=2.1.0",
# HTTP client
"httpx>=0.27.0",
# CLI
"typer>=0.12.0",
"rich>=13.0.0",
# Configuration
"python-dotenv>=1.0.0",
"pyyaml>=6.0",
# Data processing
"pandas>=2.0.0",
"pyarrow>=12.0.0",
"pytz>=2024.1",
# Data source connectors
"kbcstorage>=0.9.0",
"google-cloud-bigquery>=3.0.0",
"google-cloud-bigquery-storage>=2.0.0",
# Profiler visualizations
"matplotlib>=3.8.0",
"numpy>=1.24.0",
# Sample data generation
"faker>=24.0.0",
]
[project.scripts]
da = "cli.main:app"
[project.optional-dependencies]
connectors = [
"kbcstorage>=0.9.0",
"google-cloud-bigquery>=3.0.0",
"google-cloud-bigquery-storage>=2.0.0",
"pandas>=2.0.0",
"pyarrow>=12.0.0",
]
telegram = [
"aiohttp>=3.9.0",
]
dev = [
"pytest>=7.0.0",
"pytest-mock>=3.0.0",

View file

@ -1,69 +1,40 @@
# Data source adapters (install only what you need)
kbcstorage>=0.9.0 # For Keboola adapter
google-cloud-bigquery>=3.0.0 # For BigQuery adapter
google-cloud-bigquery-storage>=2.0.0 # For BigQuery adapter (fast Arrow transfer)
# Data processing
# pandas - core tabular data processing library
# pyarrow - Parquet format support and fast operations
# pytz - timezone support required by DuckDB for reading timezone-aware Parquet columns
pandas>=2.0.0
pyarrow>=12.0.0
pytz>=2024.1
# Analytical database
# DuckDB - in-process SQL OLAP database for analytical queries
# Core database
duckdb>=0.9.0
# Configuration
# python-dotenv - loading environment variables from .env files
# pyyaml - parsing YAML configuration from data_description.md
python-dotenv>=1.0.0
pyyaml>=6.0
# Progress tracking and logging
# tqdm - progress bars for long-running operations (download, sync)
tqdm>=4.65.0
# Web application (Google SSO portal)
# flask - web framework for self-service portal (legacy, being replaced by FastAPI)
# authlib - OAuth 2.0 / OpenID Connect library for Google SSO
# gunicorn - WSGI server for production deployment
flask>=3.0.0
authlib>=1.3.0
gunicorn>=21.0.0
# FastAPI - new unified web framework (API + web UI)
# Web framework (FastAPI)
fastapi>=0.115.0
uvicorn[standard]>=0.32.0
python-multipart>=0.0.9
jinja2>=3.1.0
# Telegram notification bot
# httpx - async HTTP client for Telegram API and unix socket communication
# aiohttp - async HTTP server for bot's internal send API
# Authentication
PyJWT>=2.8.0
itsdangerous>=2.1.0
# HTTP client
httpx>=0.27.0
aiohttp>=3.9.0
# CLI
typer>=0.12.0
rich>=13.0.0
# Configuration
python-dotenv>=1.0.0
pyyaml>=6.0
# Data processing
pandas>=2.0.0
pyarrow>=12.0.0
pytz>=2024.1
# Data source connectors
kbcstorage>=0.9.0
google-cloud-bigquery>=3.0.0
google-cloud-bigquery-storage>=2.0.0
# Profiler visualizations
matplotlib>=3.8.0
numpy>=1.24.0
# Desktop app authentication
# PyJWT - JWT token creation and validation for desktop app auth
PyJWT>=2.8.0
# Password authentication for external users
# argon2-cffi - modern password hashing algorithm (Argon2id)
# sendgrid - email service for setup/reset links
argon2-cffi>=23.1.0
sendgrid>=6.11.0
# Corporate Memory knowledge extraction
# anthropic - Claude API client for HAIKU-based knowledge extraction
anthropic>=0.39.0
# OpenAI-compatible API client for LLM proxy routing (LiteLLM, OpenRouter, etc.)
openai>=1.0.0
# Sample data generation (development/testing)
# faker - realistic synthetic data for demo datasets
# Sample data generation
faker>=24.0.0

View file

@ -39,16 +39,19 @@ echo "3⃣ Activating virtual environment..."
source .venv/bin/activate
echo " ✅ Virtual environment activated"
# Upgrade pip
# Install uv if not available
echo ""
echo "4⃣ Upgrading pip..."
pip install --upgrade pip --quiet
echo " ✅ pip upgraded"
echo "4⃣ Checking uv..."
if ! command -v uv &> /dev/null; then
    echo " Installing uv..."
    curl -LsSf https://astral.sh/uv/install.sh | sh
    # The installer places uv in a per-user bin directory that this running
    # shell has not added to PATH yet; without this the next step cannot find it.
    export PATH="$HOME/.local/bin:$HOME/.cargo/bin:$PATH"
    # A failed download inside the pipe is not reported by `sh`, so verify.
    if ! command -v uv &> /dev/null; then
        echo " ❌ uv installation failed - install it manually: https://docs.astral.sh/uv/"
        exit 1
    fi
fi
echo " ✅ uv available"
# Install dependencies
echo ""
echo "5⃣ Installing dependencies from requirements.txt..."
pip install -r requirements.txt --quiet
uv pip install -r requirements.txt --quiet
echo " ✅ Dependencies installed"
# Create folders

View file

@ -1,5 +1,6 @@
"""Sync orchestrator — ATTACHes extract.duckdb files into master analytics.duckdb."""
import hashlib
import logging
import os
import threading
@ -144,28 +145,39 @@ class SyncOrchestrator:
tables.append(table_name)
# Update sync_state in system DB
self._update_sync_state(meta_rows)
self._update_sync_state(meta_rows, source_name)
except Exception as e:
logger.error("Failed to attach %s: %s", source_name, e)
return tables
def _update_sync_state(self, meta_rows: list) -> None:
def _update_sync_state(self, meta_rows: list, source_name: str) -> None:
"""Update sync_state table in system.duckdb from _meta entries."""
try:
from src.db import get_system_db
from src.repositories.sync_state import SyncStateRepository
extracts_dir = _get_extracts_dir()
sys_conn = get_system_db()
try:
repo = SyncStateRepository(sys_conn)
for table_name, rows, size_bytes, query_mode in meta_rows:
# Compute hash from parquet file stats (fast, no file read)
pq_path = extracts_dir / source_name / "data" / f"{table_name}.parquet"
if pq_path.exists():
stat = pq_path.stat()
file_hash = hashlib.md5(
f"{stat.st_mtime_ns}:{stat.st_size}".encode()
).hexdigest()[:12]
else:
file_hash = ""
repo.update_sync(
table_id=table_name,
rows=rows or 0,
file_size_bytes=size_bytes or 0,
hash="", # TODO: compute from parquet file
hash=file_hash,
)
finally:
sys_conn.close()