agnes-the-ai-analyst/src/scheduler.py
Petr 80c5b902e0 Add scheduled data sync and catalog refresh with systemd timers
- New sync_schedule and profile_after_sync fields in TableConfig
  (formats: "every 15m", "every 1h", "daily 05:00")
- New src/scheduler.py with schedule evaluation logic (is_table_due)
- New --scheduled mode in data_sync.py: only syncs tables that are due,
  respects profile_after_sync flag, auto-restarts webapp after profiling
- Systemd timer+service for data-refresh (every 15 min)
- Systemd timer+service for catalog-refresh (every 15 min)
- deploy.sh enables new timers automatically
- Complete table config reference in data_description.md.example
- 58 new scheduler tests
2026-03-15 02:16:31 +01:00

158 lines
4.5 KiB
Python

"""
Schedule evaluation for automatic data sync.
Parses sync_schedule strings from table configuration and determines
whether a table is due for synchronization based on its last sync time.
Schedule formats:
"every 15m" - every 15 minutes
"every 1h" - every hour
"daily 05:00" - once per day at 05:00 UTC
"""
import logging
import re
from datetime import datetime, timezone
from typing import Optional
logger = logging.getLogger(__name__)

# Interval schedules: "every <N>m" (minutes) or "every <N>h" (hours),
# e.g. "every 15m", "every 2h". Group 1 is the count, group 2 is the unit.
INTERVAL_PATTERN = re.compile(r"^every (\d+)([mh])$")

# Daily schedules: "daily HH:MM", e.g. "daily 05:00", "daily 17:30".
# Group 1 is the hour (00-23), group 2 is the minute (00-59). The ranges are
# enforced by the pattern itself so that impossible times such as "daily 24:00"
# or "daily 12:75" are rejected as unknown schedules instead of raising
# ValueError later when the values are fed to datetime.replace().
DAILY_PATTERN = re.compile(r"^daily ([01]\d|2[0-3]):([0-5]\d)$")
def parse_interval_minutes(schedule: str) -> Optional[int]:
    """Convert an "every N<unit>" schedule string into a number of minutes.

    Args:
        schedule: Schedule string such as "every 15m" or "every 1h".

    Returns:
        The interval expressed in minutes, or None when the string does not
        match the interval format (for example a "daily HH:MM" schedule).
    """
    parsed = INTERVAL_PATTERN.match(schedule)
    if parsed is None:
        return None

    amount, unit = parsed.groups()
    # The pattern only admits "m" or "h"; hours are scaled up to minutes.
    minutes_per_unit = 60 if unit == "h" else 1
    return int(amount) * minutes_per_unit
def is_table_due(
    schedule: str,
    last_sync_iso: Optional[str],
    now: Optional[datetime] = None,
) -> bool:
    """Determine whether a table is due for sync based on its schedule.

    Args:
        schedule: Schedule string from table config
            (e.g., "every 1h", "daily 05:00").
        last_sync_iso: ISO timestamp of last sync, or None if never synced.
        now: Current time. Defaults to datetime.now(timezone.utc). A naive
            value is interpreted as UTC; an aware value in another zone is
            converted to UTC so that "daily HH:MM" is always evaluated
            against the UTC wall clock.

    Returns:
        True if the table should be synced now. A table that was never
        synced, or whose last-sync timestamp cannot be parsed, is always due.
        An unrecognised schedule (including a daily time outside
        00:00-23:59) logs a warning and returns False.
    """
    if now is None:
        now = datetime.now(timezone.utc)
    elif now.tzinfo is None:
        # Mixing a naive `now` with the aware last_sync below would raise
        # TypeError; treat naive input as UTC, same as for last_sync.
        now = now.replace(tzinfo=timezone.utc)
    else:
        # Same instant, but expressed in UTC so daily targets are UTC-based.
        now = now.astimezone(timezone.utc)

    # Never synced -> always due
    if not last_sync_iso:
        logger.info("Table never synced, marking as due")
        return True

    # Parse last_sync timestamp; an unreadable value errs on the side of syncing
    last_sync = _parse_timestamp(last_sync_iso)
    if last_sync is None:
        logger.warning(f"Cannot parse last_sync timestamp: {last_sync_iso}, marking as due")
        return True

    # Ensure timezone-aware comparison
    if last_sync.tzinfo is None:
        last_sync = last_sync.replace(tzinfo=timezone.utc)

    # Check interval schedule: "every Xm" / "every Xh"
    interval_minutes = parse_interval_minutes(schedule)
    if interval_minutes is not None:
        elapsed_minutes = (now - last_sync).total_seconds() / 60
        due = elapsed_minutes >= interval_minutes
        if due:
            logger.debug(
                f"Interval schedule: {elapsed_minutes:.0f}m elapsed >= {interval_minutes}m interval"
            )
        return due

    # Check daily schedule: "daily HH:MM"
    match = DAILY_PATTERN.match(schedule)
    if match:
        target_hour = int(match.group(1))
        target_minute = int(match.group(2))
        # Guard the range here: an impossible time such as 24:00 or 12:75
        # would make datetime.replace() raise ValueError. Fall through to the
        # unknown-format warning instead of crashing the scheduler run.
        if target_hour <= 23 and target_minute <= 59:
            return _is_daily_due(last_sync, now, target_hour, target_minute)

    logger.warning(f"Unknown schedule format: {schedule}")
    return False
def _is_daily_due(
    last_sync: datetime,
    now: datetime,
    target_hour: int,
    target_minute: int,
) -> bool:
    """Decide whether a "daily HH:MM" schedule should fire at `now`.

    The schedule fires when both conditions hold:
      1. `now` is at or after HH:MM on `now`'s own calendar day, and
      2. `last_sync` is strictly before that same HH:MM moment.

    So the first scheduler tick after HH:MM triggers the sync, and later
    ticks on the same day are skipped because `last_sync` has moved past
    the target.

    Args:
        last_sync: Time of the previous sync.
        now: Current time; the target is built on this value's date.
        target_hour: Hour component of the daily target.
        target_minute: Minute component of the daily target.

    Returns:
        True if the daily sync is due, False otherwise.
    """
    # HH:MM:00.000000 on the same date (and tzinfo) as `now`.
    scheduled_at = now.replace(
        hour=target_hour, minute=target_minute, second=0, microsecond=0
    )

    # Either the target has not arrived yet, or this day's run already
    # happened. The second comparison is only evaluated once the target has
    # been reached.
    if now < scheduled_at or last_sync >= scheduled_at:
        return False

    logger.debug(
        f"Daily schedule: target {target_hour:02d}:{target_minute:02d} UTC, "
        f"last sync {last_sync.isoformat()}, now {now.isoformat()} -> due"
    )
    return True
def _parse_timestamp(iso_string: str) -> Optional[datetime]:
    """Parse an ISO 8601 timestamp string, tolerating common variants.

    Handles a trailing "Z" (UTC designator) on every supported Python
    version, surrounding whitespace, "T" or space separators, optional
    fractional seconds and optional UTC offsets.

    Args:
        iso_string: ISO 8601 timestamp string.

    Returns:
        A datetime (aware if the input carried an offset or "Z", naive
        otherwise), or None if the value is not a string or cannot be parsed.
        This function never raises for bad input.
    """
    # Non-string input (None, numbers, ...) would raise TypeError from both
    # fromisoformat() and strptime(); report it as unparseable instead.
    if not isinstance(iso_string, str):
        return None

    text = iso_string.strip()
    if not text:
        return None

    # datetime.fromisoformat() only accepts the "Z" suffix from Python 3.11
    # on; rewrite it as an explicit offset so older interpreters agree.
    if text[-1] in "Zz":
        text = text[:-1] + "+00:00"

    try:
        return datetime.fromisoformat(text)
    except ValueError:
        pass

    # Fallback: explicit formats for shapes older fromisoformat() rejects,
    # e.g. fractional seconds that are not exactly 3 or 6 digits.
    fallback_formats = (
        "%Y-%m-%dT%H:%M:%S.%f%z",
        "%Y-%m-%dT%H:%M:%S%z",
        "%Y-%m-%dT%H:%M:%S.%f",
        "%Y-%m-%dT%H:%M:%S",
        "%Y-%m-%d %H:%M:%S",
    )
    for fmt in fallback_formats:
        try:
            return datetime.strptime(text, fmt)
        except ValueError:
            continue
    return None