# agnes-the-ai-analyst/services/corporate_memory/entities.py

"""Entity resolution v1 for corporate memory.

Simple case-insensitive string matching against a static entity registry.
Runs as post-processing on new knowledge items to tag them with recognized entities.
"""

import json
import logging
from typing import Any

logger = logging.getLogger(__name__)


def build_entity_registry(
    groups: dict[str, Any] | None = None,
    domain_owners: dict[str, list[str]] | None = None,
    entity_config: dict[str, list[str]] | None = None,
    metric_names: list[str] | None = None,
) -> dict[str, list[str]]:
    """Build a flat entity registry from various config sources.

    Returns dict mapping category -> list of entity names.
    """
    registry: dict[str, list[str]] = {}

    if groups:
        registry["teams"] = list(groups.keys())

    if domain_owners:
        registry["domains"] = list(domain_owners.keys())

    if entity_config:
        for category, entities in entity_config.items():
            registry[category] = entities

    if metric_names:
        existing_metrics = registry.get("metrics", [])
        registry["metrics"] = list(set(existing_metrics + metric_names))

    return registry


def resolve_entities(
    content: str,
    title: str,
    entity_registry: dict[str, list[str]],
) -> list[str]:
    """Find entity matches in title and content using case-insensitive substring matching.

    Args:
        content: Body text to scan; a falsy value (e.g. ``None``) is treated as empty.
        title: Title text to scan; a falsy value (e.g. ``None``) is treated as empty.
        entity_registry: Mapping of category -> entity names to look for.

    Returns:
        Sorted, deduplicated list of matched entity names, spelled as they
        appear in the registry.
    """
    # Coerce falsy inputs to "" so a None is not formatted as the literal
    # word "None" and then matched against entity names.
    text = f"{title or ''} {content or ''}".lower()
    matched: set[str] = set()

    for entities in entity_registry.values():
        for entity in entities:
            # Blank names would match every text: "" is a substring of
            # anything, and " " always hits the title/content separator.
            if not entity.strip():
                continue
            if entity.lower() in text:
                matched.add(entity)

    return sorted(matched)


def resolve_and_merge(
    item: dict,
    entity_registry: dict[str, list[str]],
) -> list[str]:
    """Resolve entities for an item and merge with any existing entity tags.

    Args:
        item: Knowledge item; reads its "entities", "content" and "title" keys.
            "entities" may be a list or a JSON-encoded string of a list.
        entity_registry: Mapping of category -> entity names to look for.

    Returns:
        Combined, sorted, deduplicated entity list.
    """
    existing = item.get("entities") or []
    if isinstance(existing, str):
        try:
            existing = json.loads(existing)
        except (json.JSONDecodeError, TypeError):
            existing = []
        # Valid JSON that is not a list (null, number, string, object) is
        # ignored like invalid JSON; otherwise set() below would either
        # raise or split a string into single characters.
        if not isinstance(existing, list):
            existing = []

    # `or ""` also covers keys that are present but hold None, which
    # dict.get's default argument does not.
    resolved = resolve_entities(
        content=item.get("content") or "",
        title=item.get("title") or "",
        entity_registry=entity_registry,
    )

    return sorted(set(existing) | set(resolved))