feat(batch-agent): add E2E evaluation harness with Langfuse integration

- eval/mock_executor.py: intercepts execute_on_client, serves fixture files from disk, records all mutations (insert/update/delete)
- eval/config.py: YAML fixture loader with prompt variants, expected results, seed records, model overrides
- eval/scorer.py: FieldMatchScorer (fuzzy title match, per-field accuracy, precision/recall/F1) + LLMJudgeScorer (semantic eval)
- eval/langfuse_eval.py: sync fixtures to Langfuse datasets, create dataset runs, post scores, link traces to runs
- eval/runner.py: orchestrates fixture → mock → agent pipeline → scoring → Langfuse reporting
- eval/cli.py: CLI (python -m eval run/list/sync) with --models, --variants, --fixture, --no-judge flags
- eval/fixtures/: example Italian freelance scenario with 3 prompt variants (baseline, detailed_italian, minimal)
2026-03-23 08:54:19 +01:00
parent 971f1dd84f
commit 75a826c9d8
12 changed files with 1382 additions and 0 deletions
--- a/services/batch-agent/eval/config.py
+++ b/services/batch-agent/eval/config.py
@@ -0,0 +1,129 @@
+"""Eval configuration — YAML fixture loader and dataclasses.
+
+A *fixture* is a YAML file that defines a complete test scenario:
+
+.. code-block:: yaml
+
+    name: freelance-invoices
+    description: Extract tasks and notes from invoice PDFs (text layer)
+    directory: sample_files/invoices      # relative to fixture dir
+    data_types: [tasks, notes]
+    file_extensions: [txt, md]
+
+    # Preseeded records the agent "sees" as existing data
+    seed_records:
+      projects:
+        - id: proj-1
+          name: "Website Redesign"
+          status: active
+      tasks: []
+
+    # Prompt variations to test (at least one required)
+    prompt_variants:
+      baseline: |
+        Extract action items as tasks and meeting summaries as notes.
+        Set priority based on urgency keywords.
+      detailed: |
+        Extract action items as tasks. Map "URGENT" to high priority,
+        "ASAP" to medium. Summaries become notes with full content.
+
+    # Expected extractions — what the agent SHOULD produce
+    expected:
+      tasks:
+        - title: "Send revised invoice to client"
+          priority: high
+          status: todo
+        - title: "Update project timeline"
+          priority: medium
+      notes:
+        - title: "Meeting summary - March kickoff"
+
+    # Optional: models to test (overrides CLI --models)
+    models: []
+"""
+
+from __future__ import annotations
+
+import logging
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any
+
+import yaml
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class ExpectedRecord:
+    """One record the agent is expected to extract into a given table.
+
+    ``fields`` holds only the values to verify; any field left out is ignored.
+    """
+
+    table: str  # target table: tasks | notes | timelines | projects
+    fields: dict[str, Any]  # expected value keyed by field name
+
+
+@dataclass
+class EvalFixture:
+    """A complete test scenario loaded from a YAML fixture file."""
+
+    name: str
+    description: str
+    directory: str  # sample files dir, relative to the fixture file's folder
+    data_types: list[str]
+    file_extensions: list[str]
+    seed_records: dict[str, list[dict]]
+    prompt_variants: dict[str, str]  # variant_name → prompt_template
+    expected: list[ExpectedRecord]
+    models: list[str]  # if empty, use CLI default
+    fixture_path: Path = field(default_factory=lambda: Path("."))
+
+    @property
+    def fixture_dir(self) -> Path:
+        """Absolute path to the sample files directory."""
+        return (self.fixture_path.parent / self.directory).resolve()
+
+    @classmethod
+    def from_yaml(cls, path: Path) -> "EvalFixture":
+        """Load a fixture from a YAML file.
+
+        Raises ValueError if the file is not a mapping and KeyError if ``name``
+        is missing. Optional keys left absent or null take their defaults.
+        """
+        raw = yaml.safe_load(path.read_text(encoding="utf-8"))
+        if not isinstance(raw, dict):  # e.g. an empty file parses to None
+            raise ValueError(f"fixture {path} must contain a YAML mapping")
+        defaults: dict[str, Any] = {
+            "description": "", "directory": "sample_files", "data_types": ["tasks"],
+            "file_extensions": [], "seed_records": {}, "models": [],
+            "prompt_variants": {"default": ""},
+        }
+        opts = {k: d if raw.get(k) is None else raw[k] for k, d in defaults.items()}
+        expected = [
+            ExpectedRecord(table=table, fields=rec)
+            for table, records in (raw.get("expected") or {}).items()
+            for rec in records or []  # a table with no records contributes nothing
+        ]
+        return cls(name=raw["name"], expected=expected, fixture_path=path, **opts)
+
+
+def discover_fixtures(fixtures_dir: Path | None = None) -> list[EvalFixture]:
+    """Load all ``*.yaml`` fixtures from *fixtures_dir*, skipping files that fail."""
+    root = Path(__file__).parent / "fixtures" if fixtures_dir is None else fixtures_dir
+    loaded: list[EvalFixture] = []
+    if not root.is_dir():
+        logger.warning("eval: fixtures directory not found: %s", root)
+        return loaded
+
+    # Visit files in sorted path order; a file that fails is logged, not fatal.
+    for candidate in sorted(root.glob("*.yaml")):
+        try:
+            fixture = EvalFixture.from_yaml(candidate)
+            loaded.append(fixture)
+            logger.info("eval: loaded fixture %s from %s", fixture.name, candidate.name)
+        except Exception as exc:
+            logger.error("eval: failed to load fixture %s: %s", candidate.name, exc)
+
+    return loaded