refactor(eval): 3-mode eval harness (step1/step2/full) with Langfuse fixes
- Rewrite eval config with EvalMode (step1, step2, full) replacing prompt_variants
- Rewrite runner with _run_step1, _run_step2, _run_full dispatch
- CLI: replace --variants with --mode flag
- Add 3 fixture YAMLs: classify_invoices (step1), process_invoices (step2), full_invoices (full)
- Remove old freelance_invoices fixture
- Langfuse: mode-aware dataset items (classifications for step1, extraction for step2, both for full)
- Langfuse: link both prompts (batch_file_classifier + batch_processing) in full mode
- Langfuse: post separate classification_precision/recall/f1 scores for full mode
- Langfuse: skip misleading field_accuracy=0 when field_scores is empty (step1)
- Langfuse: include step1_results in trace output
- MockExecutor: mock async_session to bypass DB in full mode
- Journey fixture: remove user_messages (only interactive test kept)
This commit is contained in:
@@ -4,14 +4,15 @@ Usage::
|
||||
|
||||
# From services/batch-agent/:
|
||||
python -m eval run # all agent fixtures, default model
|
||||
python -m eval run --fixture=freelance-invoices # single fixture
|
||||
python -m eval run --models=gpt-4o,anthropic/claude-sonnet-4
|
||||
python -m eval run --variants=baseline,detailed # specific prompt variants
|
||||
python -m eval run --fixture=classify-invoices # single fixture
|
||||
python -m eval run --models=gpt-4o,gpt-5.3-codex # multiple models
|
||||
python -m eval run --mode=step1 # only step1 fixtures
|
||||
python -m eval run --no-judge # skip LLM judge scoring
|
||||
|
||||
python -m eval journey # all journey fixtures
|
||||
python -m eval journey --fixture=journey-invoices # single journey fixture
|
||||
python -m eval journey --models=gpt-4o,anthropic/claude-sonnet-4
|
||||
python -m eval interactive # interactive journey session
|
||||
python -m eval interactive --fixture=journey-invoice-setup
|
||||
python -m eval interactive --model=gpt-4o
|
||||
python -m eval interactive --judge-model=github_copilot/gpt-4o-mini
|
||||
|
||||
python -m eval list # list all fixtures
|
||||
python -m eval sync # sync fixtures to Langfuse datasets
|
||||
@@ -25,16 +26,24 @@ import logging
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Ensure the service root and repo root are in sys.path
|
||||
# Ensure the service root and repo root are in sys.path.
|
||||
# Service root must come BEFORE repo root so its ``app/`` package
|
||||
# shadows the monolith ``app/`` in the repo root.
|
||||
_SERVICE_ROOT = Path(__file__).resolve().parent.parent
|
||||
_REPO_ROOT = _SERVICE_ROOT.parent.parent
|
||||
for p in (_SERVICE_ROOT, _REPO_ROOT):
|
||||
if str(p) not in sys.path:
|
||||
sys.path.insert(0, str(p))
|
||||
_sr = str(_SERVICE_ROOT)
|
||||
_rr = str(_REPO_ROOT)
|
||||
if _rr not in sys.path:
|
||||
sys.path.insert(0, _rr)
|
||||
# Always force service root to position 0 (python -m may have already
|
||||
# added CWD further down the list, which loses to repo root).
|
||||
if _sr in sys.path:
|
||||
sys.path.remove(_sr)
|
||||
sys.path.insert(0, _sr)
|
||||
|
||||
from eval.config import discover_fixtures, discover_journey_fixtures
|
||||
from eval.runner import run_fixture_eval, print_results
|
||||
from eval.journey_runner import run_journey_fixture_eval, print_journey_results
|
||||
from eval.interactive import run_interactive
|
||||
from eval import langfuse_eval
|
||||
|
||||
|
||||
@@ -65,13 +74,14 @@ def _parse_args() -> argparse.Namespace:
|
||||
)
|
||||
run_cmd.add_argument(
|
||||
"--models", "-m",
|
||||
default="gpt-4o",
|
||||
help="Comma-separated list of models to test (default: gpt-4o)",
|
||||
default="github_copilot/gpt-5.3-codex",
|
||||
help="Comma-separated list of models to test (default: github_copilot/gpt-5.3-codex)",
|
||||
)
|
||||
run_cmd.add_argument(
|
||||
"--variants", "-p",
|
||||
"--mode",
|
||||
default=None,
|
||||
help="Comma-separated prompt variants to test (default: all in fixture)",
|
||||
choices=["step1", "step2", "full"],
|
||||
help="Only run fixtures with this mode (default: all)",
|
||||
)
|
||||
run_cmd.add_argument(
|
||||
"--no-judge",
|
||||
@@ -80,8 +90,8 @@ def _parse_args() -> argparse.Namespace:
|
||||
)
|
||||
run_cmd.add_argument(
|
||||
"--judge-model",
|
||||
default="gpt-4o-mini",
|
||||
help="Model for LLM judge (default: gpt-4o-mini)",
|
||||
default="gpt-4o",
|
||||
help="Model for LLM judge (default: gpt-4o)",
|
||||
)
|
||||
run_cmd.add_argument(
|
||||
"--fixtures-dir",
|
||||
@@ -95,35 +105,40 @@ def _parse_args() -> argparse.Namespace:
|
||||
list_cmd.add_argument("--fixtures-dir", default=None)
|
||||
list_cmd.add_argument("-v", "--verbose", action="store_true")
|
||||
|
||||
# ── journey ───────────────────────────────────────────────────
|
||||
journey_cmd = sub.add_parser("journey", help="Run journey evaluations")
|
||||
journey_cmd.add_argument(
|
||||
"--fixture", "-f",
|
||||
help="Run only the named journey fixture (default: all)",
|
||||
)
|
||||
journey_cmd.add_argument(
|
||||
"--models", "-m",
|
||||
default="gpt-4o",
|
||||
help="Comma-separated list of models to test (default: gpt-4o)",
|
||||
)
|
||||
journey_cmd.add_argument(
|
||||
"--judge-model",
|
||||
default="gpt-4o-mini",
|
||||
help="Model for LLM judge (default: gpt-4o-mini)",
|
||||
)
|
||||
journey_cmd.add_argument(
|
||||
"--fixtures-dir",
|
||||
default=None,
|
||||
help="Path to fixtures directory (default: eval/fixtures/)",
|
||||
)
|
||||
journey_cmd.add_argument("-v", "--verbose", action="store_true")
|
||||
|
||||
# ── sync ──────────────────────────────────────────────────────
|
||||
sync_cmd = sub.add_parser("sync", help="Sync fixtures to Langfuse datasets")
|
||||
sync_cmd.add_argument("--fixture", "-f", default=None, help="Sync only the named fixture")
|
||||
sync_cmd.add_argument("--fixtures-dir", default=None)
|
||||
sync_cmd.add_argument("-v", "--verbose", action="store_true")
|
||||
|
||||
# ── interactive ───────────────────────────────────────────────
|
||||
inter_cmd = sub.add_parser("interactive", help="Interactive journey session (human-in-the-loop)")
|
||||
inter_cmd.add_argument(
|
||||
"--fixture", "-f",
|
||||
help="Journey fixture to use (default: pick interactively)",
|
||||
)
|
||||
inter_cmd.add_argument(
|
||||
"--model", "-m",
|
||||
default="github_copilot/gpt-5.3-codex",
|
||||
help="Model for the journey AI (default: github_copilot/gpt-5.3-codex)",
|
||||
)
|
||||
inter_cmd.add_argument(
|
||||
"--judge-model",
|
||||
default="gpt-4o",
|
||||
help="Model for LLM judge (default: gpt-4o)",
|
||||
)
|
||||
inter_cmd.add_argument(
|
||||
"--fixtures-dir",
|
||||
default=None,
|
||||
help="Path to fixtures directory (default: eval/fixtures/)",
|
||||
)
|
||||
inter_cmd.add_argument(
|
||||
"--data-dir",
|
||||
default=None,
|
||||
help="Override sample data directory (e.g. path to private test files not in git)",
|
||||
)
|
||||
inter_cmd.add_argument("-v", "--verbose", action="store_true")
|
||||
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
@@ -146,14 +161,14 @@ async def _cmd_run(args: argparse.Namespace) -> None:
|
||||
return
|
||||
|
||||
models = [m.strip() for m in args.models.split(",")]
|
||||
variants = [v.strip() for v in args.variants.split(",")] if args.variants else None
|
||||
|
||||
all_results = []
|
||||
for fixture in fixtures:
|
||||
if args.mode and fixture.mode != args.mode:
|
||||
continue
|
||||
results = await run_fixture_eval(
|
||||
fixture,
|
||||
models=models,
|
||||
variants=variants,
|
||||
use_llm_judge=not args.no_judge,
|
||||
judge_model=args.judge_model,
|
||||
)
|
||||
@@ -172,12 +187,12 @@ def _cmd_list(args: argparse.Namespace) -> None:
|
||||
|
||||
if fixtures:
|
||||
print(f"\n{'[Agent Fixtures]'}")
|
||||
print(f"{'Name':<30} {'Types':<25} {'Variants':<20} {'Expected'}")
|
||||
print(f"{'Name':<30} {'Mode':<6} {'Types':<25} {'Expected'}")
|
||||
print("-" * 90)
|
||||
for f in fixtures:
|
||||
variants = ", ".join(f.prompt_variants.keys())
|
||||
types = ", ".join(f.data_types)
|
||||
print(f"{f.name:<30} {types:<25} {variants:<20} {len(f.expected)}")
|
||||
n_expected = len(f.expected) + len(f.expected_classification)
|
||||
print(f"{f.name:<30} {f.mode:<6} {types:<25} {n_expected}")
|
||||
|
||||
if journey_fixtures:
|
||||
print(f"\n{'[Journey Fixtures]'}")
|
||||
@@ -217,30 +232,39 @@ def _cmd_sync(args: argparse.Namespace) -> None:
|
||||
print(f"Skipped: {fixture.name} (Langfuse not configured)")
|
||||
|
||||
|
||||
async def _cmd_journey(args: argparse.Namespace) -> None:
|
||||
async def _cmd_interactive(args: argparse.Namespace) -> None:
|
||||
journey_fixtures = discover_journey_fixtures(_fixtures_dir(args.fixtures_dir))
|
||||
if not journey_fixtures:
|
||||
print("No journey fixtures found. Create YAML files with type: journey in eval/fixtures/.")
|
||||
return
|
||||
|
||||
if args.fixture:
|
||||
journey_fixtures = [f for f in journey_fixtures if f.name == args.fixture]
|
||||
if not journey_fixtures:
|
||||
fixtures = [f for f in journey_fixtures if f.name == args.fixture]
|
||||
if not fixtures:
|
||||
print(f"Journey fixture '{args.fixture}' not found.")
|
||||
return
|
||||
fixture = fixtures[0]
|
||||
elif len(journey_fixtures) == 1:
|
||||
fixture = journey_fixtures[0]
|
||||
else:
|
||||
# Let user pick
|
||||
print("\nAvailable journey fixtures:")
|
||||
for i, f in enumerate(journey_fixtures, 1):
|
||||
print(f" {i}. {f.name} — {f.description[:60]}")
|
||||
print()
|
||||
try:
|
||||
choice = int(input("Pick a fixture number: ").strip()) - 1
|
||||
fixture = journey_fixtures[choice]
|
||||
except (ValueError, IndexError, EOFError, KeyboardInterrupt):
|
||||
print("Invalid choice.")
|
||||
return
|
||||
|
||||
models = [m.strip() for m in args.models.split(",")]
|
||||
|
||||
all_results = []
|
||||
for fixture in journey_fixtures:
|
||||
results = await run_journey_fixture_eval(
|
||||
fixture,
|
||||
models=models,
|
||||
judge_model=args.judge_model,
|
||||
)
|
||||
all_results.extend(results)
|
||||
|
||||
print_journey_results(all_results)
|
||||
await run_interactive(
|
||||
fixture,
|
||||
model=args.model,
|
||||
judge_model=args.judge_model,
|
||||
data_dir=Path(args.data_dir).resolve() if args.data_dir else None,
|
||||
)
|
||||
|
||||
|
||||
def main() -> None:
|
||||
@@ -249,8 +273,8 @@ def main() -> None:
|
||||
|
||||
if args.command == "run":
|
||||
asyncio.run(_cmd_run(args))
|
||||
elif args.command == "journey":
|
||||
asyncio.run(_cmd_journey(args))
|
||||
elif args.command == "interactive":
|
||||
asyncio.run(_cmd_interactive(args))
|
||||
elif args.command == "list":
|
||||
_cmd_list(args)
|
||||
elif args.command == "sync":
|
||||
|
||||
@@ -1,70 +1,16 @@
|
||||
"""Eval configuration — YAML fixture loader and dataclasses.
|
||||
|
||||
A *fixture* is a YAML file that defines a complete test scenario:
|
||||
Fixtures come in two families:
|
||||
|
||||
.. code-block:: yaml
|
||||
1. **Agent fixtures** — test the batch agent pipeline.
|
||||
Three modes controlled by ``mode``:
|
||||
|
||||
name: freelance-invoices
|
||||
description: Extract tasks and notes from invoice PDFs (text layer)
|
||||
directory: sample_files/invoices # relative to fixture dir
|
||||
data_types: [tasks, notes]
|
||||
file_extensions: [txt, md]
|
||||
``step1`` — classification prompt only.
|
||||
``step2`` — processing prompt only.
|
||||
``full`` — both steps in sequence.
|
||||
|
||||
# Preseeded records the agent "sees" as existing data
|
||||
seed_records:
|
||||
projects:
|
||||
- id: proj-1
|
||||
name: "Website Redesign"
|
||||
status: active
|
||||
tasks: []
|
||||
|
||||
# Prompt variations to test (at least one required)
|
||||
prompt_variants:
|
||||
baseline: |
|
||||
Extract action items as tasks and meeting summaries as notes.
|
||||
Set priority based on urgency keywords.
|
||||
detailed: |
|
||||
Extract action items as tasks. Map "URGENT" to high priority,
|
||||
"ASAP" to medium. Summaries become notes with full content.
|
||||
|
||||
# Expected extractions — what the agent SHOULD produce
|
||||
expected:
|
||||
tasks:
|
||||
- title: "Send revised invoice to client"
|
||||
priority: high
|
||||
status: todo
|
||||
- title: "Update project timeline"
|
||||
priority: medium
|
||||
notes:
|
||||
- title: "Meeting summary - March kickoff"
|
||||
|
||||
# Optional: models to test (overrides CLI --models)
|
||||
models: []
|
||||
|
||||
A *journey fixture* tests the prompt-template builder conversation:
|
||||
|
||||
.. code-block:: yaml
|
||||
|
||||
type: journey
|
||||
name: journey-invoices
|
||||
description: Test journey builds a good template for invoices
|
||||
directory: sample_files/invoices
|
||||
data_types: [tasks, notes]
|
||||
|
||||
# Simulated user responses for multi-turn conversation
|
||||
user_messages:
|
||||
- "I want to extract action items and meeting summaries"
|
||||
- "Yes, map URGENTE to high priority"
|
||||
- "That looks good, generate the template"
|
||||
|
||||
# Criteria the generated prompt_template should satisfy
|
||||
expected_template_criteria:
|
||||
- "mentions tasks and notes as target entities"
|
||||
- "includes priority mapping rules"
|
||||
- "references isAiSuggested=1"
|
||||
- "does not mention projectId"
|
||||
|
||||
models: []
|
||||
2. **Journey fixtures** — test the prompt-template builder conversation
|
||||
(unchanged).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
@@ -72,12 +18,14 @@ from __future__ import annotations
|
||||
import logging
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
from typing import Any, Literal
|
||||
|
||||
import yaml
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
EvalMode = Literal["step1", "step2", "full"]
|
||||
|
||||
|
||||
@dataclass
|
||||
class ExpectedRecord:
|
||||
@@ -90,21 +38,52 @@ class ExpectedRecord:
|
||||
fields: dict[str, Any] # field_name → expected_value
|
||||
|
||||
|
||||
@dataclass
|
||||
class ExpectedClassification:
|
||||
"""Expected output of step-1 classification for one file."""
|
||||
|
||||
file: str # relative path to the sample file
|
||||
project_id: str # expected matched project id, or "new"
|
||||
domains: list[str] # expected domain list
|
||||
new_project_name: str | None = None
|
||||
|
||||
|
||||
@dataclass
|
||||
class EvalFixture:
|
||||
"""A complete test scenario loaded from YAML."""
|
||||
"""A complete test scenario loaded from YAML.
|
||||
|
||||
``mode`` determines which pipeline steps are exercised:
|
||||
|
||||
- **step1**: only ``_classify_file``
|
||||
- **step2**: only the processing LLM + tool loop
|
||||
- **full**: both steps in sequence (``run_local_agent``)
|
||||
"""
|
||||
|
||||
name: str
|
||||
description: str
|
||||
mode: EvalMode
|
||||
directory: str # relative path to sample files
|
||||
data_types: list[str]
|
||||
file_extensions: list[str]
|
||||
seed_records: dict[str, list[dict]]
|
||||
prompt_variants: dict[str, str] # variant_name → prompt_template
|
||||
expected: list[ExpectedRecord]
|
||||
models: list[str] # if empty, use CLI default
|
||||
fixture_path: Path = field(default_factory=lambda: Path("."))
|
||||
|
||||
# ── Step-1 inputs (classification) ───────────────────────────
|
||||
domain_definitions: str = ""
|
||||
projects_list: list[dict[str, Any]] = field(default_factory=list)
|
||||
|
||||
# ── Step-2 inputs (processing) ───────────────────────────────
|
||||
existing_context: str = ""
|
||||
project_context: str = ""
|
||||
custom_prompt_section: str = ""
|
||||
|
||||
# ── Seed records for mock executor ───────────────────────────
|
||||
seed_records: dict[str, list[dict]] = field(default_factory=dict)
|
||||
|
||||
# ── Expected outputs ─────────────────────────────────────────
|
||||
expected_classification: list[ExpectedClassification] = field(default_factory=list)
|
||||
expected: list[ExpectedRecord] = field(default_factory=list)
|
||||
|
||||
@property
|
||||
def fixture_dir(self) -> Path:
|
||||
"""Absolute path to the sample files directory."""
|
||||
@@ -115,22 +94,44 @@ class EvalFixture:
|
||||
"""Load a fixture from a YAML file."""
|
||||
raw = yaml.safe_load(path.read_text(encoding="utf-8"))
|
||||
|
||||
mode: EvalMode = raw.get("mode", "full")
|
||||
|
||||
# Parse expected records (step2/full)
|
||||
expected: list[ExpectedRecord] = []
|
||||
for table, records in (raw.get("expected") or {}).items():
|
||||
for rec in records:
|
||||
expected.append(ExpectedRecord(table=table, fields=rec))
|
||||
|
||||
# Parse expected classification (step1/full)
|
||||
expected_classification: list[ExpectedClassification] = []
|
||||
for item in raw.get("expected_classification") or []:
|
||||
expected_classification.append(ExpectedClassification(
|
||||
file=item["file"],
|
||||
project_id=item["project_id"],
|
||||
domains=item.get("domains", []),
|
||||
new_project_name=item.get("new_project_name"),
|
||||
))
|
||||
|
||||
return cls(
|
||||
name=raw["name"],
|
||||
description=raw.get("description", ""),
|
||||
mode=mode,
|
||||
directory=raw.get("directory", "sample_files"),
|
||||
data_types=raw.get("data_types", ["tasks"]),
|
||||
file_extensions=raw.get("file_extensions", []),
|
||||
seed_records=raw.get("seed_records", {}),
|
||||
prompt_variants=raw.get("prompt_variants", {"default": ""}),
|
||||
expected=expected,
|
||||
models=raw.get("models", []),
|
||||
fixture_path=path,
|
||||
# Step-1 inputs
|
||||
domain_definitions=raw.get("domain_definitions", ""),
|
||||
projects_list=raw.get("projects_list", []),
|
||||
# Step-2 inputs
|
||||
existing_context=raw.get("existing_context", ""),
|
||||
project_context=raw.get("project_context", ""),
|
||||
custom_prompt_section=raw.get("custom_prompt_section", ""),
|
||||
# Shared
|
||||
seed_records=raw.get("seed_records", {}),
|
||||
expected_classification=expected_classification,
|
||||
expected=expected,
|
||||
)
|
||||
|
||||
|
||||
@@ -168,9 +169,9 @@ class JourneyFixture:
|
||||
description: str
|
||||
directory: str # relative path to sample files
|
||||
data_types: list[str]
|
||||
user_messages: list[str] # simulated user responses
|
||||
expected_template_criteria: list[str] # what the template should contain/satisfy
|
||||
models: list[str]
|
||||
user_messages: list[str] = field(default_factory=list) # for automated journey runs (unused in interactive mode)
|
||||
models: list[str] = field(default_factory=list)
|
||||
fixture_path: Path = field(default_factory=lambda: Path("."))
|
||||
|
||||
@property
|
||||
|
||||
40
services/batch-agent/eval/fixtures/classify_invoices.yaml
Normal file
40
services/batch-agent/eval/fixtures/classify_invoices.yaml
Normal file
@@ -0,0 +1,40 @@
|
||||
# Fixture: classify-invoices (step1)
|
||||
# Tests _STEP1_SYSTEM_PROMPT — file classification and project matching.
|
||||
# Verifies that the LLM correctly matches files to existing projects
|
||||
# and identifies the right data domains.
|
||||
|
||||
name: classify-invoices
|
||||
mode: step1
|
||||
description: >
|
||||
Test file classification on Italian freelance invoices and meeting notes.
|
||||
Verifies project matching and domain identification.
|
||||
|
||||
directory: sample_files/invoices
|
||||
data_types: [tasks, notes, timelines]
|
||||
file_extensions: [txt, md]
|
||||
|
||||
# ── Step-1 prompt variables ──────────────────────────────────────
|
||||
domain_definitions: |
|
||||
- tasks: Action items, deliverables, things to do — anything that someone needs to complete.
|
||||
- notes: Meeting summaries, decisions, reference information — permanent knowledge entries.
|
||||
- timelines: Project milestones, deadlines, scheduled events — specific dates that mark a point in the progress of a project.
|
||||
|
||||
projects_list:
|
||||
- id: "proj-web-redesign"
|
||||
name: "Redesign Sito Web Corporate"
|
||||
status: "active"
|
||||
aiSummary: "Corporate website redesign for Studio Architettura Bianchi"
|
||||
- id: "proj-ecommerce"
|
||||
name: "E-Commerce FashionStore"
|
||||
status: "active"
|
||||
aiSummary: "Next.js e-commerce platform for FashionStore srl"
|
||||
|
||||
# ── Expected classification results ─────────────────────────────
|
||||
expected_classification:
|
||||
- file: "sample_files/invoices/fattura_042.txt"
|
||||
project_id: "proj-web-redesign"
|
||||
domains: [tasks, notes, timelines]
|
||||
|
||||
- file: "sample_files/invoices/meeting_ecommerce.md"
|
||||
project_id: "proj-ecommerce"
|
||||
domains: [tasks, notes, timelines]
|
||||
@@ -1,86 +0,0 @@
|
||||
# Fixture: freelance-invoices
|
||||
# Tests extraction of tasks, notes, and timelines from
|
||||
# invoices and meeting notes typical of a freelance workflow.
|
||||
|
||||
name: freelance-invoices
|
||||
description: >
|
||||
Extract tasks, notes, and timeline events from Italian freelance
|
||||
invoices and meeting notes. Tests project matching, priority
|
||||
mapping, and bilingual content handling.
|
||||
|
||||
directory: sample_files/invoices
|
||||
data_types: [tasks, notes, timelines]
|
||||
file_extensions: [txt, md]
|
||||
|
||||
# Pre-existing records in the "database"
|
||||
seed_records:
|
||||
projects:
|
||||
- id: "proj-web-redesign"
|
||||
name: "Redesign Sito Web Corporate"
|
||||
status: "active"
|
||||
aiSummary: "Corporate website redesign for Studio Architettura Bianchi"
|
||||
- id: "proj-ecommerce"
|
||||
name: "E-Commerce FashionStore"
|
||||
status: "active"
|
||||
aiSummary: "Next.js e-commerce platform for FashionStore srl"
|
||||
tasks: []
|
||||
notes: []
|
||||
timelines: []
|
||||
|
||||
# Prompt variations to compare
|
||||
prompt_variants:
|
||||
baseline: |
|
||||
Extract action items as tasks and summaries as notes.
|
||||
For timelines, extract any mentioned dates and deadlines.
|
||||
Set isAiSuggested=1 on every record.
|
||||
|
||||
detailed_italian: |
|
||||
Estrai i dati dai file come segue:
|
||||
- TASK: ogni azione da fare, deliverable, o item con scadenza.
|
||||
Mappa "URGENTE" o "ALTA PRIORITÀ" → priority: high.
|
||||
Mappa "media priorità" → priority: medium.
|
||||
Mappa "bassa priorità" → priority: low.
|
||||
Se un item è marcato come "completato" o [x], impostalo status: done.
|
||||
Altrimenti status: todo.
|
||||
- NOTE: riassunti di meeting, decisioni prese, note tecniche.
|
||||
Il titolo deve essere descrittivo. Il content deve includere tutti i dettagli.
|
||||
- TIMELINE: date di scadenza, milestone, meeting futuri.
|
||||
Formato data: timestamp Unix in millisecondi.
|
||||
Imposta sempre isAiSuggested=1.
|
||||
|
||||
minimal: |
|
||||
Extract only high-priority action items as tasks.
|
||||
Ignore notes and timelines unless explicitly marked as important.
|
||||
Set isAiSuggested=1.
|
||||
|
||||
# Expected extractions (what the agent SHOULD produce)
|
||||
# Only key fields are specified — scorer uses fuzzy matching
|
||||
expected:
|
||||
tasks:
|
||||
- title: "Sviluppo frontend React"
|
||||
priority: "high"
|
||||
status: "todo"
|
||||
- title: "Integrazione API backend"
|
||||
priority: "medium"
|
||||
status: "todo"
|
||||
- title: "Testing cross-browser e fix bug responsive"
|
||||
status: "todo"
|
||||
- title: "Preparare wireframe homepage"
|
||||
priority: "high"
|
||||
status: "todo"
|
||||
- title: "Setup progetto Next.js e configurare CI/CD"
|
||||
priority: "medium"
|
||||
status: "todo"
|
||||
- title: "Ricerca plugin Stripe per gestione abbonamenti"
|
||||
priority: "low"
|
||||
status: "todo"
|
||||
|
||||
notes:
|
||||
- title: "Meeting Kickoff Progetto E-Commerce"
|
||||
|
||||
timelines:
|
||||
- title: "MVP E-Commerce pronto"
|
||||
- title: "Meeting di revisione"
|
||||
|
||||
# Models to test (can be overridden via CLI --models)
|
||||
models: []
|
||||
108
services/batch-agent/eval/fixtures/full_invoices.yaml
Normal file
108
services/batch-agent/eval/fixtures/full_invoices.yaml
Normal file
@@ -0,0 +1,108 @@
|
||||
# Fixture: full-invoices (full)
|
||||
# Tests both _STEP1_SYSTEM_PROMPT and _PROCESSING_SYSTEM_PROMPT in sequence
|
||||
# via run_local_agent(). Verifies end-to-end classification + extraction.
|
||||
|
||||
name: full-invoices
|
||||
mode: full
|
||||
description: >
|
||||
End-to-end test: classify Italian invoices/meeting notes into the
|
||||
correct project, then extract tasks, notes, and timeline events.
|
||||
|
||||
directory: sample_files/invoices
|
||||
data_types: [tasks, notes, timelines]
|
||||
file_extensions: [txt, md]
|
||||
|
||||
# ── Step-1 prompt variables ──────────────────────────────────────
|
||||
domain_definitions: |
|
||||
- tasks: Action items, deliverables, things to do — anything that someone needs to complete.
|
||||
- notes: Meeting summaries, decisions, reference information — permanent knowledge entries.
|
||||
- timelines: Project milestones, deadlines, scheduled events — specific dates that mark a point in the progress of a project.
|
||||
|
||||
projects_list:
|
||||
- id: "proj-web-redesign"
|
||||
name: "Redesign Sito Web Corporate"
|
||||
status: "active"
|
||||
aiSummary: "Corporate website redesign for Studio Architettura Bianchi"
|
||||
- id: "proj-ecommerce"
|
||||
name: "E-Commerce FashionStore"
|
||||
status: "active"
|
||||
aiSummary: "Next.js e-commerce platform for FashionStore srl"
|
||||
|
||||
# ── Step-2 prompt variables ──────────────────────────────────────
|
||||
existing_context: |
|
||||
Existing tasks:
|
||||
(none)
|
||||
|
||||
Existing notes:
|
||||
(none)
|
||||
|
||||
Existing timelines:
|
||||
(none)
|
||||
|
||||
project_context: ""
|
||||
|
||||
custom_prompt_section: |
|
||||
User instructions:
|
||||
Estrai i dati dai file come segue:
|
||||
- TASK: ogni azione da fare, deliverable, o item con scadenza.
|
||||
Mappa "URGENTE" o "ALTA PRIORITÀ" → priority: high.
|
||||
Mappa "media priorità" → priority: medium.
|
||||
Mappa "bassa priorità" → priority: low.
|
||||
Se un item è marcato come "completato" o [x], impostalo status: done.
|
||||
Altrimenti status: todo.
|
||||
- NOTE: riassunti di meeting, decisioni prese, note tecniche.
|
||||
- TIMELINE: date di scadenza, milestone, meeting futuri.
|
||||
Imposta sempre isAiSuggested=1.
|
||||
|
||||
# ── Seed records (pre-existing DB state) ─────────────────────────
|
||||
seed_records:
|
||||
projects:
|
||||
- id: "proj-web-redesign"
|
||||
name: "Redesign Sito Web Corporate"
|
||||
status: "active"
|
||||
aiSummary: "Corporate website redesign for Studio Architettura Bianchi"
|
||||
- id: "proj-ecommerce"
|
||||
name: "E-Commerce FashionStore"
|
||||
status: "active"
|
||||
aiSummary: "Next.js e-commerce platform for FashionStore srl"
|
||||
tasks: []
|
||||
notes: []
|
||||
timelines: []
|
||||
|
||||
# ── Expected classification (step 1) ─────────────────────────────
|
||||
expected_classification:
|
||||
- file: "sample_files/invoices/fattura_042.txt"
|
||||
project_id: "proj-web-redesign"
|
||||
domains: [tasks, notes, timelines]
|
||||
|
||||
- file: "sample_files/invoices/meeting_ecommerce.md"
|
||||
project_id: "proj-ecommerce"
|
||||
domains: [tasks, notes, timelines]
|
||||
|
||||
# ── Expected extractions (step 2) ────────────────────────────────
|
||||
expected:
|
||||
tasks:
|
||||
- title: "Sviluppo frontend React"
|
||||
priority: "high"
|
||||
status: "todo"
|
||||
- title: "Integrazione API backend"
|
||||
priority: "medium"
|
||||
status: "todo"
|
||||
- title: "Testing cross-browser e fix bug responsive"
|
||||
status: "todo"
|
||||
- title: "Preparare wireframe homepage"
|
||||
priority: "high"
|
||||
status: "todo"
|
||||
- title: "Setup progetto Next.js e configurare CI/CD"
|
||||
priority: "medium"
|
||||
status: "todo"
|
||||
- title: "Ricerca plugin Stripe per gestione abbonamenti"
|
||||
priority: "low"
|
||||
status: "todo"
|
||||
|
||||
notes:
|
||||
- title: "Meeting Kickoff Progetto E-Commerce"
|
||||
|
||||
timelines:
|
||||
- title: "MVP E-Commerce pronto"
|
||||
- title: "Meeting di revisione"
|
||||
@@ -1,43 +1,25 @@
|
||||
# Journey Fixture: journey-invoice-setup
|
||||
# Tests that the journey chatbot correctly builds a prompt_template
|
||||
# for extracting tasks and notes from Italian invoices and meeting notes.
|
||||
# Used by `python -m eval interactive` for human-in-the-loop testing
|
||||
# of the journey chatbot's prompt-building conversation.
|
||||
|
||||
type: journey
|
||||
name: journey-invoice-setup
|
||||
description: >
|
||||
Test the journey chatbot's ability to explore a directory of Italian
|
||||
invoices and meeting notes, ask relevant questions, and produce a
|
||||
well-structured prompt_template for data extraction.
|
||||
Interactive test for the journey chatbot — explore a directory of
|
||||
Italian invoices and meeting notes, answer the chatbot's questions,
|
||||
and verify it produces a well-structured prompt_template for data
|
||||
extraction.
|
||||
|
||||
directory: sample_files/invoices
|
||||
data_types: [tasks, notes, timelines]
|
||||
|
||||
# Simulated user responses (the journey starts with the LLM exploring
|
||||
# the directory and asking its first question)
|
||||
user_messages:
|
||||
- >
|
||||
I want to extract action items from invoices and meeting notes.
|
||||
The invoices are in Italian and contain work descriptions with
|
||||
deadlines. Meeting notes have action items with checkboxes.
|
||||
- >
|
||||
Yes, map Italian priority keywords: "URGENTE" and "ALTA PRIORITÀ"
|
||||
should be high priority, "media priorità" is medium, "bassa priorità"
|
||||
is low. Items marked with [x] are already completed.
|
||||
- >
|
||||
For notes, I want meeting summaries with the full content including
|
||||
decisions and attendees. For timelines, extract deadlines and
|
||||
scheduled meeting dates.
|
||||
- >
|
||||
That's everything I need. Please generate the template.
|
||||
data_types: [tasks, notes, timelines, projects]
|
||||
|
||||
# Criteria the generated prompt_template must satisfy
|
||||
# Each is scored 0-1 by an LLM judge
|
||||
expected_template_criteria:
|
||||
- "Mentions creating tasks from action items and work descriptions"
|
||||
- "Includes Italian priority keyword mapping (URGENTE→high, media priorità→medium, bassa priorità→low)"
|
||||
- "Handles completed items marked with [x] as status done"
|
||||
- "Mentions creating notes from meeting summaries"
|
||||
- "Mentions extracting timeline events from deadlines and meeting dates"
|
||||
- "Mentions creating projects from relevant information"
|
||||
- "Sets isAiSuggested=1 on all created records"
|
||||
- "Does NOT include projectId assignment logic"
|
||||
- "Uses camelCase field names (title, status, priority, dueDate, content)"
|
||||
|
||||
81
services/batch-agent/eval/fixtures/process_invoices.yaml
Normal file
81
services/batch-agent/eval/fixtures/process_invoices.yaml
Normal file
@@ -0,0 +1,81 @@
|
||||
# Fixture: process-invoices (step2)
|
||||
# Tests _PROCESSING_SYSTEM_PROMPT — data extraction & tool calling.
|
||||
# The classification step is skipped; prompt variables are injected directly.
|
||||
|
||||
name: process-invoices
|
||||
mode: step2
|
||||
description: >
|
||||
Test data extraction from Italian freelance invoices.
|
||||
Verifies correct record creation via tool calls with the right
|
||||
fields, priorities, and status values.
|
||||
|
||||
directory: sample_files/invoices
|
||||
data_types: [tasks, notes, timelines]
|
||||
file_extensions: [txt, md]
|
||||
|
||||
# ── Step-2 prompt variables ──────────────────────────────────────
|
||||
existing_context: |
|
||||
Existing tasks:
|
||||
(none)
|
||||
|
||||
Existing notes:
|
||||
(none)
|
||||
|
||||
Existing timelines:
|
||||
(none)
|
||||
|
||||
project_context: >
|
||||
Project: Redesign Sito Web Corporate (id: proj-web-redesign).
|
||||
Always set projectId to this id on every record you create.
|
||||
|
||||
custom_prompt_section: |
|
||||
User instructions:
|
||||
Estrai i dati dai file come segue:
|
||||
- TASK: ogni azione da fare, deliverable, o item con scadenza.
|
||||
Mappa "URGENTE" o "ALTA PRIORITÀ" → priority: high.
|
||||
Mappa "media priorità" → priority: medium.
|
||||
Mappa "bassa priorità" → priority: low.
|
||||
Se un item è marcato come "completato" o [x], impostalo status: done.
|
||||
Altrimenti status: todo.
|
||||
- NOTE: riassunti di meeting, decisioni prese, note tecniche.
|
||||
Il titolo deve essere descrittivo. Il content deve includere tutti i dettagli.
|
||||
- TIMELINE: date di scadenza, milestone, meeting futuri.
|
||||
Imposta sempre isAiSuggested=1.
|
||||
|
||||
# ── Seed records (pre-existing DB state) ─────────────────────────
|
||||
seed_records:
|
||||
projects:
|
||||
- id: "proj-web-redesign"
|
||||
name: "Redesign Sito Web Corporate"
|
||||
status: "active"
|
||||
tasks: []
|
||||
notes: []
|
||||
timelines: []
|
||||
|
||||
# ── Expected extractions ─────────────────────────────────────────
|
||||
expected:
|
||||
tasks:
|
||||
- title: "Sviluppo frontend React"
|
||||
priority: "high"
|
||||
status: "todo"
|
||||
- title: "Integrazione API backend"
|
||||
priority: "medium"
|
||||
status: "todo"
|
||||
- title: "Testing cross-browser e fix bug responsive"
|
||||
status: "todo"
|
||||
- title: "Preparare wireframe homepage"
|
||||
priority: "high"
|
||||
status: "todo"
|
||||
- title: "Setup progetto Next.js e configurare CI/CD"
|
||||
priority: "medium"
|
||||
status: "todo"
|
||||
- title: "Ricerca plugin Stripe per gestione abbonamenti"
|
||||
priority: "low"
|
||||
status: "todo"
|
||||
|
||||
notes:
|
||||
- title: "Meeting Kickoff Progetto E-Commerce"
|
||||
|
||||
timelines:
|
||||
- title: "MVP E-Commerce pronto"
|
||||
- title: "Meeting di revisione"
|
||||
471
services/batch-agent/eval/interactive.py
Normal file
471
services/batch-agent/eval/interactive.py
Normal file
@@ -0,0 +1,471 @@
|
||||
"""Interactive journey session — human-in-the-loop CLI conversation.
|
||||
|
||||
Flow:
|
||||
1. Show the system prompt used by the journey AI.
|
||||
2. Start the journey (AI explores files, asks first question).
|
||||
3. User types responses in the terminal — AI replies.
|
||||
4. User types `/done` to end the conversation.
|
||||
5. User writes a comment about the interaction quality.
|
||||
6. LLM judge scores the conversation + generated template.
|
||||
7. Results are reported to Langfuse.
|
||||
|
||||
Usage::
|
||||
|
||||
python -m eval interactive # pick a fixture interactively
|
||||
python -m eval interactive --fixture=journey-invoice-setup
|
||||
python -m eval interactive --model=gpt-4o
|
||||
python -m eval interactive --judge-model=github_copilot/gpt-4o-mini
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
import json
import logging
import sys
import time
import uuid
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any

from langchain_core.messages import HumanMessage, SystemMessage

from eval import langfuse_eval
from eval.config import JourneyFixture, discover_journey_fixtures
from eval.mock_executor import MockExecutor
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ── Special commands ─────────────────────────────────────────────────────
|
||||
|
||||
_CMD_DONE = "/done"
|
||||
_CMD_QUIT = "/quit"
|
||||
_CMD_TEMPLATE = "/template"
|
||||
_CMD_HELP = "/help"
|
||||
|
||||
_HELP_TEXT = f"""\
|
||||
{_CMD_DONE} — End the conversation and proceed to evaluation
|
||||
{_CMD_QUIT} — Abort without evaluation
|
||||
{_CMD_TEMPLATE} — Show the generated template (if any)
|
||||
{_CMD_HELP} — Show this help"""
|
||||
|
||||
# ── Terminal colours (ANSI) ──────────────────────────────────────────────
|
||||
|
||||
_C_RESET = "\033[0m"
|
||||
_C_BOLD = "\033[1m"
|
||||
_C_DIM = "\033[2m"
|
||||
_C_CYAN = "\033[36m"
|
||||
_C_GREEN = "\033[32m"
|
||||
_C_YELLOW = "\033[33m"
|
||||
_C_MAGENTA = "\033[35m"
|
||||
_C_RED = "\033[31m"
|
||||
_C_BLUE = "\033[34m"
|
||||
|
||||
|
||||
def _print_header(text: str) -> None:
    """Print *text* as a bold cyan banner framed by two 80-column rules."""
    rule = "═" * 80
    banner = "\n".join(
        (
            f"\n{_C_BOLD}{_C_CYAN}{rule}",
            f" {text}",
            f"{rule}{_C_RESET}\n",
        )
    )
    print(banner)
|
||||
|
||||
|
||||
def _print_ai(text: str) -> None:
    """Print an assistant reply, prefixed with a bold green ``AI:`` tag."""
    speaker = f"{_C_GREEN}{_C_BOLD}AI:{_C_RESET}"
    print(f"\n{speaker} {text}\n")
|
||||
|
||||
|
||||
def _print_system(text: str) -> None:
    """Print a harness status line in dim text."""
    dimmed = f"{_C_DIM}{text}{_C_RESET}"
    print(dimmed)
|
||||
|
||||
|
||||
def _print_score(label: str, score: float) -> None:
    """Print one criterion score with a coloured PASS / PARTIAL / FAIL tag.

    Scores of at least 0.7 are tagged PASS, at least 0.4 PARTIAL, and
    anything lower FAIL.
    """
    # Bands are checked from the highest floor down; the first match wins.
    bands = (
        (0.7, _C_GREEN, "PASS"),
        (0.4, _C_YELLOW, "PARTIAL"),
    )
    color, tag = _C_RED, "FAIL"
    for floor, band_color, band_tag in bands:
        if score >= floor:
            color, tag = band_color, band_tag
            break
    print(f" {color}{tag:>7}{_C_RESET} ({score:.1f}) {label}")
|
||||
|
||||
|
||||
# ── Result type ──────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
@dataclass
class InteractiveResult:
    """Outcome of one interactive journey session, including judge scores."""

    fixture_name: str
    model: str
    judge_model: str
    prompt_template: str | None
    conversation: list[dict[str, str]]
    user_comment: str
    done: bool
    criteria_scores: dict[str, float]
    overall_score: float
    judge_reasoning: str
    elapsed_seconds: float

    def summary(self) -> dict[str, Any]:
        """Return a compact dict of the result with rounded numeric values.

        ``turns`` counts only the entries in ``conversation`` whose role is
        ``"user"``.
        """
        user_turns = sum(1 for turn in self.conversation if turn["role"] == "user")
        rounded_criteria = {
            name: round(value, 3) for name, value in self.criteria_scores.items()
        }
        report: dict[str, Any] = {
            "fixture": self.fixture_name,
            "model": self.model,
            "judge_model": self.judge_model,
            "done": self.done,
            "turns": user_turns,
            "overall_score": round(self.overall_score, 3),
            "user_comment": self.user_comment,
            "criteria_scores": rounded_criteria,
            "elapsed_s": round(self.elapsed_seconds, 1),
        }
        return report
|
||||
|
||||
|
||||
# ── LLM judge ────────────────────────────────────────────────────────────
|
||||
|
||||
_INTERACTIVE_JUDGE_SYSTEM = """\
|
||||
You are an evaluation judge for AI-generated prompt templates produced during
|
||||
an interactive conversation between a human and a journey chatbot.
|
||||
|
||||
The chatbot explored a directory and through multi-turn conversation with the
|
||||
user produced a prompt_template — an instruction set for a data-extraction agent.
|
||||
|
||||
You have access to:
|
||||
- The full conversation transcript
|
||||
- The generated prompt_template (if any)
|
||||
- The user's own comment about the interaction
|
||||
- A list of quality criteria
|
||||
|
||||
Score each criterion from 0 to 1:
|
||||
- 1.0: Fully satisfied
|
||||
- 0.5: Partially satisfied
|
||||
- 0.0: Not satisfied
|
||||
|
||||
Also provide an overall_quality score (0-1) evaluating the conversation flow,
|
||||
how well the AI understood the user, and the template quality.
|
||||
|
||||
Respond with ONLY a JSON object:
|
||||
{
|
||||
"criteria_scores": {"criterion_1": 0.8, ...},
|
||||
"overall_quality": 0.85,
|
||||
"reasoning": "Brief explanation covering both conversation quality and template accuracy"
|
||||
}
|
||||
"""
|
||||
|
||||
|
||||
def _parse_judge_json(raw: str) -> dict[str, Any]:
    """Decode the judge's reply into a dict, tolerating a Markdown code fence.

    Raises ``ValueError`` (including ``json.JSONDecodeError``) when the reply
    is not a JSON object.
    """
    text = raw.strip()
    if text.startswith("```"):
        # Keep only what sits between the first pair of fences, then drop an
        # optional "json" language tag.
        text = text.split("```")[1]
        if text.startswith("json"):
            text = text[4:]
    parsed = json.loads(text.strip())
    if not isinstance(parsed, dict):
        raise ValueError(f"judge reply is not a JSON object: {type(parsed).__name__}")
    return parsed


def _match_criteria_scores(criteria: list[str], scores_raw: Any) -> dict[str, float]:
    """Map the judge's raw scores back onto the criteria, in criteria order.

    For each criterion the lookup tries, in order, the keys
    ``criterion_<n>``, the full criterion text, its first 50 characters and
    ``"<n>"``. When none is present the score is taken positionally from the
    raw values; when that is impossible too, the score is 0.0.

    A score that was found by key is kept even when it is 0.0 — it must not
    be replaced by the positional value.
    """
    if isinstance(scores_raw, dict):
        by_key: dict[Any, Any] = scores_raw
        positional = list(scores_raw.values())
    elif isinstance(scores_raw, (list, tuple)):
        # Some judges answer with a bare list; treat it as positional scores.
        by_key = {}
        positional = list(scores_raw)
    else:
        by_key = {}
        positional = []

    matched: dict[str, float] = {}
    for index, criterion in enumerate(criteria):
        candidates = (
            f"criterion_{index + 1}",
            criterion,
            criterion[:50],
            str(index + 1),
        )
        found_key = next((key for key in candidates if key in by_key), None)
        if found_key is not None:
            value = by_key[found_key]
        elif index < len(positional):
            value = positional[index]
        else:
            value = 0.0
        matched[criterion] = float(value)
    return matched


async def _judge_interactive(
    conversation: list[dict[str, str]],
    prompt_template: str | None,
    user_comment: str,
    criteria: list[str],
    *,
    judge_model: str = "gpt-4o-mini",
) -> tuple[dict[str, float], float, str]:
    """Score an interactive session with an LLM judge.

    Parameters
    ----------
    conversation :
        Transcript entries with ``role`` and ``content`` keys; any role other
        than ``"user"`` is rendered as ``AI``.
    prompt_template :
        The template produced by the journey, or ``None`` if none was generated.
    user_comment :
        The human's free-form comment about the session.
    criteria :
        Quality criteria the judge must score.
    judge_model :
        Model name passed to ``get_llm``.

    Returns
    -------
    ``(criteria_scores, overall_quality, reasoning)``. Any failure while
    calling the judge or parsing its reply is logged and reported as all-zero
    scores with a ``"Judge error: …"`` reasoning string, so the session result
    is still produced.
    """
    from shared.llm import get_llm

    llm = get_llm(model=judge_model, temperature=0)

    conv_text = "\n".join(
        f"{'USER' if t['role'] == 'user' else 'AI'}: {t['content']}"
        for t in conversation
    )
    criteria_text = "\n".join(f" {i+1}. {c}" for i, c in enumerate(criteria))

    user_content = (
        f"## Conversation transcript\n```\n{conv_text}\n```\n\n"
        f"## Generated prompt_template\n```\n{prompt_template or '(none — conversation did not complete)'}\n```\n\n"
        f"## User's comment\n{user_comment}\n\n"
        f"## Criteria to evaluate\n{criteria_text}"
    )

    try:
        response = await llm.ainvoke([
            SystemMessage(content=_INTERACTIVE_JUDGE_SYSTEM),
            HumanMessage(content=user_content),
        ])
        parsed = _parse_judge_json(response.content)

        # Accept "scores" as an alternative key for the per-criterion scores.
        scores_raw = parsed.get("criteria_scores", parsed.get("scores", {}))
        criteria_scores = _match_criteria_scores(criteria, scores_raw)

        overall = float(parsed.get("overall_quality", 0.0))
        reasoning = str(parsed.get("reasoning", ""))
        return criteria_scores, overall, reasoning

    except Exception as exc:
        # Deliberate best-effort boundary: a broken judge must not lose the session.
        logger.warning("interactive judge failed: %s", exc)
        return {c: 0.0 for c in criteria}, 0.0, f"Judge error: {exc}"
|
||||
|
||||
|
||||
# ── Interactive session ──────────────────────────────────────────────────
|
||||
|
||||
|
||||
def _read_multiline_comment() -> str:
    """Read a free-form comment from stdin, ended by two consecutive blank lines.

    EOF or Ctrl-C also ends the input. Returns ``"(no comment)"`` when nothing
    but whitespace was entered.
    """
    lines: list[str] = []
    try:
        while True:
            line = input()
            if line == "" and lines and lines[-1] == "":
                lines.pop()  # remove trailing empty
                break
            lines.append(line)
    except (EOFError, KeyboardInterrupt):
        # Ending the comment early is fine; keep whatever was typed so far.
        pass
    return "\n".join(lines).strip() or "(no comment)"


async def run_interactive(
    fixture: JourneyFixture,
    *,
    model: str = "gpt-4o",
    judge_model: str = "gpt-4o-mini",
    data_dir: Path | None = None,
) -> InteractiveResult:
    """Run an interactive journey session in the terminal.

    The session shows the system prompt, starts the journey, relays the
    user's typed messages until the journey reports completion or the user
    types ``/done`` / ``/quit``, collects a comment, asks the LLM judge for
    scores and reports the outcome to Langfuse.

    Parameters
    ----------
    fixture :
        Journey fixture providing the directory, data types and criteria.
    model :
        Model used for the journey; ``settings.LLM_MODEL`` is overridden with
        it for the duration of the conversation and always restored.
    judge_model :
        Model used by the LLM judge.
    data_dir :
        If set, overrides the fixture's sample-file directory.  The LLM
        will explore this folder instead of the default
        ``fixtures/sample_files/…``.  Useful for private test data that
        shouldn't be committed to git.

    Returns
    -------
    The session result. When the user aborts with ``/quit`` (or EOF/Ctrl-C at
    the prompt) no comment is requested, no judge is run, nothing is reported
    to Langfuse, and the result carries empty scores.
    """
    from shared.config import settings
    from shared.ws_context import set_current_user, clear_current_user
    from app.journey import (
        handle_journey_start,
        handle_journey_message,
        _build_system_prompt,
    )

    # When --data-dir is given, the MockExecutor's root becomes
    # data_dir's parent and the journey directory is data_dir's name.
    # This way the LLM sees a meaningful directory name (not ".") and
    # MockExecutor resolves paths correctly.
    # Otherwise, use the fixture's YAML parent and its relative path.
    if data_dir:
        mock_root = data_dir.parent
        journey_directory = data_dir.name
    else:
        mock_root = fixture.fixture_path.parent
        journey_directory = fixture.directory

    mock = MockExecutor(
        fixture_dir=mock_root,
        seed_records={},
    )

    eval_user_id = f"interactive-{uuid.uuid4().hex[:8]}"
    conversation: list[dict[str, str]] = []
    prompt_template: str | None = None
    done = False
    aborted = False

    original_model = settings.LLM_MODEL
    settings.LLM_MODEL = model
    # The outer try/finally guarantees the global model override is undone
    # even if building or printing the system prompt fails.
    try:
        # ── Show system prompt ───────────────────────────────────────
        system_prompt = _build_system_prompt(journey_directory, fixture.data_types)

        _print_header("SYSTEM PROMPT")
        print(f"{_C_DIM}{system_prompt}{_C_RESET}")

        _print_header(f"INTERACTIVE JOURNEY | fixture: {fixture.name} | model: {model}")
        print(f" Data dir: {mock_root}")
        print(f" Type your responses. Commands: {_CMD_DONE}, {_CMD_QUIT}, {_CMD_TEMPLATE}, {_CMD_HELP}")
        print(f" Judge model: {judge_model}")
        print(f" Criteria: {len(fixture.expected_template_criteria)}")
        print()

        start_time = time.time()

        try:
            set_current_user(eval_user_id)

            with mock.patch():
                # ── Start ────────────────────────────────────────────
                _print_system("Starting journey... (AI is exploring your files)")

                start_frame: dict[str, Any] = {
                    "agent_type": "local",
                    "directory": journey_directory,
                    "data_types": fixture.data_types,
                    "session_id": f"interactive-{uuid.uuid4().hex[:8]}",
                }

                reply = await handle_journey_start(eval_user_id, start_frame)
                session_id = reply["session_id"]
                conversation.append({"role": "assistant", "content": reply["message"]})
                _print_ai(reply["message"])

                if reply["done"]:
                    prompt_template = reply.get("prompt_template")
                    done = True
                    _print_system("Journey completed on first reply (template generated).")

                # ── Conversation loop ────────────────────────────────
                while not done:
                    try:
                        user_input = input(f"{_C_BOLD}{_C_BLUE}YOU:{_C_RESET} ").strip()
                    except (EOFError, KeyboardInterrupt):
                        # Closing stdin or pressing Ctrl-C counts as /quit.
                        print()
                        user_input = _CMD_QUIT

                    if not user_input:
                        continue

                    # Handle commands
                    command = user_input.lower()

                    if command == _CMD_QUIT:
                        _print_system("Aborted — no evaluation will be performed.")
                        # Cleanup happens once, in the finally blocks below.
                        aborted = True
                        break

                    if command == _CMD_HELP:
                        print(_HELP_TEXT)
                        continue

                    if command == _CMD_TEMPLATE:
                        if prompt_template:
                            print(f"\n{_C_MAGENTA}{prompt_template}{_C_RESET}\n")
                        else:
                            _print_system("No template generated yet.")
                        continue

                    if command == _CMD_DONE:
                        _print_system("Ending conversation...")
                        break

                    # ── Send message to AI ───────────────────────────
                    conversation.append({"role": "user", "content": user_input})
                    _print_system("AI is thinking...")

                    msg_frame: dict[str, Any] = {
                        "session_id": session_id,
                        "message": user_input,
                    }
                    reply = await handle_journey_message(eval_user_id, msg_frame)
                    conversation.append({"role": "assistant", "content": reply["message"]})
                    _print_ai(reply["message"])

                    if reply["done"]:
                        prompt_template = reply.get("prompt_template")
                        done = True
                        _print_system("Journey completed — template generated!")

        except Exception as exc:
            # Best-effort: a failed journey is still commented on and judged below.
            logger.error("interactive journey failed: %s", exc)
            _print_system(f"Error: {exc}")
        finally:
            clear_current_user()
    finally:
        settings.LLM_MODEL = original_model

    elapsed = time.time() - start_time

    if aborted:
        return InteractiveResult(
            fixture_name=fixture.name, model=model, judge_model=judge_model,
            prompt_template=None, conversation=conversation,
            user_comment="(aborted)", done=False,
            criteria_scores={}, overall_score=0.0,
            judge_reasoning="Session aborted by user.",
            elapsed_seconds=elapsed,
        )

    turns = len([c for c in conversation if c["role"] == "user"])

    # ── Show template if generated ───────────────────────────────
    if prompt_template:
        _print_header("GENERATED TEMPLATE")
        print(f"{_C_MAGENTA}{prompt_template}{_C_RESET}\n")
    else:
        _print_system("No template was generated during this session.")

    # ── User comment ─────────────────────────────────────────────
    _print_header("YOUR EVALUATION")
    print(" Write your comment about this interaction (press Enter twice to finish):")
    print()
    user_comment = _read_multiline_comment()

    # ── Judge ────────────────────────────────────────────────────
    _print_header("LLM JUDGE EVALUATION")
    _print_system(f"Scoring with {judge_model}...")

    criteria_scores, overall_quality, judge_reasoning = await _judge_interactive(
        conversation=conversation,
        prompt_template=prompt_template,
        user_comment=user_comment,
        criteria=fixture.expected_template_criteria,
        judge_model=judge_model,
    )

    # ── Display scores ───────────────────────────────────────────
    print()
    for criterion, score in criteria_scores.items():
        _print_score(criterion, score)

    # Mean of the per-criterion scores; reported separately from the judge's
    # own overall_quality figure.
    overall = (
        sum(criteria_scores.values()) / len(criteria_scores)
        if criteria_scores
        else 0.0
    )

    print(f"\n {_C_BOLD}Criteria avg: {overall:.2f}{_C_RESET}")
    print(f" {_C_BOLD}Overall quality: {overall_quality:.2f}{_C_RESET}")
    print(f" {_C_BOLD}Turns: {turns}{_C_RESET}")
    print(f" {_C_BOLD}Time: {elapsed:.1f}s{_C_RESET}")
    print(f"\n {_C_DIM}Judge: {judge_reasoning}{_C_RESET}")
    print(f" {_C_DIM}Your comment: {user_comment}{_C_RESET}\n")

    result = InteractiveResult(
        fixture_name=fixture.name,
        model=model,
        judge_model=judge_model,
        prompt_template=prompt_template,
        conversation=conversation,
        user_comment=user_comment,
        done=done,
        criteria_scores=criteria_scores,
        overall_score=overall_quality,
        judge_reasoning=judge_reasoning,
        elapsed_seconds=elapsed,
    )

    # ── Report to Langfuse ───────────────────────────────────────
    trace_id = langfuse_eval.log_eval_trace(
        fixture_name=fixture.name,
        model=model,
        prompt_variant="interactive",
        prompt_template=prompt_template or "(not generated)",
        actual_mutations=[{
            "conversation": conversation[:30],
            "user_comment": user_comment,
        }],
        scores_summary=result.summary(),
        langfuse_prompt_names=["journey_system"],
    )

    if trace_id:
        from eval.scorer import EvalScores

        # The interactive session has no precision/recall of its own: the
        # criteria average stands in for precision and f1, and recall records
        # whether the journey completed.
        scores_obj = EvalScores(
            fixture_name=fixture.name,
            model=model,
            prompt_variant="interactive",
            precision=overall,
            recall=float(done),
            f1=overall,
            llm_judge_score=overall_quality,
            llm_judge_reasoning=judge_reasoning,
        )
        langfuse_eval.post_eval_scores(scores_obj, trace_id=trace_id)
        _print_system(f"Results reported to Langfuse (trace: {trace_id})")
    else:
        _print_system("Langfuse not configured — results not reported.")

    return result
|
||||
@@ -94,7 +94,7 @@ async def _judge_template(
|
||||
|
||||
Returns (criteria_scores, reasoning).
|
||||
"""
|
||||
from app.llm import get_llm
|
||||
from shared.llm import get_llm
|
||||
|
||||
llm = get_llm(model=judge_model, temperature=0)
|
||||
|
||||
@@ -152,13 +152,23 @@ async def run_single_journey_eval(
|
||||
model: str,
|
||||
*,
|
||||
judge_model: str = "gpt-4o-mini",
|
||||
data_dir: Path | None = None,
|
||||
) -> JourneyEvalResult:
|
||||
"""Execute one journey eval: start → messages → score template."""
|
||||
"""Execute one journey eval: start \u2192 messages \u2192 score template."""
|
||||
from shared.config import settings
|
||||
|
||||
# Build mock executor for filesystem tools
|
||||
# When data_dir is given, use its parent as MockExecutor root
|
||||
# and its name as the journey directory so the LLM sees a
|
||||
# meaningful path (not ".").
|
||||
if data_dir:
|
||||
mock_root = data_dir.parent
|
||||
journey_directory = data_dir.name
|
||||
else:
|
||||
mock_root = fixture.fixture_path.parent
|
||||
journey_directory = fixture.directory
|
||||
|
||||
mock = MockExecutor(
|
||||
fixture_dir=fixture.fixture_dir,
|
||||
fixture_dir=mock_root,
|
||||
seed_records={},
|
||||
)
|
||||
|
||||
@@ -178,7 +188,7 @@ async def run_single_journey_eval(
|
||||
done = False
|
||||
|
||||
try:
|
||||
from app.ws_context import set_current_user, clear_current_user
|
||||
from shared.ws_context import set_current_user, clear_current_user
|
||||
from app.journey import handle_journey_start, handle_journey_message, _sessions
|
||||
|
||||
set_current_user(eval_user_id)
|
||||
@@ -186,7 +196,7 @@ async def run_single_journey_eval(
|
||||
# ── Start the journey ────────────────────────────────
|
||||
start_frame: dict[str, Any] = {
|
||||
"agent_type": "local",
|
||||
"directory": fixture.directory,
|
||||
"directory": journey_directory,
|
||||
"data_types": fixture.data_types,
|
||||
"session_id": f"eval-{uuid.uuid4().hex[:8]}",
|
||||
}
|
||||
@@ -246,7 +256,7 @@ async def run_single_journey_eval(
|
||||
logger.error("journey_eval: pipeline failed for %s/%s: %s", fixture.name, model, exc)
|
||||
finally:
|
||||
settings.LLM_MODEL = original_model
|
||||
from app.ws_context import clear_current_user
|
||||
from shared.ws_context import clear_current_user
|
||||
clear_current_user()
|
||||
|
||||
elapsed = time.time() - start_time
|
||||
@@ -297,6 +307,7 @@ async def run_single_journey_eval(
|
||||
prompt_template=prompt_template or "(not generated)",
|
||||
actual_mutations=[{"conversation": conversation[:20]}],
|
||||
scores_summary=result.summary(),
|
||||
langfuse_prompt_names=["journey_system"],
|
||||
)
|
||||
|
||||
if trace_id:
|
||||
@@ -321,6 +332,7 @@ async def run_journey_fixture_eval(
|
||||
models: list[str],
|
||||
*,
|
||||
judge_model: str = "gpt-4o-mini",
|
||||
data_dir: Path | None = None,
|
||||
) -> list[JourneyEvalResult]:
|
||||
"""Run all models for a journey fixture."""
|
||||
langfuse_eval.sync_journey_fixture_to_dataset(fixture)
|
||||
@@ -329,6 +341,7 @@ async def run_journey_fixture_eval(
|
||||
for model in models:
|
||||
result = await run_single_journey_eval(
|
||||
fixture, model, judge_model=judge_model,
|
||||
data_dir=data_dir,
|
||||
)
|
||||
results.append(result)
|
||||
|
||||
|
||||
@@ -1,21 +1,21 @@
|
||||
"""Langfuse evaluation integration — datasets, runs, and scoring.
|
||||
|
||||
Uses the Langfuse Python SDK to:
|
||||
Uses the Langfuse Python SDK v4 (OpenTelemetry-based) to:
|
||||
|
||||
1. **Sync fixtures → Langfuse datasets**: Each YAML fixture becomes a dataset,
|
||||
each prompt variant + expected pair becomes a dataset item.
|
||||
|
||||
2. **Track eval runs**: Each (fixture × model × prompt_variant) execution
|
||||
is recorded as a dataset run with linked traces and scores.
|
||||
is recorded as a trace with linked scores.
|
||||
|
||||
3. **Post scores**: precision, recall, F1, field_accuracy, llm_judge are
|
||||
posted as numeric scores on the trace/run.
|
||||
posted as numeric scores on the trace.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
from typing import Any
|
||||
|
||||
from shared.config import settings
|
||||
@@ -26,16 +26,16 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _get_langfuse():
|
||||
"""Get or create a Langfuse client instance."""
|
||||
"""Get or create a Langfuse client instance (SDK v4)."""
|
||||
if not settings.LANGFUSE_SECRET_KEY or not settings.LANGFUSE_PUBLIC_KEY:
|
||||
return None
|
||||
try:
|
||||
from langfuse import Langfuse
|
||||
return Langfuse(
|
||||
secret_key=settings.LANGFUSE_SECRET_KEY,
|
||||
public_key=settings.LANGFUSE_PUBLIC_KEY,
|
||||
host=settings.LANGFUSE_HOST,
|
||||
)
|
||||
os.environ.setdefault("LANGFUSE_SECRET_KEY", settings.LANGFUSE_SECRET_KEY)
|
||||
os.environ.setdefault("LANGFUSE_PUBLIC_KEY", settings.LANGFUSE_PUBLIC_KEY)
|
||||
if settings.LANGFUSE_HOST:
|
||||
os.environ.setdefault("LANGFUSE_HOST", settings.LANGFUSE_HOST)
|
||||
from langfuse import get_client
|
||||
return get_client()
|
||||
except Exception as exc:
|
||||
logger.warning("langfuse_eval: failed to create client: %s", exc)
|
||||
return None
|
||||
@@ -61,35 +61,44 @@ def sync_fixture_to_dataset(fixture: EvalFixture) -> str | None:
|
||||
lf.create_dataset(
|
||||
name=dataset_name,
|
||||
description=fixture.description,
|
||||
metadata={"data_types": fixture.data_types, "file_extensions": fixture.file_extensions},
|
||||
metadata={
|
||||
"data_types": ",".join(fixture.data_types),
|
||||
"file_extensions": ",".join(fixture.file_extensions) if fixture.file_extensions else "",
|
||||
},
|
||||
)
|
||||
except Exception:
|
||||
# Dataset may already exist — that's fine
|
||||
pass
|
||||
|
||||
expected_output = {}
|
||||
for rec in fixture.expected:
|
||||
expected_output.setdefault(rec.table, []).append(rec.fields)
|
||||
# Build expected_output appropriate to the fixture's mode
|
||||
expected_output: dict[str, Any] = {}
|
||||
if fixture.mode in ("step1", "full") and fixture.expected_classification:
|
||||
expected_output["classifications"] = [
|
||||
{"file": ec.file, "project_id": ec.project_id, "domains": ec.domains}
|
||||
for ec in fixture.expected_classification
|
||||
]
|
||||
if fixture.mode in ("step2", "full") and fixture.expected:
|
||||
for rec in fixture.expected:
|
||||
expected_output.setdefault(rec.table, []).append(rec.fields)
|
||||
|
||||
for variant_name, prompt_template in fixture.prompt_variants.items():
|
||||
item_id = f"{fixture.name}--{variant_name}"
|
||||
try:
|
||||
lf.create_dataset_item(
|
||||
dataset_name=dataset_name,
|
||||
id=item_id,
|
||||
input={
|
||||
"directory": fixture.directory,
|
||||
"data_types": fixture.data_types,
|
||||
"prompt_template": prompt_template,
|
||||
"seed_records": fixture.seed_records,
|
||||
},
|
||||
expected_output=expected_output,
|
||||
metadata={"prompt_variant": variant_name},
|
||||
)
|
||||
except Exception as exc:
|
||||
logger.warning(
|
||||
"langfuse_eval: failed to upsert dataset item %s: %s", item_id, exc
|
||||
)
|
||||
item_id = f"{fixture.name}--{fixture.mode}"
|
||||
try:
|
||||
lf.create_dataset_item(
|
||||
dataset_name=dataset_name,
|
||||
id=item_id,
|
||||
input={
|
||||
"directory": fixture.directory,
|
||||
"data_types": fixture.data_types,
|
||||
"mode": fixture.mode,
|
||||
"seed_records": fixture.seed_records,
|
||||
},
|
||||
expected_output=expected_output,
|
||||
metadata={"mode": fixture.mode},
|
||||
)
|
||||
except Exception as exc:
|
||||
logger.warning(
|
||||
"langfuse_eval: failed to upsert dataset item %s: %s", item_id, exc
|
||||
)
|
||||
|
||||
lf.flush()
|
||||
logger.info("langfuse_eval: synced fixture '%s' → dataset '%s'", fixture.name, dataset_name)
|
||||
@@ -114,7 +123,7 @@ def sync_journey_fixture_to_dataset(fixture) -> str | None:
|
||||
lf.create_dataset(
|
||||
name=dataset_name,
|
||||
description=fixture.description,
|
||||
metadata={"type": "journey", "data_types": fixture.data_types},
|
||||
metadata={"type": "journey", "data_types": ",".join(fixture.data_types)},
|
||||
)
|
||||
except Exception:
|
||||
pass # Dataset may already exist
|
||||
@@ -148,18 +157,26 @@ def create_eval_run(
|
||||
*,
|
||||
metadata: dict[str, Any] | None = None,
|
||||
) -> str:
|
||||
"""Create a dataset run in Langfuse. Returns the run name."""
|
||||
"""Create a dataset run in Langfuse. Returns the run name.
|
||||
|
||||
Note: In SDK v4, dataset runs are created implicitly via
|
||||
dataset.run_experiment(). This function is kept for backwards
|
||||
compatibility but may not create a run.
|
||||
"""
|
||||
lf = _get_langfuse()
|
||||
if lf is None:
|
||||
return run_name
|
||||
|
||||
try:
|
||||
lf.create_dataset_run(
|
||||
dataset_name=dataset_name,
|
||||
run_name=run_name,
|
||||
metadata=metadata or {},
|
||||
)
|
||||
lf.flush()
|
||||
if hasattr(lf, "create_dataset_run"):
|
||||
lf.create_dataset_run(
|
||||
dataset_name=dataset_name,
|
||||
run_name=run_name,
|
||||
metadata=metadata or {},
|
||||
)
|
||||
lf.flush()
|
||||
else:
|
||||
logger.debug("langfuse_eval: create_dataset_run not available in SDK v4")
|
||||
except Exception as exc:
|
||||
logger.warning("langfuse_eval: failed to create run %s: %s", run_name, exc)
|
||||
|
||||
@@ -185,21 +202,22 @@ def post_eval_scores(
|
||||
("precision", scores.precision),
|
||||
("recall", scores.recall),
|
||||
("f1", scores.f1),
|
||||
("field_accuracy", scores.field_accuracy),
|
||||
]
|
||||
# Only post field_accuracy when there are field-level scores (step2/full)
|
||||
if scores.field_scores:
|
||||
score_data.append(("field_accuracy", scores.field_accuracy))
|
||||
if scores.llm_judge_score is not None:
|
||||
score_data.append(("llm_judge", scores.llm_judge_score))
|
||||
|
||||
for name, value in score_data:
|
||||
try:
|
||||
kwargs: dict[str, Any] = {
|
||||
"name": name,
|
||||
"value": value,
|
||||
"comment": f"{scores.fixture_name} | {scores.model} | {scores.prompt_variant}",
|
||||
}
|
||||
if trace_id:
|
||||
kwargs["trace_id"] = trace_id
|
||||
lf.score(**kwargs)
|
||||
lf.create_score(
|
||||
name=name,
|
||||
value=value,
|
||||
trace_id=trace_id,
|
||||
data_type="NUMERIC",
|
||||
comment=f"{scores.fixture_name} | {scores.model} | {scores.prompt_variant}",
|
||||
)
|
||||
except Exception as exc:
|
||||
logger.warning("langfuse_eval: failed to post score %s: %s", name, exc)
|
||||
|
||||
@@ -218,12 +236,20 @@ def log_eval_trace(
|
||||
prompt_template: str,
|
||||
actual_mutations: list[dict],
|
||||
scores_summary: dict[str, Any],
|
||||
step1_results: list[dict] | None = None,
|
||||
dataset_name: str | None = None,
|
||||
run_name: str | None = None,
|
||||
dataset_item_id: str | None = None,
|
||||
langfuse_prompt_names: list[str] | None = None,
|
||||
) -> str | None:
|
||||
"""Create a Langfuse trace for one eval execution and link it to a dataset run.
|
||||
|
||||
Uses SDK v4 observation API (traces are created implicitly by root spans).
|
||||
``langfuse_prompt_names`` can contain one or two prompt names to link
|
||||
(e.g. ``["batch_file_classifier", "batch_processing"]`` for full mode).
|
||||
Each prompt gets its own generation-type observation for per-version
|
||||
metrics tracking.
|
||||
|
||||
Returns the trace_id, or None if Langfuse is unavailable.
|
||||
"""
|
||||
lf = _get_langfuse()
|
||||
@@ -231,38 +257,71 @@ def log_eval_trace(
|
||||
return None
|
||||
|
||||
try:
|
||||
trace = lf.trace(
|
||||
name=f"eval-{fixture_name}",
|
||||
input={
|
||||
"prompt_template": prompt_template,
|
||||
"model": model,
|
||||
"prompt_variant": prompt_variant,
|
||||
},
|
||||
output={
|
||||
"mutations": actual_mutations[:50],
|
||||
"scores": scores_summary,
|
||||
},
|
||||
from langfuse import propagate_attributes
|
||||
|
||||
# Fetch prompt objects for linking
|
||||
prompt_objs: list[tuple[str, Any]] = []
|
||||
for pname in (langfuse_prompt_names or []):
|
||||
try:
|
||||
obj = lf.get_prompt(name=pname, cache_ttl_seconds=300)
|
||||
prompt_objs.append((pname, obj))
|
||||
logger.info("langfuse_eval: linked prompt '%s' (type=%s)", pname, type(obj).__name__)
|
||||
except Exception as exc:
|
||||
logger.warning("langfuse_eval: prompt '%s' not found — %s", pname, exc)
|
||||
|
||||
# Build trace output dict
|
||||
trace_output: dict[str, Any] = {"scores": scores_summary}
|
||||
if step1_results:
|
||||
trace_output["classifications"] = step1_results
|
||||
if actual_mutations:
|
||||
trace_output["mutations"] = actual_mutations[:50]
|
||||
|
||||
with propagate_attributes(
|
||||
trace_name=f"eval-{fixture_name}",
|
||||
metadata={
|
||||
"eval": True,
|
||||
"eval": "true",
|
||||
"fixture": fixture_name,
|
||||
"model": model,
|
||||
"prompt_variant": prompt_variant,
|
||||
},
|
||||
tags=["eval", f"model:{model}", f"variant:{prompt_variant}"],
|
||||
)
|
||||
):
|
||||
# Root span for the eval run
|
||||
span = lf.start_observation(name=f"eval-{fixture_name}")
|
||||
span.update(
|
||||
input={
|
||||
"prompt_template": prompt_template,
|
||||
"model": model,
|
||||
"prompt_variant": prompt_variant,
|
||||
},
|
||||
output=trace_output,
|
||||
)
|
||||
trace_id = span.trace_id
|
||||
|
||||
# Link to dataset run if available
|
||||
if dataset_name and run_name and dataset_item_id:
|
||||
try:
|
||||
dataset = lf.get_dataset(dataset_name)
|
||||
item = dataset.get_item(dataset_item_id)
|
||||
if item:
|
||||
item.link(trace, run_name)
|
||||
except Exception as exc:
|
||||
logger.warning("langfuse_eval: failed to link trace to dataset run: %s", exc)
|
||||
# Create a generation-type observation per linked prompt
|
||||
for pname, pobj in prompt_objs:
|
||||
gen = lf.start_observation(
|
||||
name=f"prompt-{pname}",
|
||||
prompt=pobj,
|
||||
as_type="generation",
|
||||
)
|
||||
gen.end()
|
||||
|
||||
# Link to dataset run if available
|
||||
if dataset_name and run_name and dataset_item_id:
|
||||
try:
|
||||
dataset = lf.get_dataset(dataset_name)
|
||||
for item in dataset.items:
|
||||
if item.id == dataset_item_id:
|
||||
item.link(span, run_name)
|
||||
break
|
||||
except Exception as exc:
|
||||
logger.warning("langfuse_eval: failed to link trace to dataset run: %s", exc)
|
||||
|
||||
span.end()
|
||||
|
||||
lf.flush()
|
||||
return trace.id
|
||||
return trace_id
|
||||
except Exception as exc:
|
||||
logger.warning("langfuse_eval: failed to create eval trace: %s", exc)
|
||||
return None
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
"""Mock executor — intercepts execute_on_client for offline E2E testing.
|
||||
|
||||
Patches ``app.ws_context.execute_on_client`` so agent pipeline runs don't
|
||||
Patches ``execute_on_client`` at all usage sites so agent pipeline runs don't
|
||||
require a live Electron client or Redis. Instead:
|
||||
|
||||
- **Filesystem actions** (list_directory, read_file_content, get_file_metadata)
|
||||
@@ -20,6 +20,7 @@ import uuid
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
from contextlib import contextmanager, asynccontextmanager
|
||||
from unittest.mock import AsyncMock, patch
|
||||
|
||||
|
||||
@@ -33,6 +34,30 @@ class Mutation:
|
||||
timestamp: float = field(default_factory=time.time)
|
||||
|
||||
|
||||
# ── Fake DB helpers (used to bypass async_session in full mode) ───────
|
||||
|
||||
class _FakeRow:
|
||||
"""Mimics an AgentRunLog row returned by SQLAlchemy."""
|
||||
id = 0
|
||||
status = "running"
|
||||
items_processed = 0
|
||||
items_created = 0
|
||||
errors: list[str] = []
|
||||
completed_at = None
|
||||
|
||||
def __setattr__(self, name: str, value: Any) -> None:
|
||||
object.__setattr__(self, name, value)
|
||||
|
||||
|
||||
class _FakeResult:
|
||||
"""Mimics a SQLAlchemy ``Result`` with ``scalar_one_or_none``."""
|
||||
def __init__(self, row: _FakeRow) -> None:
|
||||
self._row = row
|
||||
|
||||
def scalar_one_or_none(self) -> _FakeRow:
|
||||
return self._row
|
||||
|
||||
|
||||
@dataclass
|
||||
class MockExecutor:
|
||||
"""In-memory executor that replaces Redis-based tool round-trip.
|
||||
@@ -77,12 +102,37 @@ class MockExecutor:
|
||||
|
||||
# ── Context manager for patching ──────────────────────────────
|
||||
|
||||
@contextmanager
def patch(self):
    """Patch ``execute_on_client`` and the DB session at all usage sites.

    While the context is active:

    - every listed ``execute_on_client`` reference is replaced by one
      ``AsyncMock`` whose side effect is ``self._handle``;
    - ``app.agent_runner.async_session`` is replaced by an async context
      manager yielding a fake DB object, so ``run_local_agent`` /
      ``_finalize_run`` skip the real database.

    Patches are entered through an ``ExitStack``: if any target fails to
    patch, the ones already applied are undone instead of leaking, and on
    exit all patches are removed in reverse order.

    Yields:
        The ``AsyncMock`` installed in place of ``execute_on_client``.
    """
    from contextlib import ExitStack

    mock_fn = AsyncMock(side_effect=self._handle)
    targets = [
        "shared.ws_context.execute_on_client",
        "app.agent_runner.execute_on_client",
        "app.agents.filesystem_agent.execute_on_client",
    ]

    # Mock async_session so run_local_agent / _finalize_run skip real DB
    fake_row = _FakeRow()
    fake_db = AsyncMock()
    fake_db.commit = AsyncMock()
    fake_db.refresh = AsyncMock()
    fake_db.execute = AsyncMock(return_value=_FakeResult(fake_row))
    # A plain function, so calling ``add`` does not create an un-awaited
    # coroutine the way an AsyncMock attribute would.
    fake_db.add = lambda obj: None  # noqa: ARG005

    @asynccontextmanager
    async def _fake_session():
        yield fake_db

    with ExitStack() as stack:
        for target in targets:
            stack.enter_context(patch(target, new=mock_fn))
        stack.enter_context(patch("app.agent_runner.async_session", _fake_session))
        yield mock_fn
|
||||
|
||||
# ── Internal dispatch ─────────────────────────────────────────
|
||||
|
||||
|
||||
@@ -1,28 +1,31 @@
|
||||
"""Eval runner — orchestrates fixture → mock → agent pipeline → scoring.
|
||||
|
||||
For each (fixture × model × prompt_variant) combination:
|
||||
1. Build a MockExecutor with fixture data
|
||||
2. Patch execute_on_client
|
||||
3. Override LLM_MODEL in shared settings
|
||||
4. Run the batch agent pipeline (run_local_agent)
|
||||
5. Collect mutations from the mock
|
||||
6. Score against expected results (field match + optional LLM judge)
|
||||
7. Report scores to Langfuse
|
||||
8. Print results
|
||||
Supports three eval modes:
|
||||
|
||||
- **step1**: Test classification prompt only (``_STEP1_SYSTEM_PROMPT``).
|
||||
Calls the LLM with fixture-provided ``domain_definitions`` and
|
||||
``projects_list`` and compares output against ``expected_classification``.
|
||||
|
||||
- **step2**: Test processing prompt only (``_PROCESSING_SYSTEM_PROMPT``).
|
||||
Compiles the prompt with fixture-provided ``existing_context``,
|
||||
``project_context``, ``data_types``, and ``custom_prompt_section``,
|
||||
then runs the tool-calling loop. Mutations are scored against
|
||||
``expected`` records.
|
||||
|
||||
- **full**: Run ``run_local_agent()`` end-to-end (both steps).
|
||||
Scored on both classification and extraction.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import copy
|
||||
import json
|
||||
import logging
|
||||
import time
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from eval.config import EvalFixture, ExpectedRecord
|
||||
from eval.config import EvalFixture, ExpectedClassification
|
||||
from eval.mock_executor import MockExecutor
|
||||
from eval.scorer import (
|
||||
EvalScores,
|
||||
@@ -36,72 +39,193 @@ from eval import langfuse_eval
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
async def run_single_eval(
|
||||
# ── Step 1 runner ─────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
async def _run_step1(
    fixture: EvalFixture,
    model: str,
    mock: MockExecutor,
) -> list[dict[str, Any]]:
    """Run step-1 classification for each expected file.

    For every entry in ``fixture.expected_classification`` the file content
    is read through the mock executor and passed to ``_classify_file``
    together with the fixture's ``projects_list`` and ``data_types``.

    Args:
        fixture: Eval fixture providing the expected classifications, the
            project list and the configured data types.
        model: Not read by this function; kept so the step runners share
            one call shape.
        mock: Mock executor used to read file contents.

    Returns:
        A list of result dicts, one per expected file:
        ``[{file, project_id, domains, new_project_name}, ...]``
    """
    from app.agent_runner import _classify_file

    results: list[dict[str, Any]] = []
    for ec in fixture.expected_classification:
        # Read the file content through the mock
        file_result = await mock._handle(
            action="read_file_content",
            data={"path": ec.file},
        )
        file_content: str = file_result.get("content", "")

        project_id, domains, new_name = await _classify_file(
            file_path=ec.file,
            file_content=file_content,
            projects=fixture.projects_list,
            config_data_types=fixture.data_types,
        )
        results.append({
            "file": ec.file,
            "project_id": project_id,
            "domains": domains,
            "new_project_name": new_name,
        })
    return results
|
||||
|
||||
|
||||
def _score_step1(
|
||||
fixture: EvalFixture,
|
||||
results: list[dict[str, Any]],
|
||||
) -> tuple[float, float, float, str]:
|
||||
"""Score step-1 results. Returns (precision, recall, f1, reasoning)."""
|
||||
if not fixture.expected_classification:
|
||||
return 0.0, 0.0, 0.0, "No expected classifications"
|
||||
|
||||
total = len(fixture.expected_classification)
|
||||
matched = 0
|
||||
details: list[str] = []
|
||||
|
||||
for ec in fixture.expected_classification:
|
||||
actual = next((r for r in results if r["file"] == ec.file), None)
|
||||
if actual is None:
|
||||
details.append(f" MISS {ec.file}: not processed")
|
||||
continue
|
||||
|
||||
pid_ok = actual["project_id"] == ec.project_id
|
||||
domains_ok = set(actual["domains"]) == set(ec.domains) if ec.domains else True
|
||||
|
||||
if pid_ok and domains_ok:
|
||||
matched += 1
|
||||
details.append(f" OK {ec.file}: project={actual['project_id']}, domains={actual['domains']}")
|
||||
else:
|
||||
parts: list[str] = []
|
||||
if not pid_ok:
|
||||
parts.append(f"project expected={ec.project_id} got={actual['project_id']}")
|
||||
if not domains_ok:
|
||||
parts.append(f"domains expected={ec.domains} got={actual['domains']}")
|
||||
details.append(f" FAIL {ec.file}: {'; '.join(parts)}")
|
||||
|
||||
precision = matched / total if total > 0 else 0.0
|
||||
recall = precision # in step1, precision == recall (same denominator)
|
||||
f1 = precision # same
|
||||
reasoning = "\n".join(details)
|
||||
return precision, recall, f1, reasoning
|
||||
|
||||
|
||||
# ── Step 2 runner ─────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
async def _run_step2(
    fixture: EvalFixture,
    model: str,
    mock: MockExecutor,
) -> None:
    """Run step-2 processing for each file in the fixture directory.

    Compiles ``_PROCESSING_SYSTEM_PROMPT`` with fixture-provided variables
    and runs the tool-calling loop once per file. Nothing is returned.

    Args:
        fixture: Eval fixture providing the prompt variables, the directory
            to scan, the data types and an optional extension filter.
        model: Not read by this function; kept so the step runners share
            one call shape.
        mock: Mock executor used to list the directory and read files.
    """
    from app.agent_runner import (
        _PROCESSING_SYSTEM_PROMPT,
        _build_processing_tools,
        _run_agent_with_tools,
        _MAX_PROCESSING_STEPS,
    )
    from app import tracing

    # Compile the processing prompt with fixture variables
    system_prompt = tracing.compile_prompt(
        "batch_processing",
        fallback=_PROCESSING_SYSTEM_PROMPT,
        variables={
            "existing_context": fixture.existing_context,
            "project_context": fixture.project_context,
            "data_types": ", ".join(fixture.data_types),
            "custom_prompt_section": fixture.custom_prompt_section,
        },
    )

    tools = _build_processing_tools(fixture.data_types)

    # Scan files in the fixture directory
    file_entries = await mock._handle(
        action="list_directory",
        data={"path": fixture.directory},
    )
    for entry in file_entries.get("entries", []):
        if entry.get("type") != "file":
            continue
        # Filter by extension if specified (text after the last dot,
        # compared as-is against fixture.file_extensions)
        if fixture.file_extensions:
            ext = entry["name"].rsplit(".", 1)[-1] if "." in entry["name"] else ""
            if ext not in fixture.file_extensions:
                continue

        file_result = await mock._handle(
            action="read_file_content",
            data={"path": entry["path"]},
        )
        file_content: str = file_result.get("content", "")
        # Skip files with no usable content
        if not file_content.strip():
            continue

        await _run_agent_with_tools(
            system_prompt=system_prompt,
            user_message=(
                f"Process this file and extract relevant information.\n\n"
                f"File: {entry['path']}\n\nContent:\n{file_content}"
            ),
            tools=tools,
            max_steps=_MAX_PROCESSING_STEPS,
        )
|
||||
|
||||
|
||||
# ── Full runner ───────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
async def _run_full(
    fixture: EvalFixture,
    model: str,
    mock: MockExecutor,
    user_id: str,
) -> None:
    """Run the full two-step pipeline via ``run_local_agent``.

    Builds an ``agent_trigger`` payload from the fixture and invokes
    ``run_local_agent`` inside ``mock.patch()``. Nothing is returned.

    Args:
        fixture: Eval fixture providing directory, data types, file
            extensions and the custom prompt section.
        model: Not read by this function; kept so the step runners share
            one call shape.
        mock: Mock executor whose ``patch()`` context wraps the run.
        user_id: User id passed through to ``run_local_agent``.
    """
    from app.agent_runner import run_local_agent

    # Build trigger data (same shape as what redis_consumer delivers)
    trigger_data: dict[str, Any] = {
        "type": "agent_trigger",
        "directory": fixture.directory,
        "directory_paths": [fixture.directory],
        "data_types": fixture.data_types,
        "file_extensions": fixture.file_extensions,
        "prompt_template": fixture.custom_prompt_section,
        "device_id": "eval-harness",
        "run_context": {
            "agent_id": f"eval-{fixture.name}",
            "run_id": None,
        },
    }

    with mock.patch():
        await run_local_agent(user_id, trigger_data)
|
||||
|
||||
logger.info(
|
||||
"eval: starting %s | model=%s | variant=%s",
|
||||
fixture.name, model, prompt_variant,
|
||||
)
|
||||
start_time = time.time()
|
||||
|
||||
try:
|
||||
# Patch execute_on_client + set user context, then run the pipeline
|
||||
from app.ws_context import set_current_user, clear_current_user
|
||||
from app.agent_runner import run_local_agent
|
||||
# ── Scoring helpers ───────────────────────────────────────────────────────
|
||||
|
||||
set_current_user(eval_user_id)
|
||||
with mock.patch():
|
||||
await run_local_agent(eval_user_id, trigger_data)
|
||||
except Exception as exc:
|
||||
logger.error("eval: pipeline failed for %s/%s/%s: %s", fixture.name, model, prompt_variant, exc)
|
||||
finally:
|
||||
settings.LLM_MODEL = original_model
|
||||
from app.ws_context import clear_current_user
|
||||
clear_current_user()
|
||||
|
||||
elapsed = time.time() - start_time
|
||||
logger.info("eval: pipeline completed in %.1fs — %d mutations", elapsed, len(mock.mutations))
|
||||
def _score_mutations(
|
||||
fixture: EvalFixture,
|
||||
mock: MockExecutor,
|
||||
) -> tuple[list[FieldScore], float, float, float, int, int]:
|
||||
"""Score mutations against expected records.
|
||||
|
||||
# ── Score results ────────────────────────────────────────────
|
||||
Returns (field_scores, precision, recall, f1, extra, missing).
|
||||
"""
|
||||
all_field_scores: list[FieldScore] = []
|
||||
total_expected = 0
|
||||
total_actual = 0
|
||||
@@ -109,12 +233,10 @@ async def run_single_eval(
|
||||
total_extra = 0
|
||||
total_missing = 0
|
||||
|
||||
# Group expected by table
|
||||
expected_by_table: dict[str, list[dict]] = {}
|
||||
for rec in fixture.expected:
|
||||
expected_by_table.setdefault(rec.table, []).append(rec.fields)
|
||||
|
||||
# Compare against actual mutations (inserts + updates)
|
||||
tables = set(expected_by_table.keys()) | {m.table for m in mock.mutations}
|
||||
for table in tables:
|
||||
expected_records = expected_by_table.get(table, [])
|
||||
@@ -131,49 +253,160 @@ async def run_single_eval(
|
||||
total_missing += missing
|
||||
|
||||
precision, recall, f1 = compute_precision_recall(total_expected, total_actual, total_matched)
|
||||
return all_field_scores, precision, recall, f1, total_extra, total_missing
|
||||
|
||||
scores = EvalScores(
|
||||
fixture_name=fixture.name,
|
||||
model=model,
|
||||
prompt_variant=prompt_variant,
|
||||
field_scores=all_field_scores,
|
||||
precision=precision,
|
||||
recall=recall,
|
||||
f1=f1,
|
||||
extra_records=total_extra,
|
||||
missing_records=total_missing,
|
||||
|
||||
# ── Main entry point ──────────────────────────────────────────────────────
|
||||
|
||||
|
||||
async def run_single_eval(
|
||||
fixture: EvalFixture,
|
||||
model: str,
|
||||
*,
|
||||
use_llm_judge: bool = True,
|
||||
judge_model: str = "gpt-4o-mini",
|
||||
) -> EvalScores:
|
||||
"""Execute one eval run for a fixture + model. Mode is read from the fixture."""
|
||||
from shared.config import settings
|
||||
from shared.ws_context import set_current_user, clear_current_user
|
||||
|
||||
seed = copy.deepcopy(fixture.seed_records)
|
||||
mock = MockExecutor(
|
||||
fixture_dir=fixture.fixture_path.parent,
|
||||
seed_records=seed,
|
||||
)
|
||||
|
||||
# ── Optional LLM judge ───────────────────────────────────────
|
||||
if use_llm_judge and fixture.expected:
|
||||
all_expected = [r.fields for r in fixture.expected]
|
||||
all_actual = [m.data for m in mock.mutations if m.action in ("insert", "update")]
|
||||
judge_score, reasoning = await llm_judge_score(
|
||||
all_expected, all_actual, judge_model=judge_model,
|
||||
)
|
||||
scores.llm_judge_score = judge_score
|
||||
scores.llm_judge_reasoning = reasoning
|
||||
original_model = settings.LLM_MODEL
|
||||
settings.LLM_MODEL = model
|
||||
eval_user_id = str(uuid.uuid4())
|
||||
|
||||
# ── Report to Langfuse ───────────────────────────────────────
|
||||
dataset_name = f"batch-eval-{fixture.name}"
|
||||
dataset_item_id = f"{fixture.name}--{prompt_variant}"
|
||||
run_name = f"{model}--{prompt_variant}--{int(time.time())}"
|
||||
logger.info(
|
||||
"eval: starting %s | mode=%s | model=%s",
|
||||
fixture.name, fixture.mode, model,
|
||||
)
|
||||
start_time = time.time()
|
||||
|
||||
step1_results: list[dict[str, Any]] = []
|
||||
step1_reasoning = ""
|
||||
|
||||
try:
|
||||
set_current_user(eval_user_id)
|
||||
|
||||
if fixture.mode == "step1":
|
||||
with mock.patch():
|
||||
step1_results = await _run_step1(fixture, model, mock)
|
||||
|
||||
elif fixture.mode == "step2":
|
||||
with mock.patch():
|
||||
await _run_step2(fixture, model, mock)
|
||||
|
||||
elif fixture.mode == "full":
|
||||
with mock.patch():
|
||||
# Step 1 — classification (independent from run_local_agent)
|
||||
if fixture.expected_classification:
|
||||
step1_results = await _run_step1(fixture, model, mock)
|
||||
|
||||
# Step 2 — full pipeline (run_local_agent handles both steps)
|
||||
await _run_full(fixture, model, mock, eval_user_id)
|
||||
|
||||
except Exception as exc:
|
||||
logger.error("eval: pipeline failed for %s/%s: %s", fixture.name, model, exc)
|
||||
finally:
|
||||
settings.LLM_MODEL = original_model
|
||||
clear_current_user()
|
||||
|
||||
elapsed = time.time() - start_time
|
||||
logger.info("eval: completed in %.1fs — %d mutations", elapsed, len(mock.mutations))
|
||||
|
||||
# ── Score ─────────────────────────────────────────────────────
|
||||
|
||||
if fixture.mode == "step1":
|
||||
s1_precision, s1_recall, s1_f1, step1_reasoning = _score_step1(fixture, step1_results)
|
||||
scores = EvalScores(
|
||||
fixture_name=fixture.name,
|
||||
model=model,
|
||||
prompt_variant=fixture.mode,
|
||||
precision=s1_precision,
|
||||
recall=s1_recall,
|
||||
f1=s1_f1,
|
||||
llm_judge_reasoning=step1_reasoning,
|
||||
)
|
||||
else:
|
||||
# step2 or full — score mutations
|
||||
field_scores, precision, recall, f1, extra, missing = _score_mutations(fixture, mock)
|
||||
scores = EvalScores(
|
||||
fixture_name=fixture.name,
|
||||
model=model,
|
||||
prompt_variant=fixture.mode,
|
||||
field_scores=field_scores,
|
||||
precision=precision,
|
||||
recall=recall,
|
||||
f1=f1,
|
||||
extra_records=extra,
|
||||
missing_records=missing,
|
||||
)
|
||||
|
||||
# Add step1 classification scores for full mode
|
||||
if fixture.mode == "full" and fixture.expected_classification:
|
||||
s1_p, s1_r, s1_f1, step1_reasoning = _score_step1(fixture, step1_results)
|
||||
scores.llm_judge_reasoning = f"Step1 classification:\n{step1_reasoning}"
|
||||
|
||||
# Optional LLM judge for extraction quality
|
||||
if use_llm_judge and fixture.expected:
|
||||
all_expected = [r.fields for r in fixture.expected]
|
||||
all_actual = [m.data for m in mock.mutations if m.action in ("insert", "update")]
|
||||
judge_score, reasoning = await llm_judge_score(
|
||||
all_expected, all_actual, judge_model=judge_model,
|
||||
)
|
||||
scores.llm_judge_score = judge_score
|
||||
if step1_reasoning:
|
||||
scores.llm_judge_reasoning += f"\n\nLLM judge:\n{reasoning}"
|
||||
else:
|
||||
scores.llm_judge_reasoning = reasoning
|
||||
|
||||
# ── Report to Langfuse ────────────────────────────────────────
|
||||
prompt_names = {
|
||||
"step1": ["batch_file_classifier"],
|
||||
"step2": ["batch_processing"],
|
||||
"full": ["batch_file_classifier", "batch_processing"],
|
||||
}.get(fixture.mode, ["batch_processing"])
|
||||
|
||||
trace_id = langfuse_eval.log_eval_trace(
|
||||
fixture_name=fixture.name,
|
||||
model=model,
|
||||
prompt_variant=prompt_variant,
|
||||
prompt_template=prompt_template,
|
||||
prompt_variant=fixture.mode,
|
||||
prompt_template=fixture.custom_prompt_section or "(default)",
|
||||
actual_mutations=[{"action": m.action, "table": m.table, "data": m.data} for m in mock.mutations],
|
||||
scores_summary=scores.summary(),
|
||||
dataset_name=dataset_name,
|
||||
run_name=run_name,
|
||||
dataset_item_id=dataset_item_id,
|
||||
step1_results=step1_results or None,
|
||||
langfuse_prompt_names=prompt_names,
|
||||
)
|
||||
|
||||
if trace_id:
|
||||
langfuse_eval.post_eval_scores(scores, trace_id=trace_id)
|
||||
|
||||
# For full mode, post classification scores separately
|
||||
if fixture.mode == "full" and fixture.expected_classification:
|
||||
s1_p, s1_r, s1_f1, _ = _score_step1(fixture, step1_results)
|
||||
for name, value in [
|
||||
("classification_precision", s1_p),
|
||||
("classification_recall", s1_r),
|
||||
("classification_f1", s1_f1),
|
||||
]:
|
||||
try:
|
||||
from langfuse import get_client
|
||||
lf = get_client()
|
||||
if lf:
|
||||
lf.create_score(
|
||||
name=name,
|
||||
value=value,
|
||||
trace_id=trace_id,
|
||||
data_type="NUMERIC",
|
||||
comment=f"{fixture.name} | {model} | full",
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return scores
|
||||
|
||||
|
||||
@@ -181,29 +414,20 @@ async def run_fixture_eval(
|
||||
fixture: EvalFixture,
|
||||
models: list[str],
|
||||
*,
|
||||
variants: list[str] | None = None,
|
||||
use_llm_judge: bool = True,
|
||||
judge_model: str = "gpt-4o-mini",
|
||||
) -> list[EvalScores]:
|
||||
"""Run all (model × variant) combinations for a fixture."""
|
||||
if variants is None:
|
||||
variants = list(fixture.prompt_variants.keys())
|
||||
|
||||
# Sync fixture to Langfuse dataset
|
||||
"""Run all models for a fixture."""
|
||||
langfuse_eval.sync_fixture_to_dataset(fixture)
|
||||
|
||||
results: list[EvalScores] = []
|
||||
for model in models:
|
||||
for variant in variants:
|
||||
if variant not in fixture.prompt_variants:
|
||||
logger.warning("eval: variant %r not found in fixture %s", variant, fixture.name)
|
||||
continue
|
||||
scores = await run_single_eval(
|
||||
fixture, model, variant,
|
||||
use_llm_judge=use_llm_judge,
|
||||
judge_model=judge_model,
|
||||
)
|
||||
results.append(scores)
|
||||
scores = await run_single_eval(
|
||||
fixture, model,
|
||||
use_llm_judge=use_llm_judge,
|
||||
judge_model=judge_model,
|
||||
)
|
||||
results.append(scores)
|
||||
|
||||
return results
|
||||
|
||||
@@ -214,18 +438,21 @@ def print_results(results: list[EvalScores]) -> None:
|
||||
print("\nNo eval results.")
|
||||
return
|
||||
|
||||
print("\n" + "=" * 90)
|
||||
print(f"{'Fixture':<25} {'Model':<25} {'Variant':<15} {'P':>6} {'R':>6} {'F1':>6} {'FA':>6} {'LLM':>6}")
|
||||
print("-" * 90)
|
||||
print("\n" + "=" * 95)
|
||||
print(f"{'Fixture':<25} {'Mode':<6} {'Model':<25} {'P':>6} {'R':>6} {'F1':>6} {'FA':>6} {'LLM':>6}")
|
||||
print("-" * 95)
|
||||
|
||||
for s in results:
|
||||
llm_str = f"{s.llm_judge_score:.2f}" if s.llm_judge_score is not None else " --"
|
||||
print(
|
||||
f"{s.fixture_name:<25} {s.model:<25} {s.prompt_variant:<15} "
|
||||
f"{s.fixture_name:<25} {s.prompt_variant:<6} {s.model:<25} "
|
||||
f"{s.precision:>6.2f} {s.recall:>6.2f} {s.f1:>6.2f} "
|
||||
f"{s.field_accuracy:>6.2f} {llm_str:>6}"
|
||||
)
|
||||
|
||||
print("=" * 95)
|
||||
print()
|
||||
|
||||
print("=" * 90)
|
||||
|
||||
# If LLM judge reasoning is available, print it
|
||||
|
||||
@@ -242,7 +242,7 @@ async def llm_judge_score(
|
||||
|
||||
Returns (score, reasoning).
|
||||
"""
|
||||
from app.llm import get_llm
|
||||
from shared.llm import get_llm
|
||||
|
||||
llm = get_llm(model=judge_model, temperature=0)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user