refactor(eval): 3-mode eval harness (step1/step2/full) with Langfuse fixes

- Rewrite eval config with EvalMode (step1, step2, full) replacing prompt_variants
- Rewrite runner with _run_step1, _run_step2, _run_full dispatch
- CLI: replace --variants with --mode flag
- Add 3 fixture YAMLs: classify_invoices (step1), process_invoices (step2), full_invoices (full)
- Remove old freelance_invoices fixture
- Langfuse: mode-aware dataset items (classifications for step1, extraction for step2, both for full)
- Langfuse: link both prompts (batch_file_classifier + batch_processing) in full mode
- Langfuse: post separate classification_precision/recall/f1 scores for full mode
- Langfuse: skip misleading field_accuracy=0 when field_scores is empty (step1)
- Langfuse: include step1_results in trace output
- MockExecutor: mock async_session to bypass DB in full mode
- Journey fixture: remove user_messages (only interactive test kept)
This commit is contained in:
Roberto Musso
2026-03-24 16:18:51 +01:00
parent 63fa119543
commit d3f7099d93
13 changed files with 1409 additions and 439 deletions

View File

@@ -4,14 +4,15 @@ Usage::
# From services/batch-agent/:
python -m eval run # all agent fixtures, default model
python -m eval run --fixture=freelance-invoices # single fixture
python -m eval run --models=gpt-4o,anthropic/claude-sonnet-4
python -m eval run --variants=baseline,detailed # specific prompt variants
python -m eval run --fixture=classify-invoices # single fixture
python -m eval run --models=gpt-4o,gpt-5.3-codex # multiple models
python -m eval run --mode=step1 # only step1 fixtures
python -m eval run --no-judge # skip LLM judge scoring
python -m eval journey # all journey fixtures
python -m eval journey --fixture=journey-invoices # single journey fixture
python -m eval journey --models=gpt-4o,anthropic/claude-sonnet-4
python -m eval interactive # interactive journey session
python -m eval interactive --fixture=journey-invoice-setup
python -m eval interactive --model=gpt-4o
python -m eval interactive --judge-model=github_copilot/gpt-4o-mini
python -m eval list # list all fixtures
python -m eval sync # sync fixtures to Langfuse datasets
@@ -25,16 +26,24 @@ import logging
import sys
from pathlib import Path
# Ensure the service root and repo root are in sys.path
# Ensure the service root and repo root are in sys.path.
# Service root must come BEFORE repo root so its ``app/`` package
# shadows the monolith ``app/`` in the repo root.
_SERVICE_ROOT = Path(__file__).resolve().parent.parent
_REPO_ROOT = _SERVICE_ROOT.parent.parent
for p in (_SERVICE_ROOT, _REPO_ROOT):
if str(p) not in sys.path:
sys.path.insert(0, str(p))
_sr = str(_SERVICE_ROOT)
_rr = str(_REPO_ROOT)
if _rr not in sys.path:
sys.path.insert(0, _rr)
# Always force service root to position 0 (python -m may have already
# added CWD further down the list, which loses to repo root).
if _sr in sys.path:
sys.path.remove(_sr)
sys.path.insert(0, _sr)
from eval.config import discover_fixtures, discover_journey_fixtures
from eval.runner import run_fixture_eval, print_results
from eval.journey_runner import run_journey_fixture_eval, print_journey_results
from eval.interactive import run_interactive
from eval import langfuse_eval
@@ -65,13 +74,14 @@ def _parse_args() -> argparse.Namespace:
)
run_cmd.add_argument(
"--models", "-m",
default="gpt-4o",
help="Comma-separated list of models to test (default: gpt-4o)",
default="github_copilot/gpt-5.3-codex",
help="Comma-separated list of models to test (default: github_copilot/gpt-5.3-codex)",
)
run_cmd.add_argument(
"--variants", "-p",
"--mode",
default=None,
help="Comma-separated prompt variants to test (default: all in fixture)",
choices=["step1", "step2", "full"],
help="Only run fixtures with this mode (default: all)",
)
run_cmd.add_argument(
"--no-judge",
@@ -80,8 +90,8 @@ def _parse_args() -> argparse.Namespace:
)
run_cmd.add_argument(
"--judge-model",
default="gpt-4o-mini",
help="Model for LLM judge (default: gpt-4o-mini)",
default="gpt-4o",
help="Model for LLM judge (default: gpt-4o)",
)
run_cmd.add_argument(
"--fixtures-dir",
@@ -95,35 +105,40 @@ def _parse_args() -> argparse.Namespace:
list_cmd.add_argument("--fixtures-dir", default=None)
list_cmd.add_argument("-v", "--verbose", action="store_true")
# ── journey ───────────────────────────────────────────────────
journey_cmd = sub.add_parser("journey", help="Run journey evaluations")
journey_cmd.add_argument(
"--fixture", "-f",
help="Run only the named journey fixture (default: all)",
)
journey_cmd.add_argument(
"--models", "-m",
default="gpt-4o",
help="Comma-separated list of models to test (default: gpt-4o)",
)
journey_cmd.add_argument(
"--judge-model",
default="gpt-4o-mini",
help="Model for LLM judge (default: gpt-4o-mini)",
)
journey_cmd.add_argument(
"--fixtures-dir",
default=None,
help="Path to fixtures directory (default: eval/fixtures/)",
)
journey_cmd.add_argument("-v", "--verbose", action="store_true")
# ── sync ──────────────────────────────────────────────────────
sync_cmd = sub.add_parser("sync", help="Sync fixtures to Langfuse datasets")
sync_cmd.add_argument("--fixture", "-f", default=None, help="Sync only the named fixture")
sync_cmd.add_argument("--fixtures-dir", default=None)
sync_cmd.add_argument("-v", "--verbose", action="store_true")
# ── interactive ───────────────────────────────────────────────
inter_cmd = sub.add_parser("interactive", help="Interactive journey session (human-in-the-loop)")
inter_cmd.add_argument(
"--fixture", "-f",
help="Journey fixture to use (default: pick interactively)",
)
inter_cmd.add_argument(
"--model", "-m",
default="github_copilot/gpt-5.3-codex",
help="Model for the journey AI (default: github_copilot/gpt-5.3-codex)",
)
inter_cmd.add_argument(
"--judge-model",
default="gpt-4o",
help="Model for LLM judge (default: gpt-4o)",
)
inter_cmd.add_argument(
"--fixtures-dir",
default=None,
help="Path to fixtures directory (default: eval/fixtures/)",
)
inter_cmd.add_argument(
"--data-dir",
default=None,
help="Override sample data directory (e.g. path to private test files not in git)",
)
inter_cmd.add_argument("-v", "--verbose", action="store_true")
return parser.parse_args()
@@ -146,14 +161,14 @@ async def _cmd_run(args: argparse.Namespace) -> None:
return
models = [m.strip() for m in args.models.split(",")]
variants = [v.strip() for v in args.variants.split(",")] if args.variants else None
all_results = []
for fixture in fixtures:
if args.mode and fixture.mode != args.mode:
continue
results = await run_fixture_eval(
fixture,
models=models,
variants=variants,
use_llm_judge=not args.no_judge,
judge_model=args.judge_model,
)
@@ -172,12 +187,12 @@ def _cmd_list(args: argparse.Namespace) -> None:
if fixtures:
print(f"\n{'[Agent Fixtures]'}")
print(f"{'Name':<30} {'Types':<25} {'Variants':<20} {'Expected'}")
print(f"{'Name':<30} {'Mode':<6} {'Types':<25} {'Expected'}")
print("-" * 90)
for f in fixtures:
variants = ", ".join(f.prompt_variants.keys())
types = ", ".join(f.data_types)
print(f"{f.name:<30} {types:<25} {variants:<20} {len(f.expected)}")
n_expected = len(f.expected) + len(f.expected_classification)
print(f"{f.name:<30} {f.mode:<6} {types:<25} {n_expected}")
if journey_fixtures:
print(f"\n{'[Journey Fixtures]'}")
@@ -217,30 +232,39 @@ def _cmd_sync(args: argparse.Namespace) -> None:
print(f"Skipped: {fixture.name} (Langfuse not configured)")
async def _cmd_journey(args: argparse.Namespace) -> None:
async def _cmd_interactive(args: argparse.Namespace) -> None:
journey_fixtures = discover_journey_fixtures(_fixtures_dir(args.fixtures_dir))
if not journey_fixtures:
print("No journey fixtures found. Create YAML files with type: journey in eval/fixtures/.")
return
if args.fixture:
journey_fixtures = [f for f in journey_fixtures if f.name == args.fixture]
if not journey_fixtures:
fixtures = [f for f in journey_fixtures if f.name == args.fixture]
if not fixtures:
print(f"Journey fixture '{args.fixture}' not found.")
return
fixture = fixtures[0]
elif len(journey_fixtures) == 1:
fixture = journey_fixtures[0]
else:
# Let user pick
print("\nAvailable journey fixtures:")
for i, f in enumerate(journey_fixtures, 1):
print(f" {i}. {f.name}{f.description[:60]}")
print()
try:
choice = int(input("Pick a fixture number: ").strip()) - 1
fixture = journey_fixtures[choice]
except (ValueError, IndexError, EOFError, KeyboardInterrupt):
print("Invalid choice.")
return
models = [m.strip() for m in args.models.split(",")]
all_results = []
for fixture in journey_fixtures:
results = await run_journey_fixture_eval(
fixture,
models=models,
judge_model=args.judge_model,
)
all_results.extend(results)
print_journey_results(all_results)
await run_interactive(
fixture,
model=args.model,
judge_model=args.judge_model,
data_dir=Path(args.data_dir).resolve() if args.data_dir else None,
)
def main() -> None:
@@ -249,8 +273,8 @@ def main() -> None:
if args.command == "run":
asyncio.run(_cmd_run(args))
elif args.command == "journey":
asyncio.run(_cmd_journey(args))
elif args.command == "interactive":
asyncio.run(_cmd_interactive(args))
elif args.command == "list":
_cmd_list(args)
elif args.command == "sync":

View File

@@ -1,70 +1,16 @@
"""Eval configuration — YAML fixture loader and dataclasses.
A *fixture* is a YAML file that defines a complete test scenario:
Fixtures come in two families:
.. code-block:: yaml
1. **Agent fixtures** — test the batch agent pipeline.
Three modes controlled by ``mode``:
name: freelance-invoices
description: Extract tasks and notes from invoice PDFs (text layer)
directory: sample_files/invoices # relative to fixture dir
data_types: [tasks, notes]
file_extensions: [txt, md]
``step1`` — classification prompt only.
``step2`` — processing prompt only.
``full`` — both steps in sequence.
# Preseeded records the agent "sees" as existing data
seed_records:
projects:
- id: proj-1
name: "Website Redesign"
status: active
tasks: []
# Prompt variations to test (at least one required)
prompt_variants:
baseline: |
Extract action items as tasks and meeting summaries as notes.
Set priority based on urgency keywords.
detailed: |
Extract action items as tasks. Map "URGENT" to high priority,
"ASAP" to medium. Summaries become notes with full content.
# Expected extractions — what the agent SHOULD produce
expected:
tasks:
- title: "Send revised invoice to client"
priority: high
status: todo
- title: "Update project timeline"
priority: medium
notes:
- title: "Meeting summary - March kickoff"
# Optional: models to test (overrides CLI --models)
models: []
A *journey fixture* tests the prompt-template builder conversation:
.. code-block:: yaml
type: journey
name: journey-invoices
description: Test journey builds a good template for invoices
directory: sample_files/invoices
data_types: [tasks, notes]
# Simulated user responses for multi-turn conversation
user_messages:
- "I want to extract action items and meeting summaries"
- "Yes, map URGENTE to high priority"
- "That looks good, generate the template"
# Criteria the generated prompt_template should satisfy
expected_template_criteria:
- "mentions tasks and notes as target entities"
- "includes priority mapping rules"
- "references isAiSuggested=1"
- "does not mention projectId"
models: []
2. **Journey fixtures** — test the prompt-template builder conversation
(unchanged).
"""
from __future__ import annotations
@@ -72,12 +18,14 @@ from __future__ import annotations
import logging
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any
from typing import Any, Literal
import yaml
logger = logging.getLogger(__name__)
EvalMode = Literal["step1", "step2", "full"]
@dataclass
class ExpectedRecord:
@@ -90,21 +38,52 @@ class ExpectedRecord:
fields: dict[str, Any] # field_name → expected_value
@dataclass
class ExpectedClassification:
    """Expected step-1 classification outcome for a single sample file."""

    # Relative path to the sample file being classified.
    file: str
    # Id of the project the file is expected to match, or the literal "new".
    project_id: str
    # Domains the classifier is expected to report for this file.
    domains: list[str]
    # Optional expected project name; presumably only meaningful when
    # project_id is "new" -- TODO confirm against the runner/scorer.
    new_project_name: str | None = None
@dataclass
class EvalFixture:
"""A complete test scenario loaded from YAML."""
"""A complete test scenario loaded from YAML.
``mode`` determines which pipeline steps are exercised:
- **step1**: only ``_classify_file``
- **step2**: only the processing LLM + tool loop
- **full**: both steps in sequence (``run_local_agent``)
"""
name: str
description: str
mode: EvalMode
directory: str # relative path to sample files
data_types: list[str]
file_extensions: list[str]
seed_records: dict[str, list[dict]]
prompt_variants: dict[str, str] # variant_name → prompt_template
expected: list[ExpectedRecord]
models: list[str] # if empty, use CLI default
fixture_path: Path = field(default_factory=lambda: Path("."))
# ── Step-1 inputs (classification) ───────────────────────────
domain_definitions: str = ""
projects_list: list[dict[str, Any]] = field(default_factory=list)
# ── Step-2 inputs (processing) ───────────────────────────────
existing_context: str = ""
project_context: str = ""
custom_prompt_section: str = ""
# ── Seed records for mock executor ───────────────────────────
seed_records: dict[str, list[dict]] = field(default_factory=dict)
# ── Expected outputs ─────────────────────────────────────────
expected_classification: list[ExpectedClassification] = field(default_factory=list)
expected: list[ExpectedRecord] = field(default_factory=list)
@property
def fixture_dir(self) -> Path:
"""Absolute path to the sample files directory."""
@@ -115,22 +94,44 @@ class EvalFixture:
"""Load a fixture from a YAML file."""
raw = yaml.safe_load(path.read_text(encoding="utf-8"))
mode: EvalMode = raw.get("mode", "full")
# Parse expected records (step2/full)
expected: list[ExpectedRecord] = []
for table, records in (raw.get("expected") or {}).items():
for rec in records:
expected.append(ExpectedRecord(table=table, fields=rec))
# Parse expected classification (step1/full)
expected_classification: list[ExpectedClassification] = []
for item in raw.get("expected_classification") or []:
expected_classification.append(ExpectedClassification(
file=item["file"],
project_id=item["project_id"],
domains=item.get("domains", []),
new_project_name=item.get("new_project_name"),
))
return cls(
name=raw["name"],
description=raw.get("description", ""),
mode=mode,
directory=raw.get("directory", "sample_files"),
data_types=raw.get("data_types", ["tasks"]),
file_extensions=raw.get("file_extensions", []),
seed_records=raw.get("seed_records", {}),
prompt_variants=raw.get("prompt_variants", {"default": ""}),
expected=expected,
models=raw.get("models", []),
fixture_path=path,
# Step-1 inputs
domain_definitions=raw.get("domain_definitions", ""),
projects_list=raw.get("projects_list", []),
# Step-2 inputs
existing_context=raw.get("existing_context", ""),
project_context=raw.get("project_context", ""),
custom_prompt_section=raw.get("custom_prompt_section", ""),
# Shared
seed_records=raw.get("seed_records", {}),
expected_classification=expected_classification,
expected=expected,
)
@@ -168,9 +169,9 @@ class JourneyFixture:
description: str
directory: str # relative path to sample files
data_types: list[str]
user_messages: list[str] # simulated user responses
expected_template_criteria: list[str] # what the template should contain/satisfy
models: list[str]
user_messages: list[str] = field(default_factory=list) # for automated journey runs (unused in interactive mode)
models: list[str] = field(default_factory=list)
fixture_path: Path = field(default_factory=lambda: Path("."))
@property

View File

@@ -0,0 +1,40 @@
# Fixture: classify-invoices (step1)
# Tests _STEP1_SYSTEM_PROMPT — file classification and project matching.
# Verifies that the LLM correctly matches files to existing projects
# and identifies the right data domains.
name: classify-invoices
mode: step1
description: >
Test file classification on Italian freelance invoices and meeting notes.
Verifies project matching and domain identification.
directory: sample_files/invoices
data_types: [tasks, notes, timelines]
file_extensions: [txt, md]
# ── Step-1 prompt variables ──────────────────────────────────────
domain_definitions: |
- tasks: Action items, deliverables, things to do — anything that someone needs to complete.
- notes: Meeting summaries, decisions, reference information — permanent knowledge entries.
- timelines: Project milestones, deadlines, scheduled events — specific dates that mark a point in the progress of a project.
projects_list:
- id: "proj-web-redesign"
name: "Redesign Sito Web Corporate"
status: "active"
aiSummary: "Corporate website redesign for Studio Architettura Bianchi"
- id: "proj-ecommerce"
name: "E-Commerce FashionStore"
status: "active"
aiSummary: "Next.js e-commerce platform for FashionStore srl"
# ── Expected classification results ─────────────────────────────
expected_classification:
- file: "sample_files/invoices/fattura_042.txt"
project_id: "proj-web-redesign"
domains: [tasks, notes, timelines]
- file: "sample_files/invoices/meeting_ecommerce.md"
project_id: "proj-ecommerce"
domains: [tasks, notes, timelines]

View File

@@ -1,86 +0,0 @@
# Fixture: freelance-invoices
# Tests extraction of tasks, notes, and timelines from
# invoices and meeting notes typical of a freelance workflow.
name: freelance-invoices
description: >
Extract tasks, notes, and timeline events from Italian freelance
invoices and meeting notes. Tests project matching, priority
mapping, and bilingual content handling.
directory: sample_files/invoices
data_types: [tasks, notes, timelines]
file_extensions: [txt, md]
# Pre-existing records in the "database"
seed_records:
projects:
- id: "proj-web-redesign"
name: "Redesign Sito Web Corporate"
status: "active"
aiSummary: "Corporate website redesign for Studio Architettura Bianchi"
- id: "proj-ecommerce"
name: "E-Commerce FashionStore"
status: "active"
aiSummary: "Next.js e-commerce platform for FashionStore srl"
tasks: []
notes: []
timelines: []
# Prompt variations to compare
prompt_variants:
baseline: |
Extract action items as tasks and summaries as notes.
For timelines, extract any mentioned dates and deadlines.
Set isAiSuggested=1 on every record.
detailed_italian: |
Estrai i dati dai file come segue:
- TASK: ogni azione da fare, deliverable, o item con scadenza.
Mappa "URGENTE" o "ALTA PRIORITÀ" → priority: high.
Mappa "media priorità" → priority: medium.
Mappa "bassa priorità" → priority: low.
Se un item è marcato come "completato" o [x], impostalo status: done.
Altrimenti status: todo.
- NOTE: riassunti di meeting, decisioni prese, note tecniche.
Il titolo deve essere descrittivo. Il content deve includere tutti i dettagli.
- TIMELINE: date di scadenza, milestone, meeting futuri.
Formato data: timestamp Unix in millisecondi.
Imposta sempre isAiSuggested=1.
minimal: |
Extract only high-priority action items as tasks.
Ignore notes and timelines unless explicitly marked as important.
Set isAiSuggested=1.
# Expected extractions (what the agent SHOULD produce)
# Only key fields are specified — scorer uses fuzzy matching
expected:
tasks:
- title: "Sviluppo frontend React"
priority: "high"
status: "todo"
- title: "Integrazione API backend"
priority: "medium"
status: "todo"
- title: "Testing cross-browser e fix bug responsive"
status: "todo"
- title: "Preparare wireframe homepage"
priority: "high"
status: "todo"
- title: "Setup progetto Next.js e configurare CI/CD"
priority: "medium"
status: "todo"
- title: "Ricerca plugin Stripe per gestione abbonamenti"
priority: "low"
status: "todo"
notes:
- title: "Meeting Kickoff Progetto E-Commerce"
timelines:
- title: "MVP E-Commerce pronto"
- title: "Meeting di revisione"
# Models to test (can be overridden via CLI --models)
models: []

View File

@@ -0,0 +1,108 @@
# Fixture: full-invoices (full)
# Tests both _STEP1_SYSTEM_PROMPT and _PROCESSING_SYSTEM_PROMPT in sequence
# via run_local_agent(). Verifies end-to-end classification + extraction.
name: full-invoices
mode: full
description: >
End-to-end test: classify Italian invoices/meeting notes into the
correct project, then extract tasks, notes, and timeline events.
directory: sample_files/invoices
data_types: [tasks, notes, timelines]
file_extensions: [txt, md]
# ── Step-1 prompt variables ──────────────────────────────────────
domain_definitions: |
- tasks: Action items, deliverables, things to do — anything that someone needs to complete.
- notes: Meeting summaries, decisions, reference information — permanent knowledge entries.
- timelines: Project milestones, deadlines, scheduled events — specific dates that mark a point in the progress of a project.
projects_list:
- id: "proj-web-redesign"
name: "Redesign Sito Web Corporate"
status: "active"
aiSummary: "Corporate website redesign for Studio Architettura Bianchi"
- id: "proj-ecommerce"
name: "E-Commerce FashionStore"
status: "active"
aiSummary: "Next.js e-commerce platform for FashionStore srl"
# ── Step-2 prompt variables ──────────────────────────────────────
existing_context: |
Existing tasks:
(none)
Existing notes:
(none)
Existing timelines:
(none)
project_context: ""
custom_prompt_section: |
User instructions:
Estrai i dati dai file come segue:
- TASK: ogni azione da fare, deliverable, o item con scadenza.
Mappa "URGENTE" o "ALTA PRIORITÀ" → priority: high.
Mappa "media priorità" → priority: medium.
Mappa "bassa priorità" → priority: low.
Se un item è marcato come "completato" o [x], impostalo status: done.
Altrimenti status: todo.
- NOTE: riassunti di meeting, decisioni prese, note tecniche.
- TIMELINE: date di scadenza, milestone, meeting futuri.
Imposta sempre isAiSuggested=1.
# ── Seed records (pre-existing DB state) ─────────────────────────
seed_records:
projects:
- id: "proj-web-redesign"
name: "Redesign Sito Web Corporate"
status: "active"
aiSummary: "Corporate website redesign for Studio Architettura Bianchi"
- id: "proj-ecommerce"
name: "E-Commerce FashionStore"
status: "active"
aiSummary: "Next.js e-commerce platform for FashionStore srl"
tasks: []
notes: []
timelines: []
# ── Expected classification (step 1) ─────────────────────────────
expected_classification:
- file: "sample_files/invoices/fattura_042.txt"
project_id: "proj-web-redesign"
domains: [tasks, notes, timelines]
- file: "sample_files/invoices/meeting_ecommerce.md"
project_id: "proj-ecommerce"
domains: [tasks, notes, timelines]
# ── Expected extractions (step 2) ────────────────────────────────
expected:
tasks:
- title: "Sviluppo frontend React"
priority: "high"
status: "todo"
- title: "Integrazione API backend"
priority: "medium"
status: "todo"
- title: "Testing cross-browser e fix bug responsive"
status: "todo"
- title: "Preparare wireframe homepage"
priority: "high"
status: "todo"
- title: "Setup progetto Next.js e configurare CI/CD"
priority: "medium"
status: "todo"
- title: "Ricerca plugin Stripe per gestione abbonamenti"
priority: "low"
status: "todo"
notes:
- title: "Meeting Kickoff Progetto E-Commerce"
timelines:
- title: "MVP E-Commerce pronto"
- title: "Meeting di revisione"

View File

@@ -1,43 +1,25 @@
# Journey Fixture: journey-invoice-setup
# Tests that the journey chatbot correctly builds a prompt_template
# for extracting tasks and notes from Italian invoices and meeting notes.
# Used by `python -m eval interactive` for human-in-the-loop testing
# of the journey chatbot's prompt-building conversation.
type: journey
name: journey-invoice-setup
description: >
Test the journey chatbot's ability to explore a directory of Italian
invoices and meeting notes, ask relevant questions, and produce a
well-structured prompt_template for data extraction.
Interactive test for the journey chatbot: explore a directory of
Italian invoices and meeting notes, answer the chatbot's questions,
and verify it produces a well-structured prompt_template for data
extraction.
directory: sample_files/invoices
data_types: [tasks, notes, timelines]
# Simulated user responses (the journey starts with the LLM exploring
# the directory and asking its first question)
user_messages:
- >
I want to extract action items from invoices and meeting notes.
The invoices are in Italian and contain work descriptions with
deadlines. Meeting notes have action items with checkboxes.
- >
Yes, map Italian priority keywords: "URGENTE" and "ALTA PRIORITÀ"
should be high priority, "media priorità" is medium, "bassa priorità"
is low. Items marked with [x] are already completed.
- >
For notes, I want meeting summaries with the full content including
decisions and attendees. For timelines, extract deadlines and
scheduled meeting dates.
- >
That's everything I need. Please generate the template.
data_types: [tasks, notes, timelines, projects]
# Criteria the generated prompt_template must satisfy
# Each is scored 0-1 by an LLM judge
expected_template_criteria:
- "Mentions creating tasks from action items and work descriptions"
- "Includes Italian priority keyword mapping (URGENTE→high, media priorità→medium, bassa priorità→low)"
- "Handles completed items marked with [x] as status done"
- "Mentions creating notes from meeting summaries"
- "Mentions extracting timeline events from deadlines and meeting dates"
- "Mentions creating projects from relevant information"
- "Sets isAiSuggested=1 on all created records"
- "Does NOT include projectId assignment logic"
- "Uses camelCase field names (title, status, priority, dueDate, content)"

View File

@@ -0,0 +1,81 @@
# Fixture: process-invoices (step2)
# Tests _PROCESSING_SYSTEM_PROMPT — data extraction & tool calling.
# The classification step is skipped; prompt variables are injected directly.
name: process-invoices
mode: step2
description: >
Test data extraction from Italian freelance invoices.
Verifies correct record creation via tool calls with the right
fields, priorities, and status values.
directory: sample_files/invoices
data_types: [tasks, notes, timelines]
file_extensions: [txt, md]
# ── Step-2 prompt variables ──────────────────────────────────────
existing_context: |
Existing tasks:
(none)
Existing notes:
(none)
Existing timelines:
(none)
project_context: >
Project: Redesign Sito Web Corporate (id: proj-web-redesign).
Always set projectId to this id on every record you create.
custom_prompt_section: |
User instructions:
Estrai i dati dai file come segue:
- TASK: ogni azione da fare, deliverable, o item con scadenza.
Mappa "URGENTE" o "ALTA PRIORITÀ" → priority: high.
Mappa "media priorità" → priority: medium.
Mappa "bassa priorità" → priority: low.
Se un item è marcato come "completato" o [x], impostalo status: done.
Altrimenti status: todo.
- NOTE: riassunti di meeting, decisioni prese, note tecniche.
Il titolo deve essere descrittivo. Il content deve includere tutti i dettagli.
- TIMELINE: date di scadenza, milestone, meeting futuri.
Imposta sempre isAiSuggested=1.
# ── Seed records (pre-existing DB state) ─────────────────────────
seed_records:
projects:
- id: "proj-web-redesign"
name: "Redesign Sito Web Corporate"
status: "active"
tasks: []
notes: []
timelines: []
# ── Expected extractions ─────────────────────────────────────────
expected:
tasks:
- title: "Sviluppo frontend React"
priority: "high"
status: "todo"
- title: "Integrazione API backend"
priority: "medium"
status: "todo"
- title: "Testing cross-browser e fix bug responsive"
status: "todo"
- title: "Preparare wireframe homepage"
priority: "high"
status: "todo"
- title: "Setup progetto Next.js e configurare CI/CD"
priority: "medium"
status: "todo"
- title: "Ricerca plugin Stripe per gestione abbonamenti"
priority: "low"
status: "todo"
notes:
- title: "Meeting Kickoff Progetto E-Commerce"
timelines:
- title: "MVP E-Commerce pronto"
- title: "Meeting di revisione"

View File

@@ -0,0 +1,471 @@
"""Interactive journey session — human-in-the-loop CLI conversation.
Flow:
1. Show the system prompt used by the journey AI.
2. Start the journey (AI explores files, asks first question).
3. User types responses in the terminal — AI replies.
4. User types `/done` to end the conversation.
5. User writes a comment about the interaction quality.
6. LLM judge scores the conversation + generated template.
7. Results are reported to Langfuse.
Usage::
python -m eval interactive # pick a fixture interactively
python -m eval interactive --fixture=journey-invoice-setup
python -m eval interactive --model=gpt-4o
python -m eval interactive --judge-model=github_copilot/gpt-4o-mini
"""
from __future__ import annotations
import asyncio
import json
import logging
import sys
import time
import uuid
from dataclasses import dataclass, field
from typing import Any
from langchain_core.messages import HumanMessage, SystemMessage
from eval.config import JourneyFixture, discover_journey_fixtures
from eval.mock_executor import MockExecutor
from eval import langfuse_eval
logger = logging.getLogger(__name__)
# ── Special commands ─────────────────────────────────────────────────────
# Slash-commands the user can type at the prompt instead of a chat message.
_CMD_DONE = "/done"
_CMD_QUIT = "/quit"
_CMD_TEMPLATE = "/template"
_CMD_HELP = "/help"
# Help text listing every command above; built from the constants so the
# command names shown to the user cannot drift from the ones defined here.
_HELP_TEXT = f"""\
{_CMD_DONE} — End the conversation and proceed to evaluation
{_CMD_QUIT} — Abort without evaluation
{_CMD_TEMPLATE} — Show the generated template (if any)
{_CMD_HELP} — Show this help"""
# ── Terminal colours (ANSI) ──────────────────────────────────────────────
# ANSI SGR escape sequences used to style terminal output.
# _C_RESET clears all attributes and should close every styled span.
_C_RESET = "\033[0m"
# Text attributes.
_C_BOLD = "\033[1m"
_C_DIM = "\033[2m"
# Foreground colours.
_C_CYAN = "\033[36m"
_C_GREEN = "\033[32m"
_C_YELLOW = "\033[33m"
_C_MAGENTA = "\033[35m"
_C_RED = "\033[31m"
_C_BLUE = "\033[34m"
def _print_header(text: str) -> None:
    """Print *text* as a bold cyan banner framed by two 80-column rules.

    The rule character had been reduced to an empty string (``'' * 80``),
    which printed no rule at all; an ASCII ``=`` is used so the banner is
    visible on any terminal.
    """
    rule = "=" * 80
    print(f"\n{_C_BOLD}{_C_CYAN}{rule}")
    print(f" {text}")
    print(f"{rule}{_C_RESET}\n")
def _print_ai(text: str) -> None:
    """Print an AI reply prefixed with a bold green ``AI:`` label."""
    label = f"{_C_GREEN}{_C_BOLD}AI:{_C_RESET}"
    print(f"\n{label} {text}\n")
def _print_system(text: str) -> None:
    """Print a dimmed line of text."""
    dimmed = f"{_C_DIM}{text}{_C_RESET}"
    print(dimmed)
def _print_score(label: str, score: float) -> None:
    """Print one score line with a coloured PASS / PARTIAL / FAIL tag.

    Bands: ``score >= 0.7`` is PASS (green), ``score >= 0.4`` is PARTIAL
    (yellow), anything else is FAIL (red).
    """
    # Highest threshold first; the first band the score reaches wins.
    bands = (
        (0.7, _C_GREEN, "PASS"),
        (0.4, _C_YELLOW, "PARTIAL"),
    )
    color, tag = _C_RED, "FAIL"
    for threshold, band_color, band_tag in bands:
        if score >= threshold:
            color, tag = band_color, band_tag
            break
    print(f" {color}{tag:>7}{_C_RESET} ({score:.1f}) {label}")
# ── Result type ──────────────────────────────────────────────────────────
@dataclass
class InteractiveResult:
    """Everything recorded about one interactive journey session."""

    # Name of the journey fixture the session ran against.
    fixture_name: str
    # Model used for the journey AI.
    model: str
    # Model used for the LLM judge.
    judge_model: str
    # Template generated during the session, or None if none was produced.
    prompt_template: str | None
    # Transcript; each turn is a dict with "role" and "content" keys.
    conversation: list[dict[str, str]]
    # Free-text comment the user wrote about the interaction.
    user_comment: str
    # Completion flag -- NOTE(review): exact meaning is set by the caller; confirm.
    done: bool
    # Judge score per criterion, keyed by the criterion text.
    criteria_scores: dict[str, float]
    # Judge's overall quality score.
    overall_score: float
    # Judge's explanation of the scores.
    judge_reasoning: str
    # Duration of the session in seconds.
    elapsed_seconds: float

    def summary(self) -> dict[str, Any]:
        """Return a compact dict of the result with rounded scores.

        ``turns`` counts only the conversation entries whose role is
        ``"user"``; scores are rounded to 3 decimals and the elapsed time
        to 1 decimal.
        """
        user_turns = sum(1 for turn in self.conversation if turn["role"] == "user")
        rounded_criteria = {
            name: round(value, 3) for name, value in self.criteria_scores.items()
        }
        return {
            "fixture": self.fixture_name,
            "model": self.model,
            "judge_model": self.judge_model,
            "done": self.done,
            "turns": user_turns,
            "overall_score": round(self.overall_score, 3),
            "user_comment": self.user_comment,
            "criteria_scores": rounded_criteria,
            "elapsed_s": round(self.elapsed_seconds, 1),
        }
# ── LLM judge ────────────────────────────────────────────────────────────
_INTERACTIVE_JUDGE_SYSTEM = """\
You are an evaluation judge for AI-generated prompt templates produced during
an interactive conversation between a human and a journey chatbot.
The chatbot explored a directory and through multi-turn conversation with the
user produced a prompt_template — an instruction set for a data-extraction agent.
You have access to:
- The full conversation transcript
- The generated prompt_template (if any)
- The user's own comment about the interaction
- A list of quality criteria
Score each criterion from 0 to 1:
- 1.0: Fully satisfied
- 0.5: Partially satisfied
- 0.0: Not satisfied
Also provide an overall_quality score (0-1) evaluating the conversation flow,
how well the AI understood the user, and the template quality.
Respond with ONLY a JSON object:
{
"criteria_scores": {"criterion_1": 0.8, ...},
"overall_quality": 0.85,
"reasoning": "Brief explanation covering both conversation quality and template accuracy"
}
"""
def _resolve_criteria_scores(
    scores_raw: dict[str, Any],
    criteria: list[str],
) -> dict[str, float]:
    """Map the judge's raw score dict onto *criteria*, keyed by criterion text.

    For each criterion the judge's keys are tried in this order:
    ``criterion_<n>``, the full criterion text, its first 50 characters,
    and the bare 1-based index.  Only when none of those keys is present
    does the lookup fall back to the n-th value of the dict, and to 0.0 if
    there is no such value.
    """
    positional = list(scores_raw.values())
    resolved: dict[str, float] = {}
    for i, criterion in enumerate(criteria):
        key_candidates = (f"criterion_{i+1}", criterion, criterion[:50], str(i + 1))
        matched_key = next((key for key in key_candidates if key in scores_raw), None)
        if matched_key is not None:
            # A matched key is authoritative even when its score is 0.0;
            # treating 0.0 as "missing" let the positional fallback replace
            # a genuine zero with some other entry's value.
            score = float(scores_raw[matched_key])
        elif i < len(positional):
            score = float(positional[i])
        else:
            score = 0.0
        resolved[criterion] = score
    return resolved


async def _judge_interactive(
    conversation: list[dict[str, str]],
    prompt_template: str | None,
    user_comment: str,
    criteria: list[str],
    *,
    judge_model: str = "gpt-4o-mini",
) -> tuple[dict[str, float], float, str]:
    """Score an interactive session with an LLM judge.

    Parameters
    ----------
    conversation :
        Transcript turns; each dict has ``role`` and ``content``.  Turns
        whose role is ``"user"`` are labelled USER, all others AI.
    prompt_template :
        Template generated during the session, or ``None``.
    user_comment :
        The user's own comment about the interaction.
    criteria :
        Quality criteria the judge scores individually.
    judge_model :
        Model name passed to ``get_llm``.

    Returns
    -------
    tuple
        ``(criteria_scores, overall_quality, reasoning)``.  If the LLM call
        or the parsing of its reply raises, every criterion scores 0.0,
        the overall score is 0.0 and the reasoning carries the error text.
    """
    from shared.llm import get_llm

    llm = get_llm(model=judge_model, temperature=0)

    conv_text = "\n".join(
        f"{'USER' if t['role'] == 'user' else 'AI'}: {t['content']}"
        for t in conversation
    )
    criteria_text = "\n".join(f" {i+1}. {c}" for i, c in enumerate(criteria))
    user_content = (
        f"## Conversation transcript\n```\n{conv_text}\n```\n\n"
        f"## Generated prompt_template\n```\n{prompt_template or '(none — conversation did not complete)'}\n```\n\n"
        f"## User's comment\n{user_comment}\n\n"
        f"## Criteria to evaluate\n{criteria_text}"
    )

    try:
        response = await llm.ainvoke([
            SystemMessage(content=_INTERACTIVE_JUDGE_SYSTEM),
            HumanMessage(content=user_content),
        ])
        raw = response.content.strip()
        # The judge is told to answer with bare JSON, but unwrap a Markdown
        # code fence (optionally tagged "json") if it added one anyway.
        if raw.startswith("```"):
            raw = raw.split("```")[1]
            if raw.startswith("json"):
                raw = raw[4:]
        parsed = json.loads(raw.strip())

        # Accept "scores" as an alternative to the requested "criteria_scores".
        scores_raw = parsed.get("criteria_scores", parsed.get("scores", {}))
        criteria_scores = _resolve_criteria_scores(scores_raw, criteria)
        overall = float(parsed.get("overall_quality", 0.0))
        reasoning = str(parsed.get("reasoning", ""))
        return criteria_scores, overall, reasoning
    except Exception as exc:
        # Best-effort boundary: a failed judgement must not abort the
        # session, so report zero scores together with the error text.
        logger.warning("interactive judge failed: %s", exc)
        return {c: 0.0 for c in criteria}, 0.0, f"Judge error: {exc}"
# ── Interactive session ──────────────────────────────────────────────────
def _read_multiline_comment() -> str:
    """Read lines from stdin until two consecutive blank lines, EOF or Ctrl-C.

    Returns the joined, stripped text, or ``"(no comment)"`` when it is empty.
    """
    lines: list[str] = []
    try:
        while True:
            line = input()
            if line == "" and lines and lines[-1] == "":
                lines.pop()  # drop the first of the two terminating blank lines
                break
            lines.append(line)
    except (EOFError, KeyboardInterrupt):
        pass
    return "\n".join(lines).strip() or "(no comment)"


async def run_interactive(
    fixture: JourneyFixture,
    *,
    model: str = "gpt-4o",
    judge_model: str = "gpt-4o-mini",
    data_dir: Path | None = None,
) -> InteractiveResult:
    """Run an interactive journey session in the terminal.

    The human types the user turns on stdin; the journey handlers produce the
    AI turns against a ``MockExecutor``.  When the conversation ends, the user
    is asked for a comment, an LLM judge scores the session, and the result is
    printed and reported through ``langfuse_eval``.

    Parameters
    ----------
    fixture :
        Journey fixture supplying the name, data types, directory and the
        criteria the judge scores against.
    model :
        Model name written to ``settings.LLM_MODEL`` for the duration of the
        conversation; the previous value is always restored.
    judge_model :
        Model name used for the judge call.
    data_dir :
        If set, overrides the fixture's sample-file directory.  The LLM
        will explore this folder instead of the default
        ``fixtures/sample_files/…``.  Useful for private test data that
        shouldn't be committed to git.

    Returns
    -------
    The scored ``InteractiveResult``.  If the user quits, an "(aborted)"
    result is returned immediately, without a comment prompt, judging or
    reporting.
    """
    from shared.config import settings
    from shared.ws_context import set_current_user, clear_current_user
    from app.journey import (
        handle_journey_start,
        handle_journey_message,
        _build_system_prompt,
    )

    # When --data-dir is given, the MockExecutor's root becomes
    # data_dir's parent and the journey directory is data_dir's name.
    # This way the LLM sees a meaningful directory name (not ".") and
    # MockExecutor resolves paths correctly.
    # Otherwise, use the fixture's YAML parent and its relative path.
    if data_dir:
        mock_root = data_dir.parent
        journey_directory = data_dir.name
    else:
        mock_root = fixture.fixture_path.parent
        journey_directory = fixture.directory

    mock = MockExecutor(
        fixture_dir=mock_root,
        seed_records={},
    )

    eval_user_id = f"interactive-{uuid.uuid4().hex[:8]}"
    conversation: list[dict[str, str]] = []
    prompt_template: str | None = None
    done = False

    original_model = settings.LLM_MODEL
    settings.LLM_MODEL = model
    # The outer try/finally guarantees the global model override is undone on
    # every exit path, including a failure while building the banner and the
    # early return taken when the user quits.
    try:
        # ── Show system prompt ───────────────────────────────────────
        system_prompt = _build_system_prompt(journey_directory, fixture.data_types)
        _print_header("SYSTEM PROMPT")
        print(f"{_C_DIM}{system_prompt}{_C_RESET}")

        _print_header(f"INTERACTIVE JOURNEY | fixture: {fixture.name} | model: {model}")
        print(f" Data dir: {mock_root}")
        print(f" Type your responses. Commands: {_CMD_DONE}, {_CMD_QUIT}, {_CMD_TEMPLATE}, {_CMD_HELP}")
        print(f" Judge model: {judge_model}")
        print(f" Criteria: {len(fixture.expected_template_criteria)}")
        print()

        start_time = time.time()
        try:
            set_current_user(eval_user_id)
            with mock.patch():
                # ── Start ────────────────────────────────────────────
                _print_system("Starting journey... (AI is exploring your files)")
                start_frame: dict[str, Any] = {
                    "agent_type": "local",
                    "directory": journey_directory,
                    "data_types": fixture.data_types,
                    "session_id": f"interactive-{uuid.uuid4().hex[:8]}",
                }
                reply = await handle_journey_start(eval_user_id, start_frame)
                session_id = reply["session_id"]
                conversation.append({"role": "assistant", "content": reply["message"]})
                _print_ai(reply["message"])

                if reply["done"]:
                    prompt_template = reply.get("prompt_template")
                    done = True
                    _print_system("Journey completed on first reply (template generated).")

                # ── Conversation loop ────────────────────────────────
                while not done:
                    try:
                        user_input = input(f"{_C_BOLD}{_C_BLUE}YOU:{_C_RESET} ").strip()
                    except (EOFError, KeyboardInterrupt):
                        # Closed stdin or Ctrl-C is treated as the quit command.
                        print()
                        user_input = _CMD_QUIT

                    if not user_input:
                        continue

                    # Handle commands
                    command = user_input.lower()
                    if command == _CMD_QUIT:
                        _print_system("Aborted — no evaluation will be performed.")
                        # Cleanup is handled by the enclosing ``finally`` blocks.
                        return InteractiveResult(
                            fixture_name=fixture.name, model=model, judge_model=judge_model,
                            prompt_template=None, conversation=conversation,
                            user_comment="(aborted)", done=False,
                            criteria_scores={}, overall_score=0.0,
                            judge_reasoning="Session aborted by user.",
                            elapsed_seconds=time.time() - start_time,
                        )
                    if command == _CMD_HELP:
                        print(_HELP_TEXT)
                        continue
                    if command == _CMD_TEMPLATE:
                        if prompt_template:
                            print(f"\n{_C_MAGENTA}{prompt_template}{_C_RESET}\n")
                        else:
                            _print_system("No template generated yet.")
                        continue
                    if command == _CMD_DONE:
                        _print_system("Ending conversation...")
                        break

                    # ── Send message to AI ───────────────────────────
                    conversation.append({"role": "user", "content": user_input})
                    _print_system("AI is thinking...")
                    msg_frame: dict[str, Any] = {
                        "session_id": session_id,
                        "message": user_input,
                    }
                    reply = await handle_journey_message(eval_user_id, msg_frame)
                    conversation.append({"role": "assistant", "content": reply["message"]})
                    _print_ai(reply["message"])

                    if reply["done"]:
                        prompt_template = reply.get("prompt_template")
                        done = True
                        _print_system("Journey completed — template generated!")
        except Exception as exc:
            # A failed journey is still judged on whatever transcript exists.
            logger.error("interactive journey failed: %s", exc)
            _print_system(f"Error: {exc}")
        finally:
            clear_current_user()
    finally:
        settings.LLM_MODEL = original_model

    elapsed = time.time() - start_time
    turns = len([c for c in conversation if c["role"] == "user"])

    # ── Show template if generated ───────────────────────────────
    if prompt_template:
        _print_header("GENERATED TEMPLATE")
        print(f"{_C_MAGENTA}{prompt_template}{_C_RESET}\n")
    else:
        _print_system("No template was generated during this session.")

    # ── User comment ─────────────────────────────────────────────
    _print_header("YOUR EVALUATION")
    print(" Write your comment about this interaction (press Enter twice to finish):")
    print()
    user_comment = _read_multiline_comment()

    # ── Judge ────────────────────────────────────────────────────
    _print_header("LLM JUDGE EVALUATION")
    _print_system(f"Scoring with {judge_model}...")
    criteria_scores, overall_quality, judge_reasoning = await _judge_interactive(
        conversation=conversation,
        prompt_template=prompt_template,
        user_comment=user_comment,
        criteria=fixture.expected_template_criteria,
        judge_model=judge_model,
    )

    # ── Display scores ───────────────────────────────────────────
    print()
    for criterion, score in criteria_scores.items():
        _print_score(criterion, score)

    # ``overall`` is the plain mean of the per-criterion scores, distinct from
    # the judge's own ``overall_quality`` figure.
    overall = (
        sum(criteria_scores.values()) / len(criteria_scores)
        if criteria_scores
        else 0.0
    )
    print(f"\n {_C_BOLD}Criteria avg: {overall:.2f}{_C_RESET}")
    print(f" {_C_BOLD}Overall quality: {overall_quality:.2f}{_C_RESET}")
    print(f" {_C_BOLD}Turns: {turns}{_C_RESET}")
    print(f" {_C_BOLD}Time: {elapsed:.1f}s{_C_RESET}")
    print(f"\n {_C_DIM}Judge: {judge_reasoning}{_C_RESET}")
    print(f" {_C_DIM}Your comment: {user_comment}{_C_RESET}\n")

    result = InteractiveResult(
        fixture_name=fixture.name,
        model=model,
        judge_model=judge_model,
        prompt_template=prompt_template,
        conversation=conversation,
        user_comment=user_comment,
        done=done,
        criteria_scores=criteria_scores,
        overall_score=overall_quality,
        judge_reasoning=judge_reasoning,
        elapsed_seconds=elapsed,
    )

    # ── Report to Langfuse ───────────────────────────────────────
    trace_id = langfuse_eval.log_eval_trace(
        fixture_name=fixture.name,
        model=model,
        prompt_variant="interactive",
        prompt_template=prompt_template or "(not generated)",
        actual_mutations=[{
            "conversation": conversation[:30],
            "user_comment": user_comment,
        }],
        scores_summary=result.summary(),
        langfuse_prompt_names=["journey_system"],
    )
    if trace_id:
        from eval.scorer import EvalScores

        # Interactive sessions have no record-level ground truth, so the
        # criteria average stands in for precision/F1 and completion for recall.
        scores_obj = EvalScores(
            fixture_name=fixture.name,
            model=model,
            prompt_variant="interactive",
            precision=overall,
            recall=float(done),
            f1=overall,
            llm_judge_score=overall_quality,
            llm_judge_reasoning=judge_reasoning,
        )
        langfuse_eval.post_eval_scores(scores_obj, trace_id=trace_id)
        _print_system(f"Results reported to Langfuse (trace: {trace_id})")
    else:
        _print_system("Langfuse not configured — results not reported.")

    return result

View File

@@ -94,7 +94,7 @@ async def _judge_template(
Returns (criteria_scores, reasoning).
"""
from app.llm import get_llm
from shared.llm import get_llm
llm = get_llm(model=judge_model, temperature=0)
@@ -152,13 +152,23 @@ async def run_single_journey_eval(
model: str,
*,
judge_model: str = "gpt-4o-mini",
data_dir: Path | None = None,
) -> JourneyEvalResult:
"""Execute one journey eval: start → messages → score template."""
"""Execute one journey eval: start \u2192 messages \u2192 score template."""
from shared.config import settings
# Build mock executor for filesystem tools
# When data_dir is given, use its parent as MockExecutor root
# and its name as the journey directory so the LLM sees a
# meaningful path (not ".").
if data_dir:
mock_root = data_dir.parent
journey_directory = data_dir.name
else:
mock_root = fixture.fixture_path.parent
journey_directory = fixture.directory
mock = MockExecutor(
fixture_dir=fixture.fixture_dir,
fixture_dir=mock_root,
seed_records={},
)
@@ -178,7 +188,7 @@ async def run_single_journey_eval(
done = False
try:
from app.ws_context import set_current_user, clear_current_user
from shared.ws_context import set_current_user, clear_current_user
from app.journey import handle_journey_start, handle_journey_message, _sessions
set_current_user(eval_user_id)
@@ -186,7 +196,7 @@ async def run_single_journey_eval(
# ── Start the journey ────────────────────────────────
start_frame: dict[str, Any] = {
"agent_type": "local",
"directory": fixture.directory,
"directory": journey_directory,
"data_types": fixture.data_types,
"session_id": f"eval-{uuid.uuid4().hex[:8]}",
}
@@ -246,7 +256,7 @@ async def run_single_journey_eval(
logger.error("journey_eval: pipeline failed for %s/%s: %s", fixture.name, model, exc)
finally:
settings.LLM_MODEL = original_model
from app.ws_context import clear_current_user
from shared.ws_context import clear_current_user
clear_current_user()
elapsed = time.time() - start_time
@@ -297,6 +307,7 @@ async def run_single_journey_eval(
prompt_template=prompt_template or "(not generated)",
actual_mutations=[{"conversation": conversation[:20]}],
scores_summary=result.summary(),
langfuse_prompt_names=["journey_system"],
)
if trace_id:
@@ -321,6 +332,7 @@ async def run_journey_fixture_eval(
models: list[str],
*,
judge_model: str = "gpt-4o-mini",
data_dir: Path | None = None,
) -> list[JourneyEvalResult]:
"""Run all models for a journey fixture."""
langfuse_eval.sync_journey_fixture_to_dataset(fixture)
@@ -329,6 +341,7 @@ async def run_journey_fixture_eval(
for model in models:
result = await run_single_journey_eval(
fixture, model, judge_model=judge_model,
data_dir=data_dir,
)
results.append(result)

View File

@@ -1,21 +1,21 @@
"""Langfuse evaluation integration — datasets, runs, and scoring.
Uses the Langfuse Python SDK to:
Uses the Langfuse Python SDK v4 (OpenTelemetry-based) to:
1. **Sync fixtures → Langfuse datasets**: Each YAML fixture becomes a dataset,
each prompt variant + expected pair becomes a dataset item.
2. **Track eval runs**: Each (fixture × model × prompt_variant) execution
is recorded as a dataset run with linked traces and scores.
is recorded as a trace with linked scores.
3. **Post scores**: precision, recall, F1, field_accuracy, llm_judge are
posted as numeric scores on the trace/run.
posted as numeric scores on the trace.
"""
from __future__ import annotations
import json
import logging
import os
from typing import Any
from shared.config import settings
@@ -26,16 +26,16 @@ logger = logging.getLogger(__name__)
def _get_langfuse():
"""Get or create a Langfuse client instance."""
"""Get or create a Langfuse client instance (SDK v4)."""
if not settings.LANGFUSE_SECRET_KEY or not settings.LANGFUSE_PUBLIC_KEY:
return None
try:
from langfuse import Langfuse
return Langfuse(
secret_key=settings.LANGFUSE_SECRET_KEY,
public_key=settings.LANGFUSE_PUBLIC_KEY,
host=settings.LANGFUSE_HOST,
)
os.environ.setdefault("LANGFUSE_SECRET_KEY", settings.LANGFUSE_SECRET_KEY)
os.environ.setdefault("LANGFUSE_PUBLIC_KEY", settings.LANGFUSE_PUBLIC_KEY)
if settings.LANGFUSE_HOST:
os.environ.setdefault("LANGFUSE_HOST", settings.LANGFUSE_HOST)
from langfuse import get_client
return get_client()
except Exception as exc:
logger.warning("langfuse_eval: failed to create client: %s", exc)
return None
@@ -61,35 +61,44 @@ def sync_fixture_to_dataset(fixture: EvalFixture) -> str | None:
lf.create_dataset(
name=dataset_name,
description=fixture.description,
metadata={"data_types": fixture.data_types, "file_extensions": fixture.file_extensions},
metadata={
"data_types": ",".join(fixture.data_types),
"file_extensions": ",".join(fixture.file_extensions) if fixture.file_extensions else "",
},
)
except Exception:
# Dataset may already exist — that's fine
pass
expected_output = {}
for rec in fixture.expected:
expected_output.setdefault(rec.table, []).append(rec.fields)
# Build expected_output appropriate to the fixture's mode
expected_output: dict[str, Any] = {}
if fixture.mode in ("step1", "full") and fixture.expected_classification:
expected_output["classifications"] = [
{"file": ec.file, "project_id": ec.project_id, "domains": ec.domains}
for ec in fixture.expected_classification
]
if fixture.mode in ("step2", "full") and fixture.expected:
for rec in fixture.expected:
expected_output.setdefault(rec.table, []).append(rec.fields)
for variant_name, prompt_template in fixture.prompt_variants.items():
item_id = f"{fixture.name}--{variant_name}"
try:
lf.create_dataset_item(
dataset_name=dataset_name,
id=item_id,
input={
"directory": fixture.directory,
"data_types": fixture.data_types,
"prompt_template": prompt_template,
"seed_records": fixture.seed_records,
},
expected_output=expected_output,
metadata={"prompt_variant": variant_name},
)
except Exception as exc:
logger.warning(
"langfuse_eval: failed to upsert dataset item %s: %s", item_id, exc
)
item_id = f"{fixture.name}--{fixture.mode}"
try:
lf.create_dataset_item(
dataset_name=dataset_name,
id=item_id,
input={
"directory": fixture.directory,
"data_types": fixture.data_types,
"mode": fixture.mode,
"seed_records": fixture.seed_records,
},
expected_output=expected_output,
metadata={"mode": fixture.mode},
)
except Exception as exc:
logger.warning(
"langfuse_eval: failed to upsert dataset item %s: %s", item_id, exc
)
lf.flush()
logger.info("langfuse_eval: synced fixture '%s' → dataset '%s'", fixture.name, dataset_name)
@@ -114,7 +123,7 @@ def sync_journey_fixture_to_dataset(fixture) -> str | None:
lf.create_dataset(
name=dataset_name,
description=fixture.description,
metadata={"type": "journey", "data_types": fixture.data_types},
metadata={"type": "journey", "data_types": ",".join(fixture.data_types)},
)
except Exception:
pass # Dataset may already exist
@@ -148,18 +157,26 @@ def create_eval_run(
*,
metadata: dict[str, Any] | None = None,
) -> str:
"""Create a dataset run in Langfuse. Returns the run name."""
"""Create a dataset run in Langfuse. Returns the run name.
Note: In SDK v4, dataset runs are created implicitly via
dataset.run_experiment(). This function is kept for backwards
compatibility but may not create a run.
"""
lf = _get_langfuse()
if lf is None:
return run_name
try:
lf.create_dataset_run(
dataset_name=dataset_name,
run_name=run_name,
metadata=metadata or {},
)
lf.flush()
if hasattr(lf, "create_dataset_run"):
lf.create_dataset_run(
dataset_name=dataset_name,
run_name=run_name,
metadata=metadata or {},
)
lf.flush()
else:
logger.debug("langfuse_eval: create_dataset_run not available in SDK v4")
except Exception as exc:
logger.warning("langfuse_eval: failed to create run %s: %s", run_name, exc)
@@ -185,21 +202,22 @@ def post_eval_scores(
("precision", scores.precision),
("recall", scores.recall),
("f1", scores.f1),
("field_accuracy", scores.field_accuracy),
]
# Only post field_accuracy when there are field-level scores (step2/full)
if scores.field_scores:
score_data.append(("field_accuracy", scores.field_accuracy))
if scores.llm_judge_score is not None:
score_data.append(("llm_judge", scores.llm_judge_score))
for name, value in score_data:
try:
kwargs: dict[str, Any] = {
"name": name,
"value": value,
"comment": f"{scores.fixture_name} | {scores.model} | {scores.prompt_variant}",
}
if trace_id:
kwargs["trace_id"] = trace_id
lf.score(**kwargs)
lf.create_score(
name=name,
value=value,
trace_id=trace_id,
data_type="NUMERIC",
comment=f"{scores.fixture_name} | {scores.model} | {scores.prompt_variant}",
)
except Exception as exc:
logger.warning("langfuse_eval: failed to post score %s: %s", name, exc)
@@ -218,12 +236,20 @@ def log_eval_trace(
prompt_template: str,
actual_mutations: list[dict],
scores_summary: dict[str, Any],
step1_results: list[dict] | None = None,
dataset_name: str | None = None,
run_name: str | None = None,
dataset_item_id: str | None = None,
langfuse_prompt_names: list[str] | None = None,
) -> str | None:
"""Create a Langfuse trace for one eval execution and link it to a dataset run.
Uses SDK v4 observation API (traces are created implicitly by root spans).
``langfuse_prompt_names`` can contain one or two prompt names to link
(e.g. ``["batch_file_classifier", "batch_processing"]`` for full mode).
Each prompt gets its own generation-type observation for per-version
metrics tracking.
Returns the trace_id, or None if Langfuse is unavailable.
"""
lf = _get_langfuse()
@@ -231,38 +257,71 @@ def log_eval_trace(
return None
try:
trace = lf.trace(
name=f"eval-{fixture_name}",
input={
"prompt_template": prompt_template,
"model": model,
"prompt_variant": prompt_variant,
},
output={
"mutations": actual_mutations[:50],
"scores": scores_summary,
},
from langfuse import propagate_attributes
# Fetch prompt objects for linking
prompt_objs: list[tuple[str, Any]] = []
for pname in (langfuse_prompt_names or []):
try:
obj = lf.get_prompt(name=pname, cache_ttl_seconds=300)
prompt_objs.append((pname, obj))
logger.info("langfuse_eval: linked prompt '%s' (type=%s)", pname, type(obj).__name__)
except Exception as exc:
logger.warning("langfuse_eval: prompt '%s' not found — %s", pname, exc)
# Build trace output dict
trace_output: dict[str, Any] = {"scores": scores_summary}
if step1_results:
trace_output["classifications"] = step1_results
if actual_mutations:
trace_output["mutations"] = actual_mutations[:50]
with propagate_attributes(
trace_name=f"eval-{fixture_name}",
metadata={
"eval": True,
"eval": "true",
"fixture": fixture_name,
"model": model,
"prompt_variant": prompt_variant,
},
tags=["eval", f"model:{model}", f"variant:{prompt_variant}"],
)
):
# Root span for the eval run
span = lf.start_observation(name=f"eval-{fixture_name}")
span.update(
input={
"prompt_template": prompt_template,
"model": model,
"prompt_variant": prompt_variant,
},
output=trace_output,
)
trace_id = span.trace_id
# Link to dataset run if available
if dataset_name and run_name and dataset_item_id:
try:
dataset = lf.get_dataset(dataset_name)
item = dataset.get_item(dataset_item_id)
if item:
item.link(trace, run_name)
except Exception as exc:
logger.warning("langfuse_eval: failed to link trace to dataset run: %s", exc)
# Create a generation-type observation per linked prompt
for pname, pobj in prompt_objs:
gen = lf.start_observation(
name=f"prompt-{pname}",
prompt=pobj,
as_type="generation",
)
gen.end()
# Link to dataset run if available
if dataset_name and run_name and dataset_item_id:
try:
dataset = lf.get_dataset(dataset_name)
for item in dataset.items:
if item.id == dataset_item_id:
item.link(span, run_name)
break
except Exception as exc:
logger.warning("langfuse_eval: failed to link trace to dataset run: %s", exc)
span.end()
lf.flush()
return trace.id
return trace_id
except Exception as exc:
logger.warning("langfuse_eval: failed to create eval trace: %s", exc)
return None

View File

@@ -1,6 +1,6 @@
"""Mock executor — intercepts execute_on_client for offline E2E testing.
Patches ``app.ws_context.execute_on_client`` so agent pipeline runs don't
Patches ``execute_on_client`` at all usage sites so agent pipeline runs don't
require a live Electron client or Redis. Instead:
- **Filesystem actions** (list_directory, read_file_content, get_file_metadata)
@@ -20,6 +20,7 @@ import uuid
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any
from contextlib import contextmanager, asynccontextmanager
from unittest.mock import AsyncMock, patch
@@ -33,6 +34,30 @@ class Mutation:
timestamp: float = field(default_factory=time.time)
# ── Fake DB helpers (used to bypass async_session in full mode) ───────
class _FakeRow:
    """Mimics an AgentRunLog row returned by SQLAlchemy.

    Scalar fields keep class-level defaults; assigning to one on an instance
    shadows the default for that instance only (plain attribute assignment,
    so no custom ``__setattr__`` is needed).
    """

    id = 0
    status = "running"
    items_processed = 0
    items_created = 0
    completed_at = None

    def __init__(self) -> None:
        # Per-instance list: a class-level ``[]`` would be shared by every
        # fake row, so errors appended during one run would leak into the next.
        self.errors: list[str] = []
class _FakeResult:
    """Minimal substitute for a SQLAlchemy ``Result``; only ``scalar_one_or_none`` is provided."""

    def __init__(self, row: _FakeRow) -> None:
        # The one row this fake result will hand back.
        self._row = row

    def scalar_one_or_none(self) -> _FakeRow:
        """Return the wrapped row unchanged."""
        wrapped = self._row
        return wrapped
@dataclass
class MockExecutor:
"""In-memory executor that replaces Redis-based tool round-trip.
@@ -77,12 +102,37 @@ class MockExecutor:
# ── Context manager for patching ──────────────────────────────
@contextmanager
def patch(self):
"""Return an async context-manager that patches execute_on_client."""
return patch(
"app.ws_context.execute_on_client",
new=AsyncMock(side_effect=self._handle),
)
"""Patch execute_on_client and DB session at all usage sites."""
mock_fn = AsyncMock(side_effect=self._handle)
targets = [
"shared.ws_context.execute_on_client",
"app.agent_runner.execute_on_client",
"app.agents.filesystem_agent.execute_on_client",
]
# Mock async_session so run_local_agent / _finalize_run skip real DB
fake_row = _FakeRow()
fake_db = AsyncMock()
fake_db.commit = AsyncMock()
fake_db.refresh = AsyncMock()
fake_db.execute = AsyncMock(return_value=_FakeResult(fake_row))
fake_db.add = lambda obj: None # noqa: ARG005
@asynccontextmanager
async def _fake_session():
yield fake_db
patches = [patch(t, new=mock_fn) for t in targets]
patches.append(patch("app.agent_runner.async_session", _fake_session))
for p in patches:
p.start()
try:
yield mock_fn
finally:
for p in patches:
p.stop()
# ── Internal dispatch ─────────────────────────────────────────

View File

@@ -1,28 +1,31 @@
"""Eval runner — orchestrates fixture → mock → agent pipeline → scoring.
For each (fixture × model × prompt_variant) combination:
1. Build a MockExecutor with fixture data
2. Patch execute_on_client
3. Override LLM_MODEL in shared settings
4. Run the batch agent pipeline (run_local_agent)
5. Collect mutations from the mock
6. Score against expected results (field match + optional LLM judge)
7. Report scores to Langfuse
8. Print results
Supports three eval modes:
- **step1**: Test classification prompt only (``_STEP1_SYSTEM_PROMPT``).
Calls the LLM with fixture-provided ``domain_definitions`` and
``projects_list`` and compares output against ``expected_classification``.
- **step2**: Test processing prompt only (``_PROCESSING_SYSTEM_PROMPT``).
Compiles the prompt with fixture-provided ``existing_context``,
``project_context``, ``data_types``, and ``custom_prompt_section``,
then runs the tool-calling loop. Mutations are scored against
``expected`` records.
- **full**: Run ``run_local_agent()`` end-to-end (both steps).
Scored on both classification and extraction.
"""
from __future__ import annotations
import asyncio
import copy
import json
import logging
import time
import uuid
from pathlib import Path
from typing import Any
from eval.config import EvalFixture, ExpectedRecord
from eval.config import EvalFixture, ExpectedClassification
from eval.mock_executor import MockExecutor
from eval.scorer import (
EvalScores,
@@ -36,72 +39,193 @@ from eval import langfuse_eval
logger = logging.getLogger(__name__)
async def run_single_eval(
# ── Step 1 runner ─────────────────────────────────────────────────────────
async def _run_step1(
fixture: EvalFixture,
model: str,
prompt_variant: str,
*,
use_llm_judge: bool = True,
judge_model: str = "gpt-4o-mini",
) -> EvalScores:
"""Execute one (fixture × model × prompt_variant) eval and return scores."""
from shared.config import settings
mock: MockExecutor,
) -> list[dict[str, Any]]:
"""Run step-1 classification for each expected file.
prompt_template = fixture.prompt_variants.get(prompt_variant, "")
Returns a list of result dicts:
``[{file, project_id, domains, new_project_name}, ...]``
"""
from app.agent_runner import _classify_file
# Build mock executor
seed = copy.deepcopy(fixture.seed_records)
mock = MockExecutor(
fixture_dir=fixture.fixture_dir,
seed_records=seed,
results: list[dict[str, Any]] = []
for ec in fixture.expected_classification:
# Read the file content through the mock
file_result = await mock._handle(
action="read_file_content",
data={"path": ec.file},
)
file_content: str = file_result.get("content", "")
project_id, domains, new_name = await _classify_file(
file_path=ec.file,
file_content=file_content,
projects=fixture.projects_list,
config_data_types=fixture.data_types,
)
results.append({
"file": ec.file,
"project_id": project_id,
"domains": domains,
"new_project_name": new_name,
})
return results
def _score_step1(
    fixture: EvalFixture,
    results: list[dict[str, Any]],
) -> tuple[float, float, float, str]:
    """Score step-1 classification results against the fixture's expectations.

    A file counts as matched when its ``project_id`` equals the expected one
    and, if the expectation lists any domains, the actual domain set equals
    the expected set (order-insensitive).  An expectation with no domains
    skips the domain check.

    Parameters
    ----------
    fixture :
        Provides ``expected_classification``; each entry exposes ``file``,
        ``project_id`` and ``domains``.
    results :
        Result dicts with ``file``, ``project_id`` and ``domains`` keys.  When
        a file appears more than once, the first entry is used.

    Returns
    -------
    ``(precision, recall, f1, reasoning)``.  All three metrics are the same
    matched/expected ratio; ``reasoning`` has one line per expected file.
    """
    expectations = fixture.expected_classification
    if not expectations:
        return 0.0, 0.0, 0.0, "No expected classifications"

    # Index results once instead of scanning the list for every expected
    # file; ``setdefault`` keeps first-match semantics for duplicate entries.
    by_file: dict[Any, dict[str, Any]] = {}
    for entry in results:
        by_file.setdefault(entry["file"], entry)

    matched = 0
    details: list[str] = []

    for ec in expectations:
        actual = by_file.get(ec.file)
        if actual is None:
            details.append(f" MISS {ec.file}: not processed")
            continue

        pid_ok = actual["project_id"] == ec.project_id
        # ``or ()`` lets a result with ``domains=None`` count as a mismatch
        # instead of raising TypeError from ``set(None)``.
        domains_ok = set(actual["domains"] or ()) == set(ec.domains) if ec.domains else True

        if pid_ok and domains_ok:
            matched += 1
            details.append(f" OK {ec.file}: project={actual['project_id']}, domains={actual['domains']}")
        else:
            parts: list[str] = []
            if not pid_ok:
                parts.append(f"project expected={ec.project_id} got={actual['project_id']}")
            if not domains_ok:
                parts.append(f"domains expected={ec.domains} got={actual['domains']}")
            details.append(f" FAIL {ec.file}: {'; '.join(parts)}")

    # ``expectations`` is non-empty here, so the division is always defined.
    precision = matched / len(expectations)
    recall = precision  # in step1, precision == recall (same denominator)
    f1 = precision  # same
    reasoning = "\n".join(details)
    return precision, recall, f1, reasoning
# ── Step 2 runner ─────────────────────────────────────────────────────────
async def _run_step2(
fixture: EvalFixture,
model: str,
mock: MockExecutor,
) -> None:
"""Run step-2 processing for each file in the fixture directory.
Compiles ``_PROCESSING_SYSTEM_PROMPT`` with fixture-provided variables
and runs the tool-calling loop. Mutations are captured by the mock.
"""
from app.agent_runner import (
_PROCESSING_SYSTEM_PROMPT,
_build_processing_tools,
_run_agent_with_tools,
_MAX_PROCESSING_STEPS,
)
from app import tracing
# Compile the processing prompt with fixture variables
system_prompt = tracing.compile_prompt(
"batch_processing",
fallback=_PROCESSING_SYSTEM_PROMPT,
variables={
"existing_context": fixture.existing_context,
"project_context": fixture.project_context,
"data_types": ", ".join(fixture.data_types),
"custom_prompt_section": fixture.custom_prompt_section,
},
)
# Override the LLM model for this run
original_model = settings.LLM_MODEL
settings.LLM_MODEL = model
tools = _build_processing_tools(fixture.data_types)
# Scan files in the fixture directory
file_entries = await mock._handle(
action="list_directory",
data={"path": fixture.directory},
)
for entry in file_entries.get("entries", []):
if entry.get("type") != "file":
continue
# Filter by extension if specified
if fixture.file_extensions:
ext = entry["name"].rsplit(".", 1)[-1] if "." in entry["name"] else ""
if ext not in fixture.file_extensions:
continue
file_result = await mock._handle(
action="read_file_content",
data={"path": entry["path"]},
)
file_content: str = file_result.get("content", "")
if not file_content.strip():
continue
await _run_agent_with_tools(
system_prompt=system_prompt,
user_message=(
f"Process this file and extract relevant information.\n\n"
f"File: {entry['path']}\n\nContent:\n{file_content}"
),
tools=tools,
max_steps=_MAX_PROCESSING_STEPS,
)
# ── Full runner ───────────────────────────────────────────────────────────
async def _run_full(
fixture: EvalFixture,
model: str,
mock: MockExecutor,
user_id: str,
) -> None:
"""Run the full two-step pipeline via ``run_local_agent``."""
from app.agent_runner import run_local_agent
# Build trigger data (same shape as what redis_consumer delivers)
trigger_data: dict[str, Any] = {
"type": "agent_trigger",
"directory": fixture.directory,
"directory_paths": [fixture.directory],
"data_types": fixture.data_types,
"file_extensions": fixture.file_extensions,
"prompt_template": prompt_template,
"prompt_template": fixture.custom_prompt_section,
"device_id": "eval-harness",
"run_context": {
"agent_id": f"eval-{fixture.name}-{prompt_variant}",
"run_id": None, # skip DB logging during eval
"agent_id": f"eval-{fixture.name}",
"run_id": None,
},
}
eval_user_id = f"eval-{uuid.uuid4().hex[:8]}"
with mock.patch():
await run_local_agent(user_id, trigger_data)
logger.info(
"eval: starting %s | model=%s | variant=%s",
fixture.name, model, prompt_variant,
)
start_time = time.time()
try:
# Patch execute_on_client + set user context, then run the pipeline
from app.ws_context import set_current_user, clear_current_user
from app.agent_runner import run_local_agent
# ── Scoring helpers ───────────────────────────────────────────────────────
set_current_user(eval_user_id)
with mock.patch():
await run_local_agent(eval_user_id, trigger_data)
except Exception as exc:
logger.error("eval: pipeline failed for %s/%s/%s: %s", fixture.name, model, prompt_variant, exc)
finally:
settings.LLM_MODEL = original_model
from app.ws_context import clear_current_user
clear_current_user()
elapsed = time.time() - start_time
logger.info("eval: pipeline completed in %.1fs — %d mutations", elapsed, len(mock.mutations))
def _score_mutations(
fixture: EvalFixture,
mock: MockExecutor,
) -> tuple[list[FieldScore], float, float, float, int, int]:
"""Score mutations against expected records.
# ── Score results ────────────────────────────────────────────
Returns (field_scores, precision, recall, f1, extra, missing).
"""
all_field_scores: list[FieldScore] = []
total_expected = 0
total_actual = 0
@@ -109,12 +233,10 @@ async def run_single_eval(
total_extra = 0
total_missing = 0
# Group expected by table
expected_by_table: dict[str, list[dict]] = {}
for rec in fixture.expected:
expected_by_table.setdefault(rec.table, []).append(rec.fields)
# Compare against actual mutations (inserts + updates)
tables = set(expected_by_table.keys()) | {m.table for m in mock.mutations}
for table in tables:
expected_records = expected_by_table.get(table, [])
@@ -131,49 +253,160 @@ async def run_single_eval(
total_missing += missing
precision, recall, f1 = compute_precision_recall(total_expected, total_actual, total_matched)
return all_field_scores, precision, recall, f1, total_extra, total_missing
scores = EvalScores(
fixture_name=fixture.name,
model=model,
prompt_variant=prompt_variant,
field_scores=all_field_scores,
precision=precision,
recall=recall,
f1=f1,
extra_records=total_extra,
missing_records=total_missing,
# ── Main entry point ──────────────────────────────────────────────────────
async def run_single_eval(
fixture: EvalFixture,
model: str,
*,
use_llm_judge: bool = True,
judge_model: str = "gpt-4o-mini",
) -> EvalScores:
"""Execute one eval run for a fixture + model. Mode is read from the fixture."""
# Overall flow: build a MockExecutor from the fixture, run the step(s) selected by
# fixture.mode with settings.LLM_MODEL temporarily set to `model`, score the outcome,
# optionally ask the LLM judge, report to Langfuse, and return the EvalScores.
# NOTE(review): this span reads like a diff rendering with indentation and +/- markers
# stripped; several runs below only make sense in the pre-refactor flow and are flagged
# individually. Confirm against the real file before relying on them.
from shared.config import settings
from shared.ws_context import set_current_user, clear_current_user
# Deep-copy the seed records so the executor works on its own copy of fixture.seed_records.
seed = copy.deepcopy(fixture.seed_records)
mock = MockExecutor(
fixture_dir=fixture.fixture_path.parent,
seed_records=seed,
)
# NOTE(review): the judge block below assigns to `scores` before any `scores` object exists
# in this function, and an equivalent judge block appears again after scoring. Presumably
# removed-diff residue; confirm.
# ── Optional LLM judge ───────────────────────────────────────
if use_llm_judge and fixture.expected:
all_expected = [r.fields for r in fixture.expected]
all_actual = [m.data for m in mock.mutations if m.action in ("insert", "update")]
judge_score, reasoning = await llm_judge_score(
all_expected, all_actual, judge_model=judge_model,
)
scores.llm_judge_score = judge_score
scores.llm_judge_reasoning = reasoning
# Override the configured model for this run; the original value is restored in the
# `finally` clause further down.
original_model = settings.LLM_MODEL
settings.LLM_MODEL = model
# A fresh random user id is generated for every eval run.
eval_user_id = str(uuid.uuid4())
# NOTE(review): `prompt_variant` is neither a parameter nor a local of this function, so the
# dataset_item_id / run_name lines below cannot run as written. Presumably removed-diff
# residue; it is not visible here how (or whether) these names are defined in the current
# code. Confirm.
# ── Report to Langfuse ───────────────────────────────────────
dataset_name = f"batch-eval-{fixture.name}"
dataset_item_id = f"{fixture.name}--{prompt_variant}"
run_name = f"{model}--{prompt_variant}--{int(time.time())}"
logger.info(
"eval: starting %s | mode=%s | model=%s",
fixture.name, fixture.mode, model,
)
start_time = time.time()
# Defaults used when step 1 is not run (step2 mode, or full mode without
# expected_classification) or when the pipeline fails before producing results.
step1_results: list[dict[str, Any]] = []
step1_reasoning = ""
try:
set_current_user(eval_user_id)
# Dispatch on fixture.mode; every branch runs inside mock.patch().
if fixture.mode == "step1":
with mock.patch():
step1_results = await _run_step1(fixture, model, mock)
elif fixture.mode == "step2":
with mock.patch():
await _run_step2(fixture, model, mock)
elif fixture.mode == "full":
with mock.patch():
# Step 1 — classification (independent from run_local_agent)
if fixture.expected_classification:
step1_results = await _run_step1(fixture, model, mock)
# Step 2 — full pipeline (run_local_agent handles both steps)
await _run_full(fixture, model, mock, eval_user_id)
except Exception as exc:
# The failure is logged and not re-raised, so scoring and reporting below still run on
# whatever step1_results / mock.mutations were captured before the error.
logger.error("eval: pipeline failed for %s/%s: %s", fixture.name, model, exc)
finally:
# Always undo the model override and the current-user context, even on failure.
settings.LLM_MODEL = original_model
clear_current_user()
elapsed = time.time() - start_time
logger.info("eval: completed in %.1fs — %d mutations", elapsed, len(mock.mutations))
# ── Score ─────────────────────────────────────────────────────
# step1 fixtures are scored on classification only: precision/recall/f1 come from
# _score_step1 and its reasoning text is stored in llm_judge_reasoning.
if fixture.mode == "step1":
s1_precision, s1_recall, s1_f1, step1_reasoning = _score_step1(fixture, step1_results)
scores = EvalScores(
fixture_name=fixture.name,
model=model,
prompt_variant=fixture.mode,
precision=s1_precision,
recall=s1_recall,
f1=s1_f1,
llm_judge_reasoning=step1_reasoning,
)
else:
# step2 or full — score mutations
field_scores, precision, recall, f1, extra, missing = _score_mutations(fixture, mock)
scores = EvalScores(
fixture_name=fixture.name,
model=model,
prompt_variant=fixture.mode,
field_scores=field_scores,
precision=precision,
recall=recall,
f1=f1,
extra_records=extra,
missing_records=missing,
)
# Add step1 classification scores for full mode
# NOTE(review): only step1_reasoning is used from this call; s1_p / s1_r / s1_f1 are
# discarded here and the same _score_step1 call is repeated in the Langfuse section below.
if fixture.mode == "full" and fixture.expected_classification:
s1_p, s1_r, s1_f1, step1_reasoning = _score_step1(fixture, step1_results)
scores.llm_judge_reasoning = f"Step1 classification:\n{step1_reasoning}"
# Optional LLM judge for extraction quality
# Compares the expected record fields against the data of insert/update mutations only.
if use_llm_judge and fixture.expected:
all_expected = [r.fields for r in fixture.expected]
all_actual = [m.data for m in mock.mutations if m.action in ("insert", "update")]
judge_score, reasoning = await llm_judge_score(
all_expected, all_actual, judge_model=judge_model,
)
scores.llm_judge_score = judge_score
# When step-1 reasoning text exists, the judge reasoning is appended to the existing
# llm_judge_reasoning; otherwise it replaces it.
if step1_reasoning:
scores.llm_judge_reasoning += f"\n\nLLM judge:\n{reasoning}"
else:
scores.llm_judge_reasoning = reasoning
# ── Report to Langfuse ────────────────────────────────────────
# Prompt names to link per mode; any mode not listed falls back to ["batch_processing"].
prompt_names = {
"step1": ["batch_file_classifier"],
"step2": ["batch_processing"],
"full": ["batch_file_classifier", "batch_processing"],
}.get(fixture.mode, ["batch_processing"])
# NOTE(review): prompt_variant= and prompt_template= each appear twice in the call below
# (a repeated keyword argument is a SyntaxError), and the first pair references names that
# are not defined in this function. Presumably removed-diff residue; confirm.
trace_id = langfuse_eval.log_eval_trace(
fixture_name=fixture.name,
model=model,
prompt_variant=prompt_variant,
prompt_template=prompt_template,
prompt_variant=fixture.mode,
prompt_template=fixture.custom_prompt_section or "(default)",
actual_mutations=[{"action": m.action, "table": m.table, "data": m.data} for m in mock.mutations],
scores_summary=scores.summary(),
dataset_name=dataset_name,
run_name=run_name,
dataset_item_id=dataset_item_id,
step1_results=step1_results or None,
langfuse_prompt_names=prompt_names,
)
# Scores are only posted when log_eval_trace returned a truthy trace id.
if trace_id:
langfuse_eval.post_eval_scores(scores, trace_id=trace_id)
# For full mode, post classification scores separately
if fixture.mode == "full" and fixture.expected_classification:
s1_p, s1_r, s1_f1, _ = _score_step1(fixture, step1_results)
for name, value in [
("classification_precision", s1_p),
("classification_recall", s1_r),
("classification_f1", s1_f1),
]:
try:
from langfuse import get_client
lf = get_client()
if lf:
lf.create_score(
name=name,
value=value,
trace_id=trace_id,
data_type="NUMERIC",
comment=f"{fixture.name} | {model} | full",
)
except Exception:
# Best-effort: any error while posting a classification score is silently ignored.
pass
return scores
async def run_fixture_eval(
    fixture: EvalFixture,
    models: list[str],
    *,
    use_llm_judge: bool = True,
    judge_model: str = "gpt-4o-mini",
) -> list[EvalScores]:
    """Run the fixture once per model and collect the scores.

    The fixture is synced to its Langfuse dataset first, then
    ``run_single_eval`` is awaited for each model in the given order.

    Args:
        fixture: Fixture to evaluate.
        models: Model names to evaluate, in order.
        use_llm_judge: Forwarded unchanged to ``run_single_eval``.
        judge_model: Forwarded unchanged to ``run_single_eval``.

    Returns:
        One ``EvalScores`` per entry in ``models``, in the same order.
    """
    langfuse_eval.sync_fixture_to_dataset(fixture)

    results: list[EvalScores] = []
    for model in models:
        # run_single_eval accepts only (fixture, model) positionally; everything
        # else is keyword-only, so no variant argument is passed.
        scores = await run_single_eval(
            fixture,
            model,
            use_llm_judge=use_llm_judge,
            judge_model=judge_model,
        )
        results.append(scores)
    return results
@@ -214,18 +438,21 @@ def print_results(results: list[EvalScores]) -> None:
print("\nNo eval results.")
return
print("\n" + "=" * 90)
print(f"{'Fixture':<25} {'Model':<25} {'Variant':<15} {'P':>6} {'R':>6} {'F1':>6} {'FA':>6} {'LLM':>6}")
print("-" * 90)
print("\n" + "=" * 95)
print(f"{'Fixture':<25} {'Mode':<6} {'Model':<25} {'P':>6} {'R':>6} {'F1':>6} {'FA':>6} {'LLM':>6}")
print("-" * 95)
for s in results:
llm_str = f"{s.llm_judge_score:.2f}" if s.llm_judge_score is not None else " --"
print(
f"{s.fixture_name:<25} {s.model:<25} {s.prompt_variant:<15} "
f"{s.fixture_name:<25} {s.prompt_variant:<6} {s.model:<25} "
f"{s.precision:>6.2f} {s.recall:>6.2f} {s.f1:>6.2f} "
f"{s.field_accuracy:>6.2f} {llm_str:>6}"
)
print("=" * 95)
print()
print("=" * 90)
# If LLM judge reasoning is available, print it

View File

@@ -242,7 +242,7 @@ async def llm_judge_score(
Returns (score, reasoning).
"""
from app.llm import get_llm
from shared.llm import get_llm
llm = get_llm(model=judge_model, temperature=0)