diff --git a/services/batch-agent/eval/cli.py b/services/batch-agent/eval/cli.py index 59a1dbf..7f97db9 100644 --- a/services/batch-agent/eval/cli.py +++ b/services/batch-agent/eval/cli.py @@ -4,14 +4,15 @@ Usage:: # From services/batch-agent/: python -m eval run # all agent fixtures, default model - python -m eval run --fixture=freelance-invoices # single fixture - python -m eval run --models=gpt-4o,anthropic/claude-sonnet-4 - python -m eval run --variants=baseline,detailed # specific prompt variants + python -m eval run --fixture=classify-invoices # single fixture + python -m eval run --models=gpt-4o,gpt-5.3-codex # multiple models + python -m eval run --mode=step1 # only step1 fixtures python -m eval run --no-judge # skip LLM judge scoring - python -m eval journey # all journey fixtures - python -m eval journey --fixture=journey-invoices # single journey fixture - python -m eval journey --models=gpt-4o,anthropic/claude-sonnet-4 + python -m eval interactive # interactive journey session + python -m eval interactive --fixture=journey-invoice-setup + python -m eval interactive --model=gpt-4o + python -m eval interactive --judge-model=github_copilot/gpt-4o-mini python -m eval list # list all fixtures python -m eval sync # sync fixtures to Langfuse datasets @@ -25,16 +26,24 @@ import logging import sys from pathlib import Path -# Ensure the service root and repo root are in sys.path +# Ensure the service root and repo root are in sys.path. +# Service root must come BEFORE repo root so its ``app/`` package +# shadows the monolith ``app/`` in the repo root. _SERVICE_ROOT = Path(__file__).resolve().parent.parent _REPO_ROOT = _SERVICE_ROOT.parent.parent -for p in (_SERVICE_ROOT, _REPO_ROOT): - if str(p) not in sys.path: - sys.path.insert(0, str(p)) +_sr = str(_SERVICE_ROOT) +_rr = str(_REPO_ROOT) +if _rr not in sys.path: + sys.path.insert(0, _rr) +# Always force service root to position 0 (python -m may have already +# added CWD further down the list, which loses to repo root). +if _sr in sys.path: + sys.path.remove(_sr) +sys.path.insert(0, _sr) from eval.config import discover_fixtures, discover_journey_fixtures from eval.runner import run_fixture_eval, print_results -from eval.journey_runner import run_journey_fixture_eval, print_journey_results +from eval.interactive import run_interactive from eval import langfuse_eval @@ -65,13 +74,14 @@ def _parse_args() -> argparse.Namespace: ) run_cmd.add_argument( "--models", "-m", - default="gpt-4o", - help="Comma-separated list of models to test (default: gpt-4o)", + default="github_copilot/gpt-5.3-codex", + help="Comma-separated list of models to test (default: github_copilot/gpt-5.3-codex)", ) run_cmd.add_argument( - "--variants", "-p", + "--mode", default=None, - help="Comma-separated prompt variants to test (default: all in fixture)", + choices=["step1", "step2", "full"], + help="Only run fixtures with this mode (default: all)", ) run_cmd.add_argument( "--no-judge", @@ -80,8 +90,8 @@ def _parse_args() -> argparse.Namespace: ) run_cmd.add_argument( "--judge-model", - default="gpt-4o-mini", - help="Model for LLM judge (default: gpt-4o-mini)", + default="gpt-4o", + help="Model for LLM judge (default: gpt-4o)", ) run_cmd.add_argument( "--fixtures-dir", @@ -95,35 +105,40 @@ def _parse_args() -> argparse.Namespace: list_cmd.add_argument("--fixtures-dir", default=None) list_cmd.add_argument("-v", "--verbose", action="store_true") - # ── journey ─────────────────────────────────────────────────── - journey_cmd = sub.add_parser("journey", help="Run journey evaluations") - journey_cmd.add_argument( - "--fixture", "-f", - help="Run only the named journey fixture (default: all)", - ) - journey_cmd.add_argument( - "--models", "-m", - default="gpt-4o", - help="Comma-separated list of models to test (default: gpt-4o)", - ) - journey_cmd.add_argument( - "--judge-model", - default="gpt-4o-mini", - help="Model for LLM judge (default: gpt-4o-mini)", - ) - journey_cmd.add_argument( - "--fixtures-dir", - default=None, - help="Path to fixtures directory (default: eval/fixtures/)", - ) - journey_cmd.add_argument("-v", "--verbose", action="store_true") - # ── sync ────────────────────────────────────────────────────── sync_cmd = sub.add_parser("sync", help="Sync fixtures to Langfuse datasets") sync_cmd.add_argument("--fixture", "-f", default=None, help="Sync only the named fixture") sync_cmd.add_argument("--fixtures-dir", default=None) sync_cmd.add_argument("-v", "--verbose", action="store_true") + # ── interactive ─────────────────────────────────────────────── + inter_cmd = sub.add_parser("interactive", help="Interactive journey session (human-in-the-loop)") + inter_cmd.add_argument( + "--fixture", "-f", + help="Journey fixture to use (default: pick interactively)", + ) + inter_cmd.add_argument( + "--model", "-m", + default="github_copilot/gpt-5.3-codex", + help="Model for the journey AI (default: github_copilot/gpt-5.3-codex)", + ) + inter_cmd.add_argument( + "--judge-model", + default="gpt-4o", + help="Model for LLM judge (default: gpt-4o)", + ) + inter_cmd.add_argument( + "--fixtures-dir", + default=None, + help="Path to fixtures directory (default: eval/fixtures/)", + ) + inter_cmd.add_argument( + "--data-dir", + default=None, + help="Override sample data directory (e.g. path to private test files not in git)", + ) + inter_cmd.add_argument("-v", "--verbose", action="store_true") + return parser.parse_args() @@ -146,14 +161,14 @@ async def _cmd_run(args: argparse.Namespace) -> None: return models = [m.strip() for m in args.models.split(",")] - variants = [v.strip() for v in args.variants.split(",")] if args.variants else None all_results = [] for fixture in fixtures: + if args.mode and fixture.mode != args.mode: + continue results = await run_fixture_eval( fixture, models=models, - variants=variants, use_llm_judge=not args.no_judge, judge_model=args.judge_model, ) @@ -172,12 +187,12 @@ def _cmd_list(args: argparse.Namespace) -> None: if fixtures: print(f"\n{'[Agent Fixtures]'}") - print(f"{'Name':<30} {'Types':<25} {'Variants':<20} {'Expected'}") + print(f"{'Name':<30} {'Mode':<6} {'Types':<25} {'Expected'}") print("-" * 90) for f in fixtures: - variants = ", ".join(f.prompt_variants.keys()) types = ", ".join(f.data_types) - print(f"{f.name:<30} {types:<25} {variants:<20} {len(f.expected)}") + n_expected = len(f.expected) + len(f.expected_classification) + print(f"{f.name:<30} {f.mode:<6} {types:<25} {n_expected}") if journey_fixtures: print(f"\n{'[Journey Fixtures]'}") @@ -217,30 +232,39 @@ def _cmd_sync(args: argparse.Namespace) -> None: print(f"Skipped: {fixture.name} (Langfuse not configured)") -async def _cmd_journey(args: argparse.Namespace) -> None: +async def _cmd_interactive(args: argparse.Namespace) -> None: journey_fixtures = discover_journey_fixtures(_fixtures_dir(args.fixtures_dir)) if not journey_fixtures: print("No journey fixtures found. Create YAML files with type: journey in eval/fixtures/.") return if args.fixture: - journey_fixtures = [f for f in journey_fixtures if f.name == args.fixture] - if not journey_fixtures: + fixtures = [f for f in journey_fixtures if f.name == args.fixture] + if not fixtures: print(f"Journey fixture '{args.fixture}' not found.") return + fixture = fixtures[0] + elif len(journey_fixtures) == 1: + fixture = journey_fixtures[0] + else: + # Let user pick + print("\nAvailable journey fixtures:") + for i, f in enumerate(journey_fixtures, 1): + print(f" {i}. {f.name} — {f.description[:60]}") + print() + try: + choice = int(input("Pick a fixture number: ").strip()) - 1 + fixture = journey_fixtures[choice] + except (ValueError, IndexError, EOFError, KeyboardInterrupt): + print("Invalid choice.") + return - models = [m.strip() for m in args.models.split(",")] - - all_results = [] - for fixture in journey_fixtures: - results = await run_journey_fixture_eval( - fixture, - models=models, - judge_model=args.judge_model, - ) - all_results.extend(results) - - print_journey_results(all_results) + await run_interactive( + fixture, + model=args.model, + judge_model=args.judge_model, + data_dir=Path(args.data_dir).resolve() if args.data_dir else None, + ) def main() -> None: @@ -249,8 +273,8 @@ def main() -> None: if args.command == "run": asyncio.run(_cmd_run(args)) - elif args.command == "journey": - asyncio.run(_cmd_journey(args)) + elif args.command == "interactive": + asyncio.run(_cmd_interactive(args)) elif args.command == "list": _cmd_list(args) elif args.command == "sync": diff --git a/services/batch-agent/eval/config.py b/services/batch-agent/eval/config.py index 0b61147..1d37405 100644 --- a/services/batch-agent/eval/config.py +++ b/services/batch-agent/eval/config.py @@ -1,70 +1,16 @@ """Eval configuration — YAML fixture loader and dataclasses. -A *fixture* is a YAML file that defines a complete test scenario: +Fixtures come in two families: -.. code-block:: yaml +1. **Agent fixtures** — test the batch agent pipeline. + Three modes controlled by ``mode``: - name: freelance-invoices - description: Extract tasks and notes from invoice PDFs (text layer) - directory: sample_files/invoices # relative to fixture dir - data_types: [tasks, notes] - file_extensions: [txt, md] + ``step1`` — classification prompt only. + ``step2`` — processing prompt only. + ``full`` — both steps in sequence. - # Preseeded records the agent "sees" as existing data - seed_records: - projects: - - id: proj-1 - name: "Website Redesign" - status: active - tasks: [] - - # Prompt variations to test (at least one required) - prompt_variants: - baseline: | - Extract action items as tasks and meeting summaries as notes. - Set priority based on urgency keywords. - detailed: | - Extract action items as tasks. Map "URGENT" to high priority, - "ASAP" to medium. Summaries become notes with full content. - - # Expected extractions — what the agent SHOULD produce - expected: - tasks: - - title: "Send revised invoice to client" - priority: high - status: todo - - title: "Update project timeline" - priority: medium - notes: - - title: "Meeting summary - March kickoff" - - # Optional: models to test (overrides CLI --models) - models: [] - -A *journey fixture* tests the prompt-template builder conversation: - -.. code-block:: yaml - - type: journey - name: journey-invoices - description: Test journey builds a good template for invoices - directory: sample_files/invoices - data_types: [tasks, notes] - - # Simulated user responses for multi-turn conversation - user_messages: - - "I want to extract action items and meeting summaries" - - "Yes, map URGENTE to high priority" - - "That looks good, generate the template" - - # Criteria the generated prompt_template should satisfy - expected_template_criteria: - - "mentions tasks and notes as target entities" - - "includes priority mapping rules" - - "references isAiSuggested=1" - - "does not mention projectId" - - models: [] +2. **Journey fixtures** — test the prompt-template builder conversation + (unchanged). """ from __future__ import annotations @@ -72,12 +18,14 @@ from __future__ import annotations import logging from dataclasses import dataclass, field from pathlib import Path -from typing import Any +from typing import Any, Literal import yaml logger = logging.getLogger(__name__) +EvalMode = Literal["step1", "step2", "full"] + @dataclass class ExpectedRecord: @@ -90,21 +38,52 @@ class ExpectedRecord: fields: dict[str, Any] # field_name → expected_value +@dataclass +class ExpectedClassification: + """Expected output of step-1 classification for one file.""" + + file: str # relative path to the sample file + project_id: str # expected matched project id, or "new" + domains: list[str] # expected domain list + new_project_name: str | None = None + + @dataclass class EvalFixture: - """A complete test scenario loaded from YAML.""" + """A complete test scenario loaded from YAML. + + ``mode`` determines which pipeline steps are exercised: + + - **step1**: only ``_classify_file`` + - **step2**: only the processing LLM + tool loop + - **full**: both steps in sequence (``run_local_agent``) + """ name: str description: str + mode: EvalMode directory: str # relative path to sample files data_types: list[str] file_extensions: list[str] - seed_records: dict[str, list[dict]] - prompt_variants: dict[str, str] # variant_name → prompt_template - expected: list[ExpectedRecord] models: list[str] # if empty, use CLI default fixture_path: Path = field(default_factory=lambda: Path(".")) + # ── Step-1 inputs (classification) ─────────────────────────── + domain_definitions: str = "" + projects_list: list[dict[str, Any]] = field(default_factory=list) + + # ── Step-2 inputs (processing) ─────────────────────────────── + existing_context: str = "" + project_context: str = "" + custom_prompt_section: str = "" + + # ── Seed records for mock executor ─────────────────────────── + seed_records: dict[str, list[dict]] = field(default_factory=dict) + + # ── Expected outputs ───────────────────────────────────────── + expected_classification: list[ExpectedClassification] = field(default_factory=list) + expected: list[ExpectedRecord] = field(default_factory=list) + @property def fixture_dir(self) -> Path: """Absolute path to the sample files directory.""" @@ -115,22 +94,44 @@ class EvalFixture: """Load a fixture from a YAML file.""" raw = yaml.safe_load(path.read_text(encoding="utf-8")) + mode: EvalMode = raw.get("mode", "full") + + # Parse expected records (step2/full) expected: list[ExpectedRecord] = [] for table, records in (raw.get("expected") or {}).items(): for rec in records: expected.append(ExpectedRecord(table=table, fields=rec)) + # Parse expected classification (step1/full) + expected_classification: list[ExpectedClassification] = [] + for item in raw.get("expected_classification") or []: + expected_classification.append(ExpectedClassification( + file=item["file"], + project_id=item["project_id"], + domains=item.get("domains", []), + new_project_name=item.get("new_project_name"), + )) + return cls( name=raw["name"], description=raw.get("description", ""), + mode=mode, directory=raw.get("directory", "sample_files"), data_types=raw.get("data_types", ["tasks"]), file_extensions=raw.get("file_extensions", []), - seed_records=raw.get("seed_records", {}), - prompt_variants=raw.get("prompt_variants", {"default": ""}), - expected=expected, models=raw.get("models", []), fixture_path=path, + # Step-1 inputs + domain_definitions=raw.get("domain_definitions", ""), + projects_list=raw.get("projects_list", []), + # Step-2 inputs + existing_context=raw.get("existing_context", ""), + project_context=raw.get("project_context", ""), + custom_prompt_section=raw.get("custom_prompt_section", ""), + # Shared + seed_records=raw.get("seed_records", {}), + expected_classification=expected_classification, + expected=expected, ) @@ -168,9 +169,9 @@ class JourneyFixture: description: str directory: str # relative path to sample files data_types: list[str] - user_messages: list[str] # simulated user responses expected_template_criteria: list[str] # what the template should contain/satisfy - models: list[str] + user_messages: list[str] = field(default_factory=list) # for automated journey runs (unused in interactive mode) + models: list[str] = field(default_factory=list) fixture_path: Path = field(default_factory=lambda: Path(".")) @property diff --git a/services/batch-agent/eval/fixtures/classify_invoices.yaml b/services/batch-agent/eval/fixtures/classify_invoices.yaml new file mode 100644 index 0000000..c91c700 --- /dev/null +++ b/services/batch-agent/eval/fixtures/classify_invoices.yaml @@ -0,0 +1,40 @@ +# Fixture: classify-invoices (step1) +# Tests _STEP1_SYSTEM_PROMPT — file classification and project matching. +# Verifies that the LLM correctly matches files to existing projects +# and identifies the right data domains. + +name: classify-invoices +mode: step1 +description: > + Test file classification on Italian freelance invoices and meeting notes. + Verifies project matching and domain identification. + +directory: sample_files/invoices +data_types: [tasks, notes, timelines] +file_extensions: [txt, md] + +# ── Step-1 prompt variables ────────────────────────────────────── +domain_definitions: | + - tasks: Action items, deliverables, things to do — anything that someone needs to complete. + - notes: Meeting summaries, decisions, reference information — permanent knowledge entries. + - timelines: Project milestones, deadlines, scheduled events — specific dates that mark a point in the progress of a project. + +projects_list: + - id: "proj-web-redesign" + name: "Redesign Sito Web Corporate" + status: "active" + aiSummary: "Corporate website redesign for Studio Architettura Bianchi" + - id: "proj-ecommerce" + name: "E-Commerce FashionStore" + status: "active" + aiSummary: "Next.js e-commerce platform for FashionStore srl" + +# ── Expected classification results ───────────────────────────── +expected_classification: + - file: "sample_files/invoices/fattura_042.txt" + project_id: "proj-web-redesign" + domains: [tasks, notes, timelines] + + - file: "sample_files/invoices/meeting_ecommerce.md" + project_id: "proj-ecommerce" + domains: [tasks, notes, timelines] diff --git a/services/batch-agent/eval/fixtures/freelance_invoices.yaml b/services/batch-agent/eval/fixtures/freelance_invoices.yaml deleted file mode 100644 index 8194519..0000000 --- a/services/batch-agent/eval/fixtures/freelance_invoices.yaml +++ /dev/null @@ -1,86 +0,0 @@ -# Fixture: freelance-invoices -# Tests extraction of tasks, notes, and timelines from -# invoices and meeting notes typical of a freelance workflow. - -name: freelance-invoices -description: > - Extract tasks, notes, and timeline events from Italian freelance - invoices and meeting notes. Tests project matching, priority - mapping, and bilingual content handling. - -directory: sample_files/invoices -data_types: [tasks, notes, timelines] -file_extensions: [txt, md] - -# Pre-existing records in the "database" -seed_records: - projects: - - id: "proj-web-redesign" - name: "Redesign Sito Web Corporate" - status: "active" - aiSummary: "Corporate website redesign for Studio Architettura Bianchi" - - id: "proj-ecommerce" - name: "E-Commerce FashionStore" - status: "active" - aiSummary: "Next.js e-commerce platform for FashionStore srl" - tasks: [] - notes: [] - timelines: [] - -# Prompt variations to compare -prompt_variants: - baseline: | - Extract action items as tasks and summaries as notes. - For timelines, extract any mentioned dates and deadlines. - Set isAiSuggested=1 on every record. - - detailed_italian: | - Estrai i dati dai file come segue: - - TASK: ogni azione da fare, deliverable, o item con scadenza. - Mappa "URGENTE" o "ALTA PRIORITÀ" → priority: high. - Mappa "media priorità" → priority: medium. - Mappa "bassa priorità" → priority: low. - Se un item è marcato come "completato" o [x], impostalo status: done. - Altrimenti status: todo. - - NOTE: riassunti di meeting, decisioni prese, note tecniche. - Il titolo deve essere descrittivo. Il content deve includere tutti i dettagli. - - TIMELINE: date di scadenza, milestone, meeting futuri. - Formato data: timestamp Unix in millisecondi. - Imposta sempre isAiSuggested=1. - - minimal: | - Extract only high-priority action items as tasks. - Ignore notes and timelines unless explicitly marked as important. - Set isAiSuggested=1. - -# Expected extractions (what the agent SHOULD produce) -# Only key fields are specified — scorer uses fuzzy matching -expected: - tasks: - - title: "Sviluppo frontend React" - priority: "high" - status: "todo" - - title: "Integrazione API backend" - priority: "medium" - status: "todo" - - title: "Testing cross-browser e fix bug responsive" - status: "todo" - - title: "Preparare wireframe homepage" - priority: "high" - status: "todo" - - title: "Setup progetto Next.js e configurare CI/CD" - priority: "medium" - status: "todo" - - title: "Ricerca plugin Stripe per gestione abbonamenti" - priority: "low" - status: "todo" - - notes: - - title: "Meeting Kickoff Progetto E-Commerce" - - timelines: - - title: "MVP E-Commerce pronto" - - title: "Meeting di revisione" - -# Models to test (can be overridden via CLI --models) -models: [] diff --git a/services/batch-agent/eval/fixtures/full_invoices.yaml b/services/batch-agent/eval/fixtures/full_invoices.yaml new file mode 100644 index 0000000..0e7017a --- /dev/null +++ b/services/batch-agent/eval/fixtures/full_invoices.yaml @@ -0,0 +1,108 @@ +# Fixture: full-invoices (full) +# Tests both _STEP1_SYSTEM_PROMPT and _PROCESSING_SYSTEM_PROMPT in sequence +# via run_local_agent(). Verifies end-to-end classification + extraction. + +name: full-invoices +mode: full +description: > + End-to-end test: classify Italian invoices/meeting notes into the + correct project, then extract tasks, notes, and timeline events. + +directory: sample_files/invoices +data_types: [tasks, notes, timelines] +file_extensions: [txt, md] + +# ── Step-1 prompt variables ────────────────────────────────────── +domain_definitions: | + - tasks: Action items, deliverables, things to do — anything that someone needs to complete. + - notes: Meeting summaries, decisions, reference information — permanent knowledge entries. + - timelines: Project milestones, deadlines, scheduled events — specific dates that mark a point in the progress of a project. + +projects_list: + - id: "proj-web-redesign" + name: "Redesign Sito Web Corporate" + status: "active" + aiSummary: "Corporate website redesign for Studio Architettura Bianchi" + - id: "proj-ecommerce" + name: "E-Commerce FashionStore" + status: "active" + aiSummary: "Next.js e-commerce platform for FashionStore srl" + +# ── Step-2 prompt variables ────────────────────────────────────── +existing_context: | + Existing tasks: + (none) + + Existing notes: + (none) + + Existing timelines: + (none) + +project_context: "" + +custom_prompt_section: | + User instructions: + Estrai i dati dai file come segue: + - TASK: ogni azione da fare, deliverable, o item con scadenza. + Mappa "URGENTE" o "ALTA PRIORITÀ" → priority: high. + Mappa "media priorità" → priority: medium. + Mappa "bassa priorità" → priority: low. + Se un item è marcato come "completato" o [x], impostalo status: done. + Altrimenti status: todo. + - NOTE: riassunti di meeting, decisioni prese, note tecniche. + - TIMELINE: date di scadenza, milestone, meeting futuri. + Imposta sempre isAiSuggested=1. + +# ── Seed records (pre-existing DB state) ───────────────────────── +seed_records: + projects: + - id: "proj-web-redesign" + name: "Redesign Sito Web Corporate" + status: "active" + aiSummary: "Corporate website redesign for Studio Architettura Bianchi" + - id: "proj-ecommerce" + name: "E-Commerce FashionStore" + status: "active" + aiSummary: "Next.js e-commerce platform for FashionStore srl" + tasks: [] + notes: [] + timelines: [] + +# ── Expected classification (step 1) ───────────────────────────── +expected_classification: + - file: "sample_files/invoices/fattura_042.txt" + project_id: "proj-web-redesign" + domains: [tasks, notes, timelines] + + - file: "sample_files/invoices/meeting_ecommerce.md" + project_id: "proj-ecommerce" + domains: [tasks, notes, timelines] + +# ── Expected extractions (step 2) ──────────────────────────────── +expected: + tasks: + - title: "Sviluppo frontend React" + priority: "high" + status: "todo" + - title: "Integrazione API backend" + priority: "medium" + status: "todo" + - title: "Testing cross-browser e fix bug responsive" + status: "todo" + - title: "Preparare wireframe homepage" + priority: "high" + status: "todo" + - title: "Setup progetto Next.js e configurare CI/CD" + priority: "medium" + status: "todo" + - title: "Ricerca plugin Stripe per gestione abbonamenti" + priority: "low" + status: "todo" + + notes: + - title: "Meeting Kickoff Progetto E-Commerce" + + timelines: + - title: "MVP E-Commerce pronto" + - title: "Meeting di revisione" diff --git a/services/batch-agent/eval/fixtures/journey_invoice_setup.yaml b/services/batch-agent/eval/fixtures/journey_invoice_setup.yaml index b53ad2d..f98a7e1 100644 --- a/services/batch-agent/eval/fixtures/journey_invoice_setup.yaml +++ b/services/batch-agent/eval/fixtures/journey_invoice_setup.yaml @@ -1,43 +1,25 @@ # Journey Fixture: journey-invoice-setup -# Tests that the journey chatbot correctly builds a prompt_template -# for extracting tasks and notes from Italian invoices and meeting notes. +# Used by `python -m eval interactive` for human-in-the-loop testing +# of the journey chatbot's prompt-building conversation. type: journey name: journey-invoice-setup description: > - Test the journey chatbot's ability to explore a directory of Italian - invoices and meeting notes, ask relevant questions, and produce a - well-structured prompt_template for data extraction. + Interactive test for the journey chatbot — explore a directory of + Italian invoices and meeting notes, answer the chatbot's questions, + and verify it produces a well-structured prompt_template for data + extraction. directory: sample_files/invoices -data_types: [tasks, notes, timelines] - -# Simulated user responses (the journey starts with the LLM exploring -# the directory and asking its first question) -user_messages: - - > - I want to extract action items from invoices and meeting notes. - The invoices are in Italian and contain work descriptions with - deadlines. Meeting notes have action items with checkboxes. - - > - Yes, map Italian priority keywords: "URGENTE" and "ALTA PRIORITÀ" - should be high priority, "media priorità" is medium, "bassa priorità" - is low. Items marked with [x] are already completed. - - > - For notes, I want meeting summaries with the full content including - decisions and attendees. For timelines, extract deadlines and - scheduled meeting dates. - - > - That's everything I need. Please generate the template. +data_types: [tasks, notes, timelines, projects] # Criteria the generated prompt_template must satisfy # Each is scored 0-1 by an LLM judge expected_template_criteria: - "Mentions creating tasks from action items and work descriptions" - - "Includes Italian priority keyword mapping (URGENTE→high, media priorità→medium, bassa priorità→low)" - - "Handles completed items marked with [x] as status done" - "Mentions creating notes from meeting summaries" - "Mentions extracting timeline events from deadlines and meeting dates" + - "Mentions creating projects from relevant information" - "Sets isAiSuggested=1 on all created records" - "Does NOT include projectId assignment logic" - "Uses camelCase field names (title, status, priority, dueDate, content)" diff --git a/services/batch-agent/eval/fixtures/process_invoices.yaml b/services/batch-agent/eval/fixtures/process_invoices.yaml new file mode 100644 index 0000000..30e2e22 --- /dev/null +++ b/services/batch-agent/eval/fixtures/process_invoices.yaml @@ -0,0 +1,81 @@ +# Fixture: process-invoices (step2) +# Tests _PROCESSING_SYSTEM_PROMPT — data extraction & tool calling. +# The classification step is skipped; prompt variables are injected directly. + +name: process-invoices +mode: step2 +description: > + Test data extraction from Italian freelance invoices. + Verifies correct record creation via tool calls with the right + fields, priorities, and status values. + +directory: sample_files/invoices +data_types: [tasks, notes, timelines] +file_extensions: [txt, md] + +# ── Step-2 prompt variables ────────────────────────────────────── +existing_context: | + Existing tasks: + (none) + + Existing notes: + (none) + + Existing timelines: + (none) + +project_context: > + Project: Redesign Sito Web Corporate (id: proj-web-redesign). + Always set projectId to this id on every record you create. + +custom_prompt_section: | + User instructions: + Estrai i dati dai file come segue: + - TASK: ogni azione da fare, deliverable, o item con scadenza. + Mappa "URGENTE" o "ALTA PRIORITÀ" → priority: high. + Mappa "media priorità" → priority: medium. + Mappa "bassa priorità" → priority: low. + Se un item è marcato come "completato" o [x], impostalo status: done. + Altrimenti status: todo. + - NOTE: riassunti di meeting, decisioni prese, note tecniche. + Il titolo deve essere descrittivo. Il content deve includere tutti i dettagli. + - TIMELINE: date di scadenza, milestone, meeting futuri. + Imposta sempre isAiSuggested=1. + +# ── Seed records (pre-existing DB state) ───────────────────────── +seed_records: + projects: + - id: "proj-web-redesign" + name: "Redesign Sito Web Corporate" + status: "active" + tasks: [] + notes: [] + timelines: [] + +# ── Expected extractions ───────────────────────────────────────── +expected: + tasks: + - title: "Sviluppo frontend React" + priority: "high" + status: "todo" + - title: "Integrazione API backend" + priority: "medium" + status: "todo" + - title: "Testing cross-browser e fix bug responsive" + status: "todo" + - title: "Preparare wireframe homepage" + priority: "high" + status: "todo" + - title: "Setup progetto Next.js e configurare CI/CD" + priority: "medium" + status: "todo" + - title: "Ricerca plugin Stripe per gestione abbonamenti" + priority: "low" + status: "todo" + + notes: + - title: "Meeting Kickoff Progetto E-Commerce" + + timelines: + - title: "MVP E-Commerce pronto" + - title: "Meeting di revisione" diff --git a/services/batch-agent/eval/interactive.py b/services/batch-agent/eval/interactive.py new file mode 100644 index 0000000..e47e640 --- /dev/null +++ b/services/batch-agent/eval/interactive.py @@ -0,0 +1,471 @@ +"""Interactive journey session — human-in-the-loop CLI conversation. + +Flow: +1. Show the system prompt used by the journey AI. +2. Start the journey (AI explores files, asks first question). +3. User types responses in the terminal — AI replies. +4. User types `/done` to end the conversation. +5. User writes a comment about the interaction quality. +6. LLM judge scores the conversation + generated template. +7. Results are reported to Langfuse. + +Usage:: + + python -m eval interactive # pick a fixture interactively + python -m eval interactive --fixture=journey-invoice-setup + python -m eval interactive --model=gpt-4o + python -m eval interactive --judge-model=github_copilot/gpt-4o-mini +""" + +from __future__ import annotations + +import asyncio +import json +import logging +import sys +import time +import uuid +from dataclasses import dataclass, field +from typing import Any + +from langchain_core.messages import HumanMessage, SystemMessage + +from eval.config import JourneyFixture, discover_journey_fixtures +from eval.mock_executor import MockExecutor +from eval import langfuse_eval + +logger = logging.getLogger(__name__) + +# ── Special commands ───────────────────────────────────────────────────── + +_CMD_DONE = "/done" +_CMD_QUIT = "/quit" +_CMD_TEMPLATE = "/template" +_CMD_HELP = "/help" + +_HELP_TEXT = f"""\ + {_CMD_DONE} — End the conversation and proceed to evaluation + {_CMD_QUIT} — Abort without evaluation + {_CMD_TEMPLATE} — Show the generated template (if any) + {_CMD_HELP} — Show this help""" + +# ── Terminal colours (ANSI) ────────────────────────────────────────────── + +_C_RESET = "\033[0m" +_C_BOLD = "\033[1m" +_C_DIM = "\033[2m" +_C_CYAN = "\033[36m" +_C_GREEN = "\033[32m" +_C_YELLOW = "\033[33m" +_C_MAGENTA = "\033[35m" +_C_RED = "\033[31m" +_C_BLUE = "\033[34m" + + +def _print_header(text: str) -> None: + print(f"\n{_C_BOLD}{_C_CYAN}{'═' * 80}") + print(f" {text}") + print(f"{'═' * 80}{_C_RESET}\n") + + +def _print_ai(text: str) -> None: + print(f"\n{_C_GREEN}{_C_BOLD}AI:{_C_RESET} {text}\n") + + +def _print_system(text: str) -> None: + print(f"{_C_DIM}{text}{_C_RESET}") + + +def _print_score(label: str, score: float) -> None: + if score >= 0.7: + color = _C_GREEN + tag = "PASS" + elif score >= 0.4: + color = _C_YELLOW + tag = "PARTIAL" + else: + color = _C_RED + tag = "FAIL" + print(f" {color}{tag:>7}{_C_RESET} ({score:.1f}) {label}") + + +# ── Result type ────────────────────────────────────────────────────────── + + +@dataclass +class InteractiveResult: + fixture_name: str + model: str + judge_model: str + prompt_template: str | None + conversation: list[dict[str, str]] + user_comment: str + done: bool + criteria_scores: dict[str, float] + overall_score: float + judge_reasoning: str + elapsed_seconds: float + + def summary(self) -> dict[str, Any]: + return { + "fixture": self.fixture_name, + "model": self.model, + "judge_model": self.judge_model, + "done": self.done, + "turns": len([c for c in self.conversation if c["role"] == "user"]), + "overall_score": round(self.overall_score, 3), + "user_comment": self.user_comment, + "criteria_scores": {k: round(v, 3) for k, v in self.criteria_scores.items()}, + "elapsed_s": round(self.elapsed_seconds, 1), + } + + +# ── LLM judge ──────────────────────────────────────────────────────────── + +_INTERACTIVE_JUDGE_SYSTEM = """\ +You are an evaluation judge for AI-generated prompt templates produced during +an interactive conversation between a human and a journey chatbot. + +The chatbot explored a directory and through multi-turn conversation with the +user produced a prompt_template — an instruction set for a data-extraction agent. + +You have access to: +- The full conversation transcript +- The generated prompt_template (if any) +- The user's own comment about the interaction +- A list of quality criteria + +Score each criterion from 0 to 1: + - 1.0: Fully satisfied + - 0.5: Partially satisfied + - 0.0: Not satisfied + +Also provide an overall_quality score (0-1) evaluating the conversation flow, +how well the AI understood the user, and the template quality. + +Respond with ONLY a JSON object: +{ + "criteria_scores": {"criterion_1": 0.8, ...}, + "overall_quality": 0.85, + "reasoning": "Brief explanation covering both conversation quality and template accuracy" +} +""" + + +async def _judge_interactive( + conversation: list[dict[str, str]], + prompt_template: str | None, + user_comment: str, + criteria: list[str], + *, + judge_model: str = "gpt-4o-mini", +) -> tuple[dict[str, float], float, str]: + """Score an interactive session. Returns (criteria_scores, overall_quality, reasoning).""" + from shared.llm import get_llm + + llm = get_llm(model=judge_model, temperature=0) + + conv_text = "\n".join( + f"{'USER' if t['role'] == 'user' else 'AI'}: {t['content']}" + for t in conversation + ) + criteria_text = "\n".join(f" {i+1}. {c}" for i, c in enumerate(criteria)) + + user_content = ( + f"## Conversation transcript\n```\n{conv_text}\n```\n\n" + f"## Generated prompt_template\n```\n{prompt_template or '(none — conversation did not complete)'}\n```\n\n" + f"## User's comment\n{user_comment}\n\n" + f"## Criteria to evaluate\n{criteria_text}" + ) + + try: + response = await llm.ainvoke([ + SystemMessage(content=_INTERACTIVE_JUDGE_SYSTEM), + HumanMessage(content=user_content), + ]) + raw = response.content.strip() + if raw.startswith("```"): + raw = raw.split("```")[1] + if raw.startswith("json"): + raw = raw[4:] + parsed = json.loads(raw.strip()) + + scores_raw = parsed.get("criteria_scores", parsed.get("scores", {})) + criteria_scores: dict[str, float] = {} + for i, criterion in enumerate(criteria): + key_candidates = [f"criterion_{i+1}", criterion, criterion[:50], str(i + 1)] + score = 0.0 + for key in key_candidates: + if key in scores_raw: + score = float(scores_raw[key]) + break + if score == 0.0 and i < len(scores_raw): + score = float(list(scores_raw.values())[i]) + criteria_scores[criterion] = score + + overall = float(parsed.get("overall_quality", 0.0)) + reasoning = str(parsed.get("reasoning", "")) + return criteria_scores, overall, reasoning + + except Exception as exc: + logger.warning("interactive judge failed: %s", exc) + return {c: 0.0 for c in criteria}, 0.0, f"Judge error: {exc}" + + +# ── Interactive session ────────────────────────────────────────────────── + + +async def run_interactive( + fixture: JourneyFixture, + *, + model: str = "gpt-4o", + judge_model: str = "gpt-4o-mini", + data_dir: Path | None = None, +) -> InteractiveResult: + """Run an interactive journey session in the terminal. + + Parameters + ---------- + data_dir : + If set, overrides the fixture's sample-file directory. The LLM + will explore this folder instead of the default + ``fixtures/sample_files/…``. Useful for private test data that + shouldn't be committed to git. + """ + from shared.config import settings + from shared.ws_context import set_current_user, clear_current_user + from app.journey import ( + handle_journey_start, + handle_journey_message, + _build_system_prompt, + ) + + # When --data-dir is given, the MockExecutor's root becomes + # data_dir's parent and the journey directory is data_dir's name. + # This way the LLM sees a meaningful directory name (not ".") and + # MockExecutor resolves paths correctly. + # Otherwise, use the fixture's YAML parent and its relative path. + if data_dir: + mock_root = data_dir.parent + journey_directory = data_dir.name + else: + mock_root = fixture.fixture_path.parent + journey_directory = fixture.directory + + mock = MockExecutor( + fixture_dir=mock_root, + seed_records={}, + ) + + original_model = settings.LLM_MODEL + settings.LLM_MODEL = model + eval_user_id = f"interactive-{uuid.uuid4().hex[:8]}" + + # ── Show system prompt ─────────────────────────────────────── + system_prompt = _build_system_prompt(journey_directory, fixture.data_types) + + _print_header("SYSTEM PROMPT") + print(f"{_C_DIM}{system_prompt}{_C_RESET}") + + _print_header(f"INTERACTIVE JOURNEY | fixture: {fixture.name} | model: {model}") + print(f" Data dir: {mock_root}") + print(f" Type your responses. Commands: {_CMD_DONE}, {_CMD_QUIT}, {_CMD_TEMPLATE}, {_CMD_HELP}") + print(f" Judge model: {judge_model}") + print(f" Criteria: {len(fixture.expected_template_criteria)}") + print() + + conversation: list[dict[str, str]] = [] + prompt_template: str | None = None + done = False + start_time = time.time() + + try: + set_current_user(eval_user_id) + + with mock.patch(): + # ── Start ──────────────────────────────────────────── + _print_system("Starting journey... (AI is exploring your files)") + + start_frame: dict[str, Any] = { + "agent_type": "local", + "directory": journey_directory, + "data_types": fixture.data_types, + "session_id": f"interactive-{uuid.uuid4().hex[:8]}", + } + + reply = await handle_journey_start(eval_user_id, start_frame) + session_id = reply["session_id"] + conversation.append({"role": "assistant", "content": reply["message"]}) + _print_ai(reply["message"]) + + if reply["done"]: + prompt_template = reply.get("prompt_template") + done = True + _print_system("Journey completed on first reply (template generated).") + + # ── Conversation loop ──────────────────────────────── + while not done: + try: + user_input = input(f"{_C_BOLD}{_C_BLUE}YOU:{_C_RESET} ").strip() + except (EOFError, KeyboardInterrupt): + print() + user_input = _CMD_QUIT + + if not user_input: + continue + + # Handle commands + if user_input.lower() == _CMD_QUIT: + _print_system("Aborted — no evaluation will be performed.") + settings.LLM_MODEL = original_model + clear_current_user() + return InteractiveResult( + fixture_name=fixture.name, model=model, judge_model=judge_model, + prompt_template=None, conversation=conversation, + user_comment="(aborted)", done=False, + criteria_scores={}, overall_score=0.0, + judge_reasoning="Session aborted by user.", + elapsed_seconds=time.time() - start_time, + ) + + if user_input.lower() == _CMD_HELP: + print(_HELP_TEXT) + continue + + if user_input.lower() == _CMD_TEMPLATE: + if prompt_template: + print(f"\n{_C_MAGENTA}{prompt_template}{_C_RESET}\n") + else: + _print_system("No template generated yet.") + continue + + if user_input.lower() == _CMD_DONE: + _print_system("Ending conversation...") + break + + # ── Send message to AI ─────────────────────────── + conversation.append({"role": "user", "content": user_input}) + _print_system("AI is thinking...") + + msg_frame: dict[str, Any] = { + "session_id": session_id, + "message": user_input, + } + reply = await handle_journey_message(eval_user_id, msg_frame) + conversation.append({"role": "assistant", "content": reply["message"]}) + _print_ai(reply["message"]) + + if reply["done"]: + prompt_template = reply.get("prompt_template") + done = True + _print_system("Journey completed — template generated!") + + except Exception as exc: + logger.error("interactive journey failed: %s", exc) + _print_system(f"Error: {exc}") + finally: + settings.LLM_MODEL = original_model + clear_current_user() + + elapsed = time.time() - start_time + turns = len([c for c in conversation if c["role"] == "user"]) + + # ── Show template if generated ─────────────────────────────── + if prompt_template: + _print_header("GENERATED TEMPLATE") + print(f"{_C_MAGENTA}{prompt_template}{_C_RESET}\n") + else: + _print_system("No template was generated during this session.") + + # ── User comment ───────────────────────────────────────────── + _print_header("YOUR EVALUATION") + print(" Write your comment about this interaction (press Enter twice to finish):") + print() + comment_lines: list[str] = [] + try: + while True: + line = input() + if line == "" and comment_lines and comment_lines[-1] == "": + comment_lines.pop() # remove trailing empty + break + comment_lines.append(line) + except (EOFError, KeyboardInterrupt): + pass + user_comment = "\n".join(comment_lines).strip() or "(no comment)" + + # ── Judge ──────────────────────────────────────────────────── + _print_header("LLM JUDGE EVALUATION") + _print_system(f"Scoring with {judge_model}...") + + criteria_scores, overall_quality, judge_reasoning = await _judge_interactive( + conversation=conversation, + prompt_template=prompt_template, + user_comment=user_comment, + criteria=fixture.expected_template_criteria, + judge_model=judge_model, + ) + + # ── Display scores ─────────────────────────────────────────── + print() + for criterion, score in criteria_scores.items(): + _print_score(criterion, score) + + overall = ( + sum(criteria_scores.values()) / len(criteria_scores) + if criteria_scores + else 0.0 + ) + + print(f"\n {_C_BOLD}Criteria avg: {overall:.2f}{_C_RESET}") + print(f" {_C_BOLD}Overall quality: {overall_quality:.2f}{_C_RESET}") + print(f" {_C_BOLD}Turns: {turns}{_C_RESET}") + print(f" {_C_BOLD}Time: {elapsed:.1f}s{_C_RESET}") + print(f"\n {_C_DIM}Judge: {judge_reasoning}{_C_RESET}") + print(f" {_C_DIM}Your comment: {user_comment}{_C_RESET}\n") + + result = InteractiveResult( + fixture_name=fixture.name, + model=model, + judge_model=judge_model, + prompt_template=prompt_template, + conversation=conversation, + user_comment=user_comment, + done=done, + criteria_scores=criteria_scores, + overall_score=overall_quality, + judge_reasoning=judge_reasoning, + elapsed_seconds=elapsed, + ) + + # ── Report to Langfuse ─────────────────────────────────────── + trace_id = langfuse_eval.log_eval_trace( + fixture_name=fixture.name, + model=model, + prompt_variant="interactive", + prompt_template=prompt_template or "(not generated)", + actual_mutations=[{ + "conversation": conversation[:30], + "user_comment": user_comment, + }], + scores_summary=result.summary(), + langfuse_prompt_names=["journey_system"], + ) + + if trace_id: + from eval.scorer import EvalScores + scores_obj = EvalScores( + fixture_name=fixture.name, + model=model, + prompt_variant="interactive", + precision=overall, + recall=float(done), + f1=overall, + llm_judge_score=overall_quality, + llm_judge_reasoning=judge_reasoning, + ) + langfuse_eval.post_eval_scores(scores_obj, trace_id=trace_id) + _print_system(f"Results reported to Langfuse (trace: {trace_id})") + else: + _print_system("Langfuse not configured — results not reported.") + + return result diff --git a/services/batch-agent/eval/journey_runner.py b/services/batch-agent/eval/journey_runner.py index f49b57a..4e0965a 100644 --- a/services/batch-agent/eval/journey_runner.py +++ b/services/batch-agent/eval/journey_runner.py @@ -94,7 +94,7 @@ async def _judge_template( Returns (criteria_scores, reasoning). """ - from app.llm import get_llm + from shared.llm import get_llm llm = get_llm(model=judge_model, temperature=0) @@ -152,13 +152,23 @@ async def run_single_journey_eval( model: str, *, judge_model: str = "gpt-4o-mini", + data_dir: Path | None = None, ) -> JourneyEvalResult: - """Execute one journey eval: start → messages → score template.""" + """Execute one journey eval: start \u2192 messages \u2192 score template.""" from shared.config import settings - # Build mock executor for filesystem tools + # When data_dir is given, use its parent as MockExecutor root + # and its name as the journey directory so the LLM sees a + # meaningful path (not "."). + if data_dir: + mock_root = data_dir.parent + journey_directory = data_dir.name + else: + mock_root = fixture.fixture_path.parent + journey_directory = fixture.directory + mock = MockExecutor( - fixture_dir=fixture.fixture_dir, + fixture_dir=mock_root, seed_records={}, ) @@ -178,7 +188,7 @@ async def run_single_journey_eval( done = False try: - from app.ws_context import set_current_user, clear_current_user + from shared.ws_context import set_current_user, clear_current_user from app.journey import handle_journey_start, handle_journey_message, _sessions set_current_user(eval_user_id) @@ -186,7 +196,7 @@ async def run_single_journey_eval( # ── Start the journey ──────────────────────────────── start_frame: dict[str, Any] = { "agent_type": "local", - "directory": fixture.directory, + "directory": journey_directory, "data_types": fixture.data_types, "session_id": f"eval-{uuid.uuid4().hex[:8]}", } @@ -246,7 +256,7 @@ async def run_single_journey_eval( logger.error("journey_eval: pipeline failed for %s/%s: %s", fixture.name, model, exc) finally: settings.LLM_MODEL = original_model - from app.ws_context import clear_current_user + from shared.ws_context import clear_current_user clear_current_user() elapsed = time.time() - start_time @@ -297,6 +307,7 @@ async def run_single_journey_eval( prompt_template=prompt_template or "(not generated)", actual_mutations=[{"conversation": conversation[:20]}], scores_summary=result.summary(), + langfuse_prompt_names=["journey_system"], ) if trace_id: @@ -321,6 +332,7 @@ async def run_journey_fixture_eval( models: list[str], *, judge_model: str = "gpt-4o-mini", + data_dir: Path | None = None, ) -> list[JourneyEvalResult]: """Run all models for a journey fixture.""" langfuse_eval.sync_journey_fixture_to_dataset(fixture) @@ -329,6 +341,7 @@ async def run_journey_fixture_eval( for model in models: result = await run_single_journey_eval( fixture, model, judge_model=judge_model, + data_dir=data_dir, ) results.append(result) diff --git a/services/batch-agent/eval/langfuse_eval.py b/services/batch-agent/eval/langfuse_eval.py index 8ce2cbd..7c7bad4 100644 --- a/services/batch-agent/eval/langfuse_eval.py +++ b/services/batch-agent/eval/langfuse_eval.py @@ -1,21 +1,21 @@ """Langfuse evaluation integration — datasets, runs, and scoring. -Uses the Langfuse Python SDK to: +Uses the Langfuse Python SDK v4 (OpenTelemetry-based) to: 1. **Sync fixtures → Langfuse datasets**: Each YAML fixture becomes a dataset, each prompt variant + expected pair becomes a dataset item. 2. **Track eval runs**: Each (fixture × model × prompt_variant) execution - is recorded as a dataset run with linked traces and scores. + is recorded as a trace with linked scores. 3. **Post scores**: precision, recall, F1, field_accuracy, llm_judge are - posted as numeric scores on the trace/run. + posted as numeric scores on the trace. """ from __future__ import annotations -import json import logging +import os from typing import Any from shared.config import settings @@ -26,16 +26,16 @@ logger = logging.getLogger(__name__) def _get_langfuse(): - """Get or create a Langfuse client instance.""" + """Get or create a Langfuse client instance (SDK v4).""" if not settings.LANGFUSE_SECRET_KEY or not settings.LANGFUSE_PUBLIC_KEY: return None try: - from langfuse import Langfuse - return Langfuse( - secret_key=settings.LANGFUSE_SECRET_KEY, - public_key=settings.LANGFUSE_PUBLIC_KEY, - host=settings.LANGFUSE_HOST, - ) + os.environ.setdefault("LANGFUSE_SECRET_KEY", settings.LANGFUSE_SECRET_KEY) + os.environ.setdefault("LANGFUSE_PUBLIC_KEY", settings.LANGFUSE_PUBLIC_KEY) + if settings.LANGFUSE_HOST: + os.environ.setdefault("LANGFUSE_HOST", settings.LANGFUSE_HOST) + from langfuse import get_client + return get_client() except Exception as exc: logger.warning("langfuse_eval: failed to create client: %s", exc) return None @@ -61,35 +61,44 @@ def sync_fixture_to_dataset(fixture: EvalFixture) -> str | None: lf.create_dataset( name=dataset_name, description=fixture.description, - metadata={"data_types": fixture.data_types, "file_extensions": fixture.file_extensions}, + metadata={ + "data_types": ",".join(fixture.data_types), + "file_extensions": ",".join(fixture.file_extensions) if fixture.file_extensions else "", + }, ) except Exception: # Dataset may already exist — that's fine pass - expected_output = {} - for rec in fixture.expected: - expected_output.setdefault(rec.table, []).append(rec.fields) + # Build expected_output appropriate to the fixture's mode + expected_output: dict[str, Any] = {} + if fixture.mode in ("step1", "full") and fixture.expected_classification: + expected_output["classifications"] = [ + {"file": ec.file, "project_id": ec.project_id, "domains": ec.domains} + for ec in fixture.expected_classification + ] + if fixture.mode in ("step2", "full") and fixture.expected: + for rec in fixture.expected: + expected_output.setdefault(rec.table, []).append(rec.fields) - for variant_name, prompt_template in fixture.prompt_variants.items(): - item_id = f"{fixture.name}--{variant_name}" - try: - lf.create_dataset_item( - dataset_name=dataset_name, - id=item_id, - input={ - "directory": fixture.directory, - "data_types": fixture.data_types, - "prompt_template": prompt_template, - "seed_records": fixture.seed_records, - }, - expected_output=expected_output, - metadata={"prompt_variant": variant_name}, - ) - except Exception as exc: - logger.warning( - "langfuse_eval: failed to upsert dataset item %s: %s", item_id, exc - ) + item_id = f"{fixture.name}--{fixture.mode}" + try: + lf.create_dataset_item( + dataset_name=dataset_name, + id=item_id, + input={ + "directory": fixture.directory, + "data_types": fixture.data_types, + "mode": fixture.mode, + "seed_records": fixture.seed_records, + }, + expected_output=expected_output, + metadata={"mode": fixture.mode}, + ) + except Exception as exc: + logger.warning( + "langfuse_eval: failed to upsert dataset item %s: %s", item_id, exc + ) lf.flush() logger.info("langfuse_eval: synced fixture '%s' → dataset '%s'", fixture.name, dataset_name) @@ -114,7 +123,7 @@ def sync_journey_fixture_to_dataset(fixture) -> str | None: lf.create_dataset( name=dataset_name, description=fixture.description, - metadata={"type": "journey", "data_types": fixture.data_types}, + metadata={"type": "journey", "data_types": ",".join(fixture.data_types)}, ) except Exception: pass # Dataset may already exist @@ -148,18 +157,26 @@ def create_eval_run( *, metadata: dict[str, Any] | None = None, ) -> str: - """Create a dataset run in Langfuse. Returns the run name.""" + """Create a dataset run in Langfuse. Returns the run name. + + Note: In SDK v4, dataset runs are created implicitly via + dataset.run_experiment(). This function is kept for backwards + compatibility but may not create a run. + """ lf = _get_langfuse() if lf is None: return run_name try: - lf.create_dataset_run( - dataset_name=dataset_name, - run_name=run_name, - metadata=metadata or {}, - ) - lf.flush() + if hasattr(lf, "create_dataset_run"): + lf.create_dataset_run( + dataset_name=dataset_name, + run_name=run_name, + metadata=metadata or {}, + ) + lf.flush() + else: + logger.debug("langfuse_eval: create_dataset_run not available in SDK v4") except Exception as exc: logger.warning("langfuse_eval: failed to create run %s: %s", run_name, exc) @@ -185,21 +202,22 @@ def post_eval_scores( ("precision", scores.precision), ("recall", scores.recall), ("f1", scores.f1), - ("field_accuracy", scores.field_accuracy), ] + # Only post field_accuracy when there are field-level scores (step2/full) + if scores.field_scores: + score_data.append(("field_accuracy", scores.field_accuracy)) if scores.llm_judge_score is not None: score_data.append(("llm_judge", scores.llm_judge_score)) for name, value in score_data: try: - kwargs: dict[str, Any] = { - "name": name, - "value": value, - "comment": f"{scores.fixture_name} | {scores.model} | {scores.prompt_variant}", - } - if trace_id: - kwargs["trace_id"] = trace_id - lf.score(**kwargs) + lf.create_score( + name=name, + value=value, + trace_id=trace_id, + data_type="NUMERIC", + comment=f"{scores.fixture_name} | {scores.model} | {scores.prompt_variant}", + ) except Exception as exc: logger.warning("langfuse_eval: failed to post score %s: %s", name, exc) @@ -218,12 +236,20 @@ def log_eval_trace( prompt_template: str, actual_mutations: list[dict], scores_summary: dict[str, Any], + step1_results: list[dict] | None = None, dataset_name: str | None = None, run_name: str | None = None, dataset_item_id: str | None = None, + langfuse_prompt_names: list[str] | None = None, ) -> str | None: """Create a Langfuse trace for one eval execution and link it to a dataset run. + Uses SDK v4 observation API (traces are created implicitly by root spans). + ``langfuse_prompt_names`` can contain one or two prompt names to link + (e.g. ``["batch_file_classifier", "batch_processing"]`` for full mode). + Each prompt gets its own generation-type observation for per-version + metrics tracking. + Returns the trace_id, or None if Langfuse is unavailable. """ lf = _get_langfuse() @@ -231,38 +257,71 @@ def log_eval_trace( return None try: - trace = lf.trace( - name=f"eval-{fixture_name}", - input={ - "prompt_template": prompt_template, - "model": model, - "prompt_variant": prompt_variant, - }, - output={ - "mutations": actual_mutations[:50], - "scores": scores_summary, - }, + from langfuse import propagate_attributes + + # Fetch prompt objects for linking + prompt_objs: list[tuple[str, Any]] = [] + for pname in (langfuse_prompt_names or []): + try: + obj = lf.get_prompt(name=pname, cache_ttl_seconds=300) + prompt_objs.append((pname, obj)) + logger.info("langfuse_eval: linked prompt '%s' (type=%s)", pname, type(obj).__name__) + except Exception as exc: + logger.warning("langfuse_eval: prompt '%s' not found — %s", pname, exc) + + # Build trace output dict + trace_output: dict[str, Any] = {"scores": scores_summary} + if step1_results: + trace_output["classifications"] = step1_results + if actual_mutations: + trace_output["mutations"] = actual_mutations[:50] + + with propagate_attributes( + trace_name=f"eval-{fixture_name}", metadata={ - "eval": True, + "eval": "true", "fixture": fixture_name, "model": model, "prompt_variant": prompt_variant, }, tags=["eval", f"model:{model}", f"variant:{prompt_variant}"], - ) + ): + # Root span for the eval run + span = lf.start_observation(name=f"eval-{fixture_name}") + span.update( + input={ + "prompt_template": prompt_template, + "model": model, + "prompt_variant": prompt_variant, + }, + output=trace_output, + ) + trace_id = span.trace_id - # Link to dataset run if available - if dataset_name and run_name and dataset_item_id: - try: - dataset = lf.get_dataset(dataset_name) - item = dataset.get_item(dataset_item_id) - if item: - item.link(trace, run_name) - except Exception as exc: - logger.warning("langfuse_eval: failed to link trace to dataset run: %s", exc) + # Create a generation-type observation per linked prompt + for pname, pobj in prompt_objs: + gen = lf.start_observation( + name=f"prompt-{pname}", + prompt=pobj, + as_type="generation", + ) + gen.end() + + # Link to dataset run if available + if dataset_name and run_name and dataset_item_id: + try: + dataset = lf.get_dataset(dataset_name) + for item in dataset.items: + if item.id == dataset_item_id: + item.link(span, run_name) + break + except Exception as exc: + logger.warning("langfuse_eval: failed to link trace to dataset run: %s", exc) + + span.end() lf.flush() - return trace.id + return trace_id except Exception as exc: logger.warning("langfuse_eval: failed to create eval trace: %s", exc) return None diff --git a/services/batch-agent/eval/mock_executor.py b/services/batch-agent/eval/mock_executor.py index 93d83cb..4c81e56 100644 --- a/services/batch-agent/eval/mock_executor.py +++ b/services/batch-agent/eval/mock_executor.py @@ -1,6 +1,6 @@ """Mock executor — intercepts execute_on_client for offline E2E testing. -Patches ``app.ws_context.execute_on_client`` so agent pipeline runs don't +Patches ``execute_on_client`` at all usage sites so agent pipeline runs don't require a live Electron client or Redis. Instead: - **Filesystem actions** (list_directory, read_file_content, get_file_metadata) @@ -20,6 +20,7 @@ import uuid from dataclasses import dataclass, field from pathlib import Path from typing import Any +from contextlib import contextmanager, asynccontextmanager from unittest.mock import AsyncMock, patch @@ -33,6 +34,30 @@ class Mutation: timestamp: float = field(default_factory=time.time) +# ── Fake DB helpers (used to bypass async_session in full mode) ─────── + +class _FakeRow: + """Mimics an AgentRunLog row returned by SQLAlchemy.""" + id = 0 + status = "running" + items_processed = 0 + items_created = 0 + errors: list[str] = [] + completed_at = None + + def __setattr__(self, name: str, value: Any) -> None: + object.__setattr__(self, name, value) + + +class _FakeResult: + """Mimics a SQLAlchemy ``Result`` with ``scalar_one_or_none``.""" + def __init__(self, row: _FakeRow) -> None: + self._row = row + + def scalar_one_or_none(self) -> _FakeRow: + return self._row + + @dataclass class MockExecutor: """In-memory executor that replaces Redis-based tool round-trip. @@ -77,12 +102,37 @@ class MockExecutor: # ── Context manager for patching ────────────────────────────── + @contextmanager def patch(self): - """Return an async context-manager that patches execute_on_client.""" - return patch( - "app.ws_context.execute_on_client", - new=AsyncMock(side_effect=self._handle), - ) + """Patch execute_on_client and DB session at all usage sites.""" + mock_fn = AsyncMock(side_effect=self._handle) + targets = [ + "shared.ws_context.execute_on_client", + "app.agent_runner.execute_on_client", + "app.agents.filesystem_agent.execute_on_client", + ] + + # Mock async_session so run_local_agent / _finalize_run skip real DB + fake_row = _FakeRow() + fake_db = AsyncMock() + fake_db.commit = AsyncMock() + fake_db.refresh = AsyncMock() + fake_db.execute = AsyncMock(return_value=_FakeResult(fake_row)) + fake_db.add = lambda obj: None # noqa: ARG005 + + @asynccontextmanager + async def _fake_session(): + yield fake_db + + patches = [patch(t, new=mock_fn) for t in targets] + patches.append(patch("app.agent_runner.async_session", _fake_session)) + for p in patches: + p.start() + try: + yield mock_fn + finally: + for p in patches: + p.stop() # ── Internal dispatch ───────────────────────────────────────── diff --git a/services/batch-agent/eval/runner.py b/services/batch-agent/eval/runner.py index 920d35f..57d0609 100644 --- a/services/batch-agent/eval/runner.py +++ b/services/batch-agent/eval/runner.py @@ -1,28 +1,31 @@ """Eval runner — orchestrates fixture → mock → agent pipeline → scoring. -For each (fixture × model × prompt_variant) combination: -1. Build a MockExecutor with fixture data -2. Patch execute_on_client -3. Override LLM_MODEL in shared settings -4. Run the batch agent pipeline (run_local_agent) -5. Collect mutations from the mock -6. Score against expected results (field match + optional LLM judge) -7. Report scores to Langfuse -8. Print results +Supports three eval modes: + +- **step1**: Test classification prompt only (``_STEP1_SYSTEM_PROMPT``). + Calls the LLM with fixture-provided ``domain_definitions`` and + ``projects_list`` and compares output against ``expected_classification``. + +- **step2**: Test processing prompt only (``_PROCESSING_SYSTEM_PROMPT``). + Compiles the prompt with fixture-provided ``existing_context``, + ``project_context``, ``data_types``, and ``custom_prompt_section``, + then runs the tool-calling loop. Mutations are scored against + ``expected`` records. + +- **full**: Run ``run_local_agent()`` end-to-end (both steps). + Scored on both classification and extraction. """ from __future__ import annotations -import asyncio import copy import json import logging import time import uuid -from pathlib import Path from typing import Any -from eval.config import EvalFixture, ExpectedRecord +from eval.config import EvalFixture, ExpectedClassification from eval.mock_executor import MockExecutor from eval.scorer import ( EvalScores, @@ -36,72 +39,193 @@ from eval import langfuse_eval logger = logging.getLogger(__name__) -async def run_single_eval( +# ── Step 1 runner ───────────────────────────────────────────────────────── + + +async def _run_step1( fixture: EvalFixture, model: str, - prompt_variant: str, - *, - use_llm_judge: bool = True, - judge_model: str = "gpt-4o-mini", -) -> EvalScores: - """Execute one (fixture × model × prompt_variant) eval and return scores.""" - from shared.config import settings + mock: MockExecutor, +) -> list[dict[str, Any]]: + """Run step-1 classification for each expected file. - prompt_template = fixture.prompt_variants.get(prompt_variant, "") + Returns a list of result dicts: + ``[{file, project_id, domains, new_project_name}, ...]`` + """ + from app.agent_runner import _classify_file - # Build mock executor - seed = copy.deepcopy(fixture.seed_records) - mock = MockExecutor( - fixture_dir=fixture.fixture_dir, - seed_records=seed, + results: list[dict[str, Any]] = [] + for ec in fixture.expected_classification: + # Read the file content through the mock + file_result = await mock._handle( + action="read_file_content", + data={"path": ec.file}, + ) + file_content: str = file_result.get("content", "") + + project_id, domains, new_name = await _classify_file( + file_path=ec.file, + file_content=file_content, + projects=fixture.projects_list, + config_data_types=fixture.data_types, + ) + results.append({ + "file": ec.file, + "project_id": project_id, + "domains": domains, + "new_project_name": new_name, + }) + return results + + +def _score_step1( + fixture: EvalFixture, + results: list[dict[str, Any]], +) -> tuple[float, float, float, str]: + """Score step-1 results. Returns (precision, recall, f1, reasoning).""" + if not fixture.expected_classification: + return 0.0, 0.0, 0.0, "No expected classifications" + + total = len(fixture.expected_classification) + matched = 0 + details: list[str] = [] + + for ec in fixture.expected_classification: + actual = next((r for r in results if r["file"] == ec.file), None) + if actual is None: + details.append(f" MISS {ec.file}: not processed") + continue + + pid_ok = actual["project_id"] == ec.project_id + domains_ok = set(actual["domains"]) == set(ec.domains) if ec.domains else True + + if pid_ok and domains_ok: + matched += 1 + details.append(f" OK {ec.file}: project={actual['project_id']}, domains={actual['domains']}") + else: + parts: list[str] = [] + if not pid_ok: + parts.append(f"project expected={ec.project_id} got={actual['project_id']}") + if not domains_ok: + parts.append(f"domains expected={ec.domains} got={actual['domains']}") + details.append(f" FAIL {ec.file}: {'; '.join(parts)}") + + precision = matched / total if total > 0 else 0.0 + recall = precision # in step1, precision == recall (same denominator) + f1 = precision # same + reasoning = "\n".join(details) + return precision, recall, f1, reasoning + + +# ── Step 2 runner ───────────────────────────────────────────────────────── + + +async def _run_step2( + fixture: EvalFixture, + model: str, + mock: MockExecutor, +) -> None: + """Run step-2 processing for each file in the fixture directory. + + Compiles ``_PROCESSING_SYSTEM_PROMPT`` with fixture-provided variables + and runs the tool-calling loop. Mutations are captured by the mock. + """ + from app.agent_runner import ( + _PROCESSING_SYSTEM_PROMPT, + _build_processing_tools, + _run_agent_with_tools, + _MAX_PROCESSING_STEPS, + ) + from app import tracing + + # Compile the processing prompt with fixture variables + system_prompt = tracing.compile_prompt( + "batch_processing", + fallback=_PROCESSING_SYSTEM_PROMPT, + variables={ + "existing_context": fixture.existing_context, + "project_context": fixture.project_context, + "data_types": ", ".join(fixture.data_types), + "custom_prompt_section": fixture.custom_prompt_section, + }, ) - # Override the LLM model for this run - original_model = settings.LLM_MODEL - settings.LLM_MODEL = model + tools = _build_processing_tools(fixture.data_types) + + # Scan files in the fixture directory + file_entries = await mock._handle( + action="list_directory", + data={"path": fixture.directory}, + ) + for entry in file_entries.get("entries", []): + if entry.get("type") != "file": + continue + # Filter by extension if specified + if fixture.file_extensions: + ext = entry["name"].rsplit(".", 1)[-1] if "." in entry["name"] else "" + if ext not in fixture.file_extensions: + continue + + file_result = await mock._handle( + action="read_file_content", + data={"path": entry["path"]}, + ) + file_content: str = file_result.get("content", "") + if not file_content.strip(): + continue + + await _run_agent_with_tools( + system_prompt=system_prompt, + user_message=( + f"Process this file and extract relevant information.\n\n" + f"File: {entry['path']}\n\nContent:\n{file_content}" + ), + tools=tools, + max_steps=_MAX_PROCESSING_STEPS, + ) + + +# ── Full runner ─────────────────────────────────────────────────────────── + + +async def _run_full( + fixture: EvalFixture, + model: str, + mock: MockExecutor, + user_id: str, +) -> None: + """Run the full two-step pipeline via ``run_local_agent``.""" + from app.agent_runner import run_local_agent - # Build trigger data (same shape as what redis_consumer delivers) trigger_data: dict[str, Any] = { "type": "agent_trigger", "directory": fixture.directory, "directory_paths": [fixture.directory], "data_types": fixture.data_types, "file_extensions": fixture.file_extensions, - "prompt_template": prompt_template, + "prompt_template": fixture.custom_prompt_section, "device_id": "eval-harness", "run_context": { - "agent_id": f"eval-{fixture.name}-{prompt_variant}", - "run_id": None, # skip DB logging during eval + "agent_id": f"eval-{fixture.name}", + "run_id": None, }, } - eval_user_id = f"eval-{uuid.uuid4().hex[:8]}" + with mock.patch(): + await run_local_agent(user_id, trigger_data) - logger.info( - "eval: starting %s | model=%s | variant=%s", - fixture.name, model, prompt_variant, - ) - start_time = time.time() - try: - # Patch execute_on_client + set user context, then run the pipeline - from app.ws_context import set_current_user, clear_current_user - from app.agent_runner import run_local_agent +# ── Scoring helpers ─────────────────────────────────────────────────────── - set_current_user(eval_user_id) - with mock.patch(): - await run_local_agent(eval_user_id, trigger_data) - except Exception as exc: - logger.error("eval: pipeline failed for %s/%s/%s: %s", fixture.name, model, prompt_variant, exc) - finally: - settings.LLM_MODEL = original_model - from app.ws_context import clear_current_user - clear_current_user() - elapsed = time.time() - start_time - logger.info("eval: pipeline completed in %.1fs — %d mutations", elapsed, len(mock.mutations)) +def _score_mutations( + fixture: EvalFixture, + mock: MockExecutor, +) -> tuple[list[FieldScore], float, float, float, int, int]: + """Score mutations against expected records. - # ── Score results ──────────────────────────────────────────── + Returns (field_scores, precision, recall, f1, extra, missing). + """ all_field_scores: list[FieldScore] = [] total_expected = 0 total_actual = 0 @@ -109,12 +233,10 @@ async def run_single_eval( total_extra = 0 total_missing = 0 - # Group expected by table expected_by_table: dict[str, list[dict]] = {} for rec in fixture.expected: expected_by_table.setdefault(rec.table, []).append(rec.fields) - # Compare against actual mutations (inserts + updates) tables = set(expected_by_table.keys()) | {m.table for m in mock.mutations} for table in tables: expected_records = expected_by_table.get(table, []) @@ -131,49 +253,160 @@ async def run_single_eval( total_missing += missing precision, recall, f1 = compute_precision_recall(total_expected, total_actual, total_matched) + return all_field_scores, precision, recall, f1, total_extra, total_missing - scores = EvalScores( - fixture_name=fixture.name, - model=model, - prompt_variant=prompt_variant, - field_scores=all_field_scores, - precision=precision, - recall=recall, - f1=f1, - extra_records=total_extra, - missing_records=total_missing, + +# ── Main entry point ────────────────────────────────────────────────────── + + +async def run_single_eval( + fixture: EvalFixture, + model: str, + *, + use_llm_judge: bool = True, + judge_model: str = "gpt-4o-mini", +) -> EvalScores: + """Execute one eval run for a fixture + model. Mode is read from the fixture.""" + from shared.config import settings + from shared.ws_context import set_current_user, clear_current_user + + seed = copy.deepcopy(fixture.seed_records) + mock = MockExecutor( + fixture_dir=fixture.fixture_path.parent, + seed_records=seed, ) - # ── Optional LLM judge ─────────────────────────────────────── - if use_llm_judge and fixture.expected: - all_expected = [r.fields for r in fixture.expected] - all_actual = [m.data for m in mock.mutations if m.action in ("insert", "update")] - judge_score, reasoning = await llm_judge_score( - all_expected, all_actual, judge_model=judge_model, - ) - scores.llm_judge_score = judge_score - scores.llm_judge_reasoning = reasoning + original_model = settings.LLM_MODEL + settings.LLM_MODEL = model + eval_user_id = str(uuid.uuid4()) - # ── Report to Langfuse ─────────────────────────────────────── - dataset_name = f"batch-eval-{fixture.name}" - dataset_item_id = f"{fixture.name}--{prompt_variant}" - run_name = f"{model}--{prompt_variant}--{int(time.time())}" + logger.info( + "eval: starting %s | mode=%s | model=%s", + fixture.name, fixture.mode, model, + ) + start_time = time.time() + + step1_results: list[dict[str, Any]] = [] + step1_reasoning = "" + + try: + set_current_user(eval_user_id) + + if fixture.mode == "step1": + with mock.patch(): + step1_results = await _run_step1(fixture, model, mock) + + elif fixture.mode == "step2": + with mock.patch(): + await _run_step2(fixture, model, mock) + + elif fixture.mode == "full": + with mock.patch(): + # Step 1 — classification (independent from run_local_agent) + if fixture.expected_classification: + step1_results = await _run_step1(fixture, model, mock) + + # Step 2 — full pipeline (run_local_agent handles both steps) + await _run_full(fixture, model, mock, eval_user_id) + + except Exception as exc: + logger.error("eval: pipeline failed for %s/%s: %s", fixture.name, model, exc) + finally: + settings.LLM_MODEL = original_model + clear_current_user() + + elapsed = time.time() - start_time + logger.info("eval: completed in %.1fs — %d mutations", elapsed, len(mock.mutations)) + + # ── Score ───────────────────────────────────────────────────── + + if fixture.mode == "step1": + s1_precision, s1_recall, s1_f1, step1_reasoning = _score_step1(fixture, step1_results) + scores = EvalScores( + fixture_name=fixture.name, + model=model, + prompt_variant=fixture.mode, + precision=s1_precision, + recall=s1_recall, + f1=s1_f1, + llm_judge_reasoning=step1_reasoning, + ) + else: + # step2 or full — score mutations + field_scores, precision, recall, f1, extra, missing = _score_mutations(fixture, mock) + scores = EvalScores( + fixture_name=fixture.name, + model=model, + prompt_variant=fixture.mode, + field_scores=field_scores, + precision=precision, + recall=recall, + f1=f1, + extra_records=extra, + missing_records=missing, + ) + + # Add step1 classification scores for full mode + if fixture.mode == "full" and fixture.expected_classification: + s1_p, s1_r, s1_f1, step1_reasoning = _score_step1(fixture, step1_results) + scores.llm_judge_reasoning = f"Step1 classification:\n{step1_reasoning}" + + # Optional LLM judge for extraction quality + if use_llm_judge and fixture.expected: + all_expected = [r.fields for r in fixture.expected] + all_actual = [m.data for m in mock.mutations if m.action in ("insert", "update")] + judge_score, reasoning = await llm_judge_score( + all_expected, all_actual, judge_model=judge_model, + ) + scores.llm_judge_score = judge_score + if step1_reasoning: + scores.llm_judge_reasoning += f"\n\nLLM judge:\n{reasoning}" + else: + scores.llm_judge_reasoning = reasoning + + # ── Report to Langfuse ──────────────────────────────────────── + prompt_names = { + "step1": ["batch_file_classifier"], + "step2": ["batch_processing"], + "full": ["batch_file_classifier", "batch_processing"], + }.get(fixture.mode, ["batch_processing"]) trace_id = langfuse_eval.log_eval_trace( fixture_name=fixture.name, model=model, - prompt_variant=prompt_variant, - prompt_template=prompt_template, + prompt_variant=fixture.mode, + prompt_template=fixture.custom_prompt_section or "(default)", actual_mutations=[{"action": m.action, "table": m.table, "data": m.data} for m in mock.mutations], scores_summary=scores.summary(), - dataset_name=dataset_name, - run_name=run_name, - dataset_item_id=dataset_item_id, + step1_results=step1_results or None, + langfuse_prompt_names=prompt_names, ) if trace_id: langfuse_eval.post_eval_scores(scores, trace_id=trace_id) + # For full mode, post classification scores separately + if fixture.mode == "full" and fixture.expected_classification: + s1_p, s1_r, s1_f1, _ = _score_step1(fixture, step1_results) + for name, value in [ + ("classification_precision", s1_p), + ("classification_recall", s1_r), + ("classification_f1", s1_f1), + ]: + try: + from langfuse import get_client + lf = get_client() + if lf: + lf.create_score( + name=name, + value=value, + trace_id=trace_id, + data_type="NUMERIC", + comment=f"{fixture.name} | {model} | full", + ) + except Exception: + pass + return scores @@ -181,29 +414,20 @@ async def run_fixture_eval( fixture: EvalFixture, models: list[str], *, - variants: list[str] | None = None, use_llm_judge: bool = True, judge_model: str = "gpt-4o-mini", ) -> list[EvalScores]: - """Run all (model × variant) combinations for a fixture.""" - if variants is None: - variants = list(fixture.prompt_variants.keys()) - - # Sync fixture to Langfuse dataset + """Run all models for a fixture.""" langfuse_eval.sync_fixture_to_dataset(fixture) results: list[EvalScores] = [] for model in models: - for variant in variants: - if variant not in fixture.prompt_variants: - logger.warning("eval: variant %r not found in fixture %s", variant, fixture.name) - continue - scores = await run_single_eval( - fixture, model, variant, - use_llm_judge=use_llm_judge, - judge_model=judge_model, - ) - results.append(scores) + scores = await run_single_eval( + fixture, model, + use_llm_judge=use_llm_judge, + judge_model=judge_model, + ) + results.append(scores) return results @@ -214,18 +438,21 @@ def print_results(results: list[EvalScores]) -> None: print("\nNo eval results.") return - print("\n" + "=" * 90) - print(f"{'Fixture':<25} {'Model':<25} {'Variant':<15} {'P':>6} {'R':>6} {'F1':>6} {'FA':>6} {'LLM':>6}") - print("-" * 90) + print("\n" + "=" * 95) + print(f"{'Fixture':<25} {'Mode':<6} {'Model':<25} {'P':>6} {'R':>6} {'F1':>6} {'FA':>6} {'LLM':>6}") + print("-" * 95) for s in results: llm_str = f"{s.llm_judge_score:.2f}" if s.llm_judge_score is not None else " --" print( - f"{s.fixture_name:<25} {s.model:<25} {s.prompt_variant:<15} " + f"{s.fixture_name:<25} {s.prompt_variant:<6} {s.model:<25} " f"{s.precision:>6.2f} {s.recall:>6.2f} {s.f1:>6.2f} " f"{s.field_accuracy:>6.2f} {llm_str:>6}" ) + print("=" * 95) + print() + print("=" * 90) # If LLM judge reasoning is available, print it diff --git a/services/batch-agent/eval/scorer.py b/services/batch-agent/eval/scorer.py index 51b2500..40e2101 100644 --- a/services/batch-agent/eval/scorer.py +++ b/services/batch-agent/eval/scorer.py @@ -242,7 +242,7 @@ async def llm_judge_score( Returns (score, reasoning). """ - from app.llm import get_llm + from shared.llm import get_llm llm = get_llm(model=judge_model, temperature=0)