refactor(eval): 3-mode eval harness (step1/step2/full) with Langfuse fixes

- Rewrite eval config with EvalMode (step1, step2, full) replacing prompt_variants
- Rewrite runner with _run_step1, _run_step2, _run_full dispatch
- CLI: replace --variants with --mode flag
- Add 3 fixture YAMLs: classify_invoices (step1), process_invoices (step2), full_invoices (full)
- Remove old freelance_invoices fixture
- Langfuse: mode-aware dataset items (classifications for step1, extraction for step2, both for full)
- Langfuse: link both prompts (batch_file_classifier + batch_processing) in full mode
- Langfuse: post separate classification_precision/recall/f1 scores for full mode
- Langfuse: skip misleading field_accuracy=0 when field_scores is empty (step1)
- Langfuse: include step1_results in trace output
- MockExecutor: mock async_session to bypass DB in full mode
- Journey fixture: remove user_messages (only interactive test kept)
2026-03-24 16:18:51 +01:00
parent 63fa119543
commit d3f7099d93
13 changed files with 1409 additions and 439 deletions
--- a/services/batch-agent/eval/fixtures/classify_invoices.yaml
+++ b/services/batch-agent/eval/fixtures/classify_invoices.yaml
@@ -0,0 +1,40 @@
+# Fixture: classify-invoices (step1)
+# Tests _STEP1_SYSTEM_PROMPT — file classification and project matching.
+# Verifies that the LLM correctly matches files to existing projects
+# and identifies the right data domains.
+
+name: classify-invoices
+mode: step1
+description: >
+  Test file classification on Italian freelance invoices and meeting notes.
+  Verifies project matching and domain identification.
+
+directory: sample_files/invoices
+data_types: [tasks, notes, timelines]
+file_extensions: [txt, md]
+
+# ── Step-1 prompt variables ──────────────────────────────────────
+domain_definitions: |
+  - tasks: Action items, deliverables, things to do — anything that someone needs to complete.
+  - notes: Meeting summaries, decisions, reference information — permanent knowledge entries.
+  - timelines: Project milestones, deadlines, scheduled events — specific dates that mark a point in the progress of a project.
+
+projects_list:
+  - id: "proj-web-redesign"
+    name: "Redesign Sito Web Corporate"
+    status: "active"
+    aiSummary: "Corporate website redesign for Studio Architettura Bianchi"
+  - id: "proj-ecommerce"
+    name: "E-Commerce FashionStore"
+    status: "active"
+    aiSummary: "Next.js e-commerce platform for FashionStore srl"
+
+# ── Expected classification results ─────────────────────────────
+expected_classification:
+  - file: "sample_files/invoices/fattura_042.txt"
+    project_id: "proj-web-redesign"
+    domains: [tasks, notes, timelines]
+
+  - file: "sample_files/invoices/meeting_ecommerce.md"
+    project_id: "proj-ecommerce"
+    domains: [tasks, notes, timelines]
--- a/services/batch-agent/eval/fixtures/freelance_invoices.yaml
+++ b/services/batch-agent/eval/fixtures/freelance_invoices.yaml
@@ -1,86 +0,0 @@
-# Fixture: freelance-invoices
-# Tests extraction of tasks, notes, and timelines from
-# invoices and meeting notes typical of a freelance workflow.
-
-name: freelance-invoices
-description: >
-  Extract tasks, notes, and timeline events from Italian freelance
-  invoices and meeting notes. Tests project matching, priority
-  mapping, and bilingual content handling.
-
-directory: sample_files/invoices
-data_types: [tasks, notes, timelines]
-file_extensions: [txt, md]
-
-# Pre-existing records in the "database"
-seed_records:
-  projects:
-    - id: "proj-web-redesign"
-      name: "Redesign Sito Web Corporate"
-      status: "active"
-      aiSummary: "Corporate website redesign for Studio Architettura Bianchi"
-    - id: "proj-ecommerce"
-      name: "E-Commerce FashionStore"
-      status: "active"
-      aiSummary: "Next.js e-commerce platform for FashionStore srl"
-  tasks: []
-  notes: []
-  timelines: []
-
-# Prompt variations to compare
-prompt_variants:
-  baseline: |
-    Extract action items as tasks and summaries as notes.
-    For timelines, extract any mentioned dates and deadlines.
-    Set isAiSuggested=1 on every record.
-
-  detailed_italian: |
-    Estrai i dati dai file come segue:
-    - TASK: ogni azione da fare, deliverable, o item con scadenza.
-      Mappa "URGENTE" o "ALTA PRIORITÀ" → priority: high.
-      Mappa "media priorità" → priority: medium.
-      Mappa "bassa priorità" → priority: low.
-      Se un item è marcato come "completato" o [x], impostalo status: done.
-      Altrimenti status: todo.
-    - NOTE: riassunti di meeting, decisioni prese, note tecniche.
-      Il titolo deve essere descrittivo. Il content deve includere tutti i dettagli.
-    - TIMELINE: date di scadenza, milestone, meeting futuri.
-      Formato data: timestamp Unix in millisecondi.
-    Imposta sempre isAiSuggested=1.
-
-  minimal: |
-    Extract only high-priority action items as tasks.
-    Ignore notes and timelines unless explicitly marked as important.
-    Set isAiSuggested=1.
-
-# Expected extractions (what the agent SHOULD produce)
-# Only key fields are specified — scorer uses fuzzy matching
-expected:
-  tasks:
-    - title: "Sviluppo frontend React"
-      priority: "high"
-      status: "todo"
-    - title: "Integrazione API backend"
-      priority: "medium"
-      status: "todo"
-    - title: "Testing cross-browser e fix bug responsive"
-      status: "todo"
-    - title: "Preparare wireframe homepage"
-      priority: "high"
-      status: "todo"
-    - title: "Setup progetto Next.js e configurare CI/CD"
-      priority: "medium"
-      status: "todo"
-    - title: "Ricerca plugin Stripe per gestione abbonamenti"
-      priority: "low"
-      status: "todo"
-
-  notes:
-    - title: "Meeting Kickoff Progetto E-Commerce"
-
-  timelines:
-    - title: "MVP E-Commerce pronto"
-    - title: "Meeting di revisione"
-
-# Models to test (can be overridden via CLI --models)
-models: []
--- a/services/batch-agent/eval/fixtures/full_invoices.yaml
+++ b/services/batch-agent/eval/fixtures/full_invoices.yaml
@@ -0,0 +1,108 @@
+# Fixture: full-invoices (full)
+# Tests both _STEP1_SYSTEM_PROMPT and _PROCESSING_SYSTEM_PROMPT in sequence
+# via run_local_agent(). Verifies end-to-end classification + extraction.
+
+name: full-invoices
+mode: full
+description: >
+  End-to-end test: classify Italian invoices/meeting notes into the
+  correct project, then extract tasks, notes, and timeline events.
+
+directory: sample_files/invoices
+data_types: [tasks, notes, timelines]
+file_extensions: [txt, md]
+
+# ── Step-1 prompt variables ──────────────────────────────────────
+domain_definitions: |
+  - tasks: Action items, deliverables, things to do — anything that someone needs to complete.
+  - notes: Meeting summaries, decisions, reference information — permanent knowledge entries.
+  - timelines: Project milestones, deadlines, scheduled events — specific dates that mark a point in the progress of a project.
+
+projects_list:
+  - id: "proj-web-redesign"
+    name: "Redesign Sito Web Corporate"
+    status: "active"
+    aiSummary: "Corporate website redesign for Studio Architettura Bianchi"
+  - id: "proj-ecommerce"
+    name: "E-Commerce FashionStore"
+    status: "active"
+    aiSummary: "Next.js e-commerce platform for FashionStore srl"
+
+# ── Step-2 prompt variables ──────────────────────────────────────
+existing_context: |
+  Existing tasks:
+    (none)
+
+  Existing notes:
+    (none)
+
+  Existing timelines:
+    (none)
+
+project_context: ""
+
+custom_prompt_section: |
+  User instructions:
+  Estrai i dati dai file come segue:
+  - TASK: ogni azione da fare, deliverable, o item con scadenza.
+    Mappa "URGENTE" o "ALTA PRIORITÀ" → priority: high.
+    Mappa "media priorità" → priority: medium.
+    Mappa "bassa priorità" → priority: low.
+    Se un item è marcato come "completato" o [x], impostalo status: done.
+    Altrimenti status: todo.
+  - NOTE: riassunti di meeting, decisioni prese, note tecniche.
+  - TIMELINE: date di scadenza, milestone, meeting futuri.
+  Imposta sempre isAiSuggested=1.
+
+# ── Seed records (pre-existing DB state) ─────────────────────────
+seed_records:
+  projects:
+    - id: "proj-web-redesign"
+      name: "Redesign Sito Web Corporate"
+      status: "active"
+      aiSummary: "Corporate website redesign for Studio Architettura Bianchi"
+    - id: "proj-ecommerce"
+      name: "E-Commerce FashionStore"
+      status: "active"
+      aiSummary: "Next.js e-commerce platform for FashionStore srl"
+  tasks: []
+  notes: []
+  timelines: []
+
+# ── Expected classification (step 1) ─────────────────────────────
+expected_classification:
+  - file: "sample_files/invoices/fattura_042.txt"
+    project_id: "proj-web-redesign"
+    domains: [tasks, notes, timelines]
+
+  - file: "sample_files/invoices/meeting_ecommerce.md"
+    project_id: "proj-ecommerce"
+    domains: [tasks, notes, timelines]
+
+# ── Expected extractions (step 2) ────────────────────────────────
+expected:
+  tasks:
+    - title: "Sviluppo frontend React"
+      priority: "high"
+      status: "todo"
+    - title: "Integrazione API backend"
+      priority: "medium"
+      status: "todo"
+    - title: "Testing cross-browser e fix bug responsive"
+      status: "todo"
+    - title: "Preparare wireframe homepage"
+      priority: "high"
+      status: "todo"
+    - title: "Setup progetto Next.js e configurare CI/CD"
+      priority: "medium"
+      status: "todo"
+    - title: "Ricerca plugin Stripe per gestione abbonamenti"
+      priority: "low"
+      status: "todo"
+
+  notes:
+    - title: "Meeting Kickoff Progetto E-Commerce"
+
+  timelines:
+    - title: "MVP E-Commerce pronto"
+    - title: "Meeting di revisione"
--- a/services/batch-agent/eval/fixtures/journey_invoice_setup.yaml
+++ b/services/batch-agent/eval/fixtures/journey_invoice_setup.yaml
@@ -1,43 +1,25 @@
 # Journey Fixture: journey-invoice-setup
-# Tests that the journey chatbot correctly builds a prompt_template
-# for extracting tasks and notes from Italian invoices and meeting notes.
+# Used by `python -m eval interactive` for human-in-the-loop testing
+# of the journey chatbot's prompt-building conversation.

 type: journey
 name: journey-invoice-setup
 description: >
-  Test the journey chatbot's ability to explore a directory of Italian
-  invoices and meeting notes, ask relevant questions, and produce a
-  well-structured prompt_template for data extraction.
+  Interactive test for the journey chatbot — explore a directory of
+  Italian invoices and meeting notes, answer the chatbot's questions,
+  and verify it produces a well-structured prompt_template for data
+  extraction.

 directory: sample_files/invoices
-data_types: [tasks, notes, timelines]
-
-# Simulated user responses (the journey starts with the LLM exploring
-# the directory and asking its first question)
-user_messages:
-  - >
-    I want to extract action items from invoices and meeting notes.
-    The invoices are in Italian and contain work descriptions with
-    deadlines. Meeting notes have action items with checkboxes.
-  - >
-    Yes, map Italian priority keywords: "URGENTE" and "ALTA PRIORITÀ"
-    should be high priority, "media priorità" is medium, "bassa priorità"
-    is low. Items marked with [x] are already completed.
-  - >
-    For notes, I want meeting summaries with the full content including
-    decisions and attendees. For timelines, extract deadlines and
-    scheduled meeting dates.
-  - >
-    That's everything I need. Please generate the template.
+data_types: [tasks, notes, timelines, projects]

 # Criteria the generated prompt_template must satisfy
 # Each is scored 0-1 by an LLM judge
 expected_template_criteria:
  - "Mentions creating tasks from action items and work descriptions"
-  - "Includes Italian priority keyword mapping (URGENTE→high, media priorità→medium, bassa priorità→low)"
-  - "Handles completed items marked with [x] as status done"
  - "Mentions creating notes from meeting summaries"
  - "Mentions extracting timeline events from deadlines and meeting dates"
+  - "Mentions creating projects from relevant information"
  - "Sets isAiSuggested=1 on all created records"
  - "Does NOT include projectId assignment logic"
  - "Uses camelCase field names (title, status, priority, dueDate, content)"
--- a/services/batch-agent/eval/fixtures/process_invoices.yaml
+++ b/services/batch-agent/eval/fixtures/process_invoices.yaml
@@ -0,0 +1,81 @@
+# Fixture: process-invoices (step2)
+# Tests _PROCESSING_SYSTEM_PROMPT — data extraction & tool calling.
+# The classification step is skipped; prompt variables are injected directly.
+
+name: process-invoices
+mode: step2
+description: >
+  Test data extraction from Italian freelance invoices and meeting notes.
+  Verifies correct record creation via tool calls with the right
+  fields, priorities, and status values.
+
+directory: sample_files/invoices
+data_types: [tasks, notes, timelines]
+file_extensions: [txt, md]
+
+# ── Step-2 prompt variables ──────────────────────────────────────
+existing_context: |
+  Existing tasks:
+    (none)
+
+  Existing notes:
+    (none)
+
+  Existing timelines:
+    (none)
+
+project_context: >
+  Project: Redesign Sito Web Corporate (id: proj-web-redesign).
+  Always set projectId to this id on every record you create.
+
+custom_prompt_section: |
+  User instructions:
+  Estrai i dati dai file come segue:
+  - TASK: ogni azione da fare, deliverable, o item con scadenza.
+    Mappa "URGENTE" o "ALTA PRIORITÀ" → priority: high.
+    Mappa "media priorità" → priority: medium.
+    Mappa "bassa priorità" → priority: low.
+    Se un item è marcato come "completato" o [x], impostalo status: done.
+    Altrimenti status: todo.
+  - NOTE: riassunti di meeting, decisioni prese, note tecniche.
+    Il titolo deve essere descrittivo. Il content deve includere tutti i dettagli.
+  - TIMELINE: date di scadenza, milestone, meeting futuri.
+  Imposta sempre isAiSuggested=1.
+
+# ── Seed records (pre-existing DB state) ─────────────────────────
+seed_records:
+  projects:
+    - id: "proj-web-redesign"
+      name: "Redesign Sito Web Corporate"
+      status: "active"
+  tasks: []
+  notes: []
+  timelines: []
+
+# ── Expected extractions ─────────────────────────────────────────
+expected:
+  tasks:
+    - title: "Sviluppo frontend React"
+      priority: "high"
+      status: "todo"
+    - title: "Integrazione API backend"
+      priority: "medium"
+      status: "todo"
+    - title: "Testing cross-browser e fix bug responsive"
+      status: "todo"
+    - title: "Preparare wireframe homepage"
+      priority: "high"
+      status: "todo"
+    - title: "Setup progetto Next.js e configurare CI/CD"
+      priority: "medium"
+      status: "todo"
+    - title: "Ricerca plugin Stripe per gestione abbonamenti"
+      priority: "low"
+      status: "todo"
+
+  notes:
+    - title: "Meeting Kickoff Progetto E-Commerce"
+
+  timelines:
+    - title: "MVP E-Commerce pronto"
+    - title: "Meeting di revisione"