# Change notes:
# - Rewrite eval config with EvalMode (step1, step2, full) replacing prompt_variants
# - Rewrite runner with _run_step1, _run_step2, _run_full dispatch
# - CLI: replace --variants with --mode flag
# - Add 3 fixture YAMLs: classify_invoices (step1), process_invoices (step2), full_invoices (full)
# - Remove old freelance_invoices fixture
# - Langfuse: mode-aware dataset items (classifications for step1, extraction for step2, both for full)
# - Langfuse: link both prompts (batch_file_classifier + batch_processing) in full mode
# - Langfuse: post separate classification_precision/recall/f1 scores for full mode
# - Langfuse: skip misleading field_accuracy=0 when field_scores is empty (step1)
# - Langfuse: include step1_results in trace output
# - MockExecutor: mock async_session to bypass DB in full mode
# - Journey fixture: remove user_messages (only interactive test kept)
#
# File info: 41 lines, 1.7 KiB, YAML
# Fixture: classify-invoices (step1)
# Tests _STEP1_SYSTEM_PROMPT — file classification and project matching.
# Verifies that the LLM correctly matches files to existing projects
# and identifies the right data domains.

name: classify-invoices
mode: step1
# Folded scalar: the two sentences below are joined into a single line.
description: >
  Test file classification on Italian freelance invoices and meeting notes.
  Verifies project matching and domain identification.

directory: sample_files/invoices
data_types: [tasks, notes, timelines]
file_extensions: [txt, md]

# ── Step-1 prompt variables ──────────────────────────────────────
# Literal block scalar: the bullet lines below form ONE multi-line string
# (line breaks preserved), not a YAML sequence of mappings.
domain_definitions: |
  - tasks: Action items, deliverables, things to do — anything that someone needs to complete.
  - notes: Meeting summaries, decisions, reference information — permanent knowledge entries.
  - timelines: Project milestones, deadlines, scheduled events — specific dates that mark a point in the progress of a project.

# Existing projects that the input files can be matched against.
projects_list:
  - id: "proj-web-redesign"
    name: "Redesign Sito Web Corporate"
    status: "active"
    aiSummary: "Corporate website redesign for Studio Architettura Bianchi"
  - id: "proj-ecommerce"
    name: "E-Commerce FashionStore"
    status: "active"
    aiSummary: "Next.js e-commerce platform for FashionStore srl"

# ── Expected classification results ─────────────────────────────
# One entry per input file: the project id it should be matched to
# (an id from projects_list) and the domains it should be assigned.
expected_classification:
  - file: "sample_files/invoices/fattura_042.txt"
    project_id: "proj-web-redesign"
    domains: [tasks, notes, timelines]

  - file: "sample_files/invoices/meeting_ecommerce.md"
    project_id: "proj-ecommerce"
    domains: [tasks, notes, timelines]