refactor(eval): 3-mode eval harness (step1/step2/full) with Langfuse fixes

- Rewrite eval config with EvalMode (step1, step2, full), replacing prompt_variants
- Rewrite runner with _run_step1, _run_step2, _run_full dispatch
- CLI: replace --variants with --mode flag
- Add 3 fixture YAMLs: classify_invoices (step1), process_invoices (step2), full_invoices (full)
- Remove old freelance_invoices fixture
- Langfuse: mode-aware dataset items (classifications for step1, extraction for step2, both for full)
- Langfuse: link both prompts (batch_file_classifier + batch_processing) in full mode
- Langfuse: post separate classification_precision/recall/f1 scores for full mode
- Langfuse: skip misleading field_accuracy=0 when field_scores is empty (step1)
- Langfuse: include step1_results in trace output
- MockExecutor: mock async_session to bypass DB in full mode
- Journey fixture: remove user_messages (only the interactive test is kept)
This commit is contained in:
Roberto Musso
2026-03-24 16:18:51 +01:00
parent 63fa119543
commit d3f7099d93
13 changed files with 1409 additions and 439 deletions

View File

@@ -94,7 +94,7 @@ async def _judge_template(
Returns (criteria_scores, reasoning).
"""
from app.llm import get_llm
from shared.llm import get_llm
llm = get_llm(model=judge_model, temperature=0)
@@ -152,13 +152,23 @@ async def run_single_journey_eval(
model: str,
*,
judge_model: str = "gpt-4o-mini",
data_dir: Path | None = None,
) -> JourneyEvalResult:
"""Execute one journey eval: start → messages → score template."""
"""Execute one journey eval: start \u2192 messages \u2192 score template."""
from shared.config import settings
# Build mock executor for filesystem tools
# When data_dir is given, use its parent as MockExecutor root
# and its name as the journey directory so the LLM sees a
# meaningful path (not ".").
if data_dir:
mock_root = data_dir.parent
journey_directory = data_dir.name
else:
mock_root = fixture.fixture_path.parent
journey_directory = fixture.directory
mock = MockExecutor(
fixture_dir=fixture.fixture_dir,
fixture_dir=mock_root,
seed_records={},
)
@@ -178,7 +188,7 @@ async def run_single_journey_eval(
done = False
try:
from app.ws_context import set_current_user, clear_current_user
from shared.ws_context import set_current_user, clear_current_user
from app.journey import handle_journey_start, handle_journey_message, _sessions
set_current_user(eval_user_id)
@@ -186,7 +196,7 @@ async def run_single_journey_eval(
# ── Start the journey ────────────────────────────────
start_frame: dict[str, Any] = {
"agent_type": "local",
"directory": fixture.directory,
"directory": journey_directory,
"data_types": fixture.data_types,
"session_id": f"eval-{uuid.uuid4().hex[:8]}",
}
@@ -246,7 +256,7 @@ async def run_single_journey_eval(
logger.error("journey_eval: pipeline failed for %s/%s: %s", fixture.name, model, exc)
finally:
settings.LLM_MODEL = original_model
from app.ws_context import clear_current_user
from shared.ws_context import clear_current_user
clear_current_user()
elapsed = time.time() - start_time
@@ -297,6 +307,7 @@ async def run_single_journey_eval(
prompt_template=prompt_template or "(not generated)",
actual_mutations=[{"conversation": conversation[:20]}],
scores_summary=result.summary(),
langfuse_prompt_names=["journey_system"],
)
if trace_id:
@@ -321,6 +332,7 @@ async def run_journey_fixture_eval(
models: list[str],
*,
judge_model: str = "gpt-4o-mini",
data_dir: Path | None = None,
) -> list[JourneyEvalResult]:
"""Run all models for a journey fixture."""
langfuse_eval.sync_journey_fixture_to_dataset(fixture)
@@ -329,6 +341,7 @@ async def run_journey_fixture_eval(
for model in models:
result = await run_single_journey_eval(
fixture, model, judge_model=judge_model,
data_dir=data_dir,
)
results.append(result)