feat(batch-agent): add journey eval to E2E harness
- journey_runner.py: orchestrates journey start → simulated user messages → template extraction → LLM judge scoring
- config.py: JourneyFixture dataclass with user_messages and expected_template_criteria, plus discover_journey_fixtures()
- langfuse_eval.py: sync_journey_fixture_to_dataset()
- cli.py: new 'journey' subcommand (python -m eval journey) with --fixture, --models, --judge-model flags
- fixtures/journey_invoice_setup.yaml: example journey fixture with 4 user messages and 8 quality criteria
This commit is contained in:
@@ -96,6 +96,52 @@ def sync_fixture_to_dataset(fixture: EvalFixture) -> str | None:
|
||||
return dataset_name
|
||||
|
||||
|
||||
def sync_journey_fixture_to_dataset(fixture) -> str | None:
    """Create or update a Langfuse dataset from a journey fixture.

    Each journey fixture becomes a single dataset item with:
      - input: {directory, data_types, user_messages}
      - expected_output: {criteria}

    The sync is best-effort: failures while creating the dataset or the
    dataset item are logged, never raised.

    Args:
        fixture: Journey fixture object exposing ``name``, ``description``,
            ``directory``, ``data_types``, ``user_messages`` and
            ``expected_template_criteria`` attributes.

    Returns:
        The dataset name (``journey-eval-<fixture.name>``), or ``None`` when
        Langfuse is not configured. The name is returned even if the item
        upsert failed; that failure is reported through a warning log.
    """
    lf = _get_langfuse()
    if lf is None:
        logger.info("langfuse_eval: Langfuse not configured — skipping journey dataset sync")
        return None

    dataset_name = f"journey-eval-{fixture.name}"

    try:
        lf.create_dataset(
            name=dataset_name,
            description=fixture.description,
            metadata={"type": "journey", "data_types": fixture.data_types},
        )
    except Exception as exc:
        # The dataset may already exist, which is fine. The error is still
        # recorded so that genuine failures (auth, network, bad payload)
        # are not silently lost.
        logger.debug(
            "langfuse_eval: create_dataset for '%s' failed (dataset may already exist): %s",
            dataset_name,
            exc,
        )

    item_id = f"{fixture.name}--journey"
    item_synced = False
    try:
        lf.create_dataset_item(
            dataset_name=dataset_name,
            id=item_id,
            input={
                "directory": fixture.directory,
                "data_types": fixture.data_types,
                "user_messages": fixture.user_messages,
            },
            expected_output={
                "criteria": fixture.expected_template_criteria,
            },
            metadata={"type": "journey"},
        )
    except Exception as exc:
        logger.warning("langfuse_eval: failed to upsert journey dataset item %s: %s", item_id, exc)
    else:
        item_synced = True

    lf.flush()
    # Only report success when the item upsert actually went through, so the
    # log does not contradict the warning emitted above.
    if item_synced:
        logger.info("langfuse_eval: synced journey fixture '%s' → dataset '%s'", fixture.name, dataset_name)
    return dataset_name
|
||||
|
||||
|
||||
def create_eval_run(
|
||||
dataset_name: str,
|
||||
run_name: str,
|
||||
|
||||
Reference in New Issue
Block a user