feat(eval): add custom system prompt support for step-1 classification

2026-04-06 22:56:30 +02:00
parent fe0dd038ee
commit 7e4374c69b
3 changed files with 126 additions and 36 deletions
--- a/services/batch-agent/app/agent_runner.py
+++ b/services/batch-agent/app/agent_runner.py
@@ -400,6 +400,7 @@ async def _classify_file(
    projects: list[dict],
    config_data_types: list[str],
    langfuse_handler: Any | None = None,
    custom_system_prompt: str | None = None,
 ) -> tuple[str, list[str], str | None]:
    fallback: tuple[str, list[str], str | None] = ("new", list(config_data_types), None)
@@ -421,6 +422,12 @@ async def _classify_file(
        if d in _DOMAIN_DESCRIPTIONS
    )
    if custom_system_prompt:
        # Fixture-provided prompt takes absolute priority
        system = custom_system_prompt.format_map(
            {"domain_definitions": domain_definitions, "projects_list": projects_list}
        )
    else:
        system = tracing.compile_prompt(
            "batch_file_classifier",
            fallback=_STEP1_SYSTEM_PROMPT,
--- a/services/batch-agent/eval/config.py
+++ b/services/batch-agent/eval/config.py
@@ -71,6 +71,7 @@ class EvalFixture:
    # ── Step-1 inputs (classification) ───────────────────────────
    domain_definitions: str = ""
    projects_list: list[dict[str, Any]] = field(default_factory=list)
    custom_step1_prompt: str = ""
    # ── Step-2 inputs (processing) ───────────────────────────────
    existing_context: str = ""
--- a/services/batch-agent/eval/runner.py
+++ b/services/batch-agent/eval/runner.py
@@ -47,30 +47,47 @@ async def _run_step1(
    model: str,
    mock: MockExecutor,
 ) -> list[dict[str, Any]]:
-    """Run step-1 classification for each expected file.
+    """Run step-1 classification for every file in the fixture directory.
-    Returns a list of result dicts:
+    Scans the directory recursively, classifies each file, and returns
    a list of result dicts:
    ``[{file, project_id, domains, new_project_name}, ...]``
    """
    from app.agent_runner import _classify_file
    # Build project name lookup for display
    proj_names: dict[str, str] = {
        p.get("id", ""): p.get("name", "") for p in fixture.projects_list
    }
    # Discover all files in the fixture directory
    all_files = await _scan_fixture_files(mock, fixture.directory)
    print(f"\n  Scanning {len(all_files)} files in {fixture.directory}\n")
    results: list[dict[str, Any]] = []
-    for ec in fixture.expected_classification:
+    for i, file_path in enumerate(all_files, 1):
        # Read the file content through the mock
        file_result = await mock._handle(
            action="read_file_content",
-            data={"path": ec.file},
+            data={"path": file_path},
        )
        file_content: str = file_result.get("content", "")
        if not file_content.strip():
            continue
        project_id, domains, new_name = await _classify_file(
-            file_path=ec.file,
+            file_path=file_path,
            file_content=file_content,
            projects=fixture.projects_list,
            config_data_types=fixture.data_types,
            custom_system_prompt=fixture.custom_step1_prompt or None,
        )
        short_name = file_path.rsplit("/", 1)[-1] if "/" in file_path else file_path
        proj_label = proj_names.get(project_id, new_name or "?")
        print(f"  [{i}/{len(all_files)}] {short_name}  →  {project_id} ({proj_label})  {domains}")
        results.append({
-            "file": ec.file,
+            "file": file_path,
            "project_id": project_id,
            "domains": domains,
            "new_project_name": new_name,
@@ -78,22 +95,64 @@ async def _run_step1(
    return results
 async def _scan_fixture_files(mock: MockExecutor, directory: str) -> list[str]:
    """Recursively list all files under *directory* via the mock executor.

    Each directory is listed with a ``list_directory`` action; entries whose
    ``type`` is ``"directory"`` are descended into, entries whose ``type`` is
    ``"file"`` are collected, and any other entry type is ignored.

    The traversal is iterative and remembers which directories it has already
    listed, so a listing that refers back to itself or to an ancestor cannot
    cause unbounded recursion, and deep trees are not limited by the
    interpreter's recursion limit.

    Args:
        mock: Executor whose ``_handle`` coroutine answers ``list_directory``.
        directory: Path of the directory to start scanning from.

    Returns:
        The collected file paths, sorted.
    """
    files: list[str] = []
    seen_dirs: set[str] = set()
    pending: list[str] = [directory]

    while pending:
        path = pending.pop()
        if path in seen_dirs:
            # Already listed: skip so cyclic listings terminate.
            continue
        seen_dirs.add(path)

        result = await mock._handle(action="list_directory", data={"path": path})
        subdirs: list[str] = []
        for entry in result.get("entries", []):
            entry_type = entry.get("type")
            if entry_type == "directory":
                subdirs.append(entry["path"])
            elif entry_type == "file":
                files.append(entry["path"])

        # Push in reverse so the first sub-directory is popped (and listed)
        # next, matching a depth-first, pre-order walk.
        pending.extend(reversed(subdirs))

    return sorted(files)
 def _score_step1(
    fixture: EvalFixture,
    results: list[dict[str, Any]],
 ) -> tuple[float, float, float, str]:
-    """Score step-1 results. Returns (precision, recall, f1, reasoning)."""
+    """Score step-1 results. Returns (precision, recall, f1, reasoning).
    Files with expected classifications are scored (OK/FAIL).
    Files without expectations are shown as informational (INFO).
    """
    if not fixture.expected_classification:
        return 0.0, 0.0, 0.0, "No expected classifications"
    # Build project name lookup
    proj_names: dict[str, str] = {
        p.get("id", ""): p.get("name", "") for p in fixture.projects_list
    }
    proj_names["new"] = "(new project)"
    def _proj_label(pid: str, new_name: str | None = None) -> str:
        name = proj_names.get(pid, "?")
        if pid == "new" and new_name:
            return f"new → \"{new_name}\""
        return f"{pid} ({name})" if name and name != "?" else pid
    def _short_file(path: str) -> str:
        """Use just the filename for cleaner display."""
        return path.rsplit("/", 1)[-1] if "/" in path else path
    expected_files = {ec.file for ec in fixture.expected_classification}
    total = len(fixture.expected_classification)
    matched = 0
    details: list[str] = []
    scored_lines: list[str] = []
    info_lines: list[str] = []
    # Score expected files
    for ec in fixture.expected_classification:
        actual = next((r for r in results if r["file"] == ec.file), None)
        fname = _short_file(ec.file)
        if actual is None:
-            details.append(f"  MISS {ec.file}: not processed")
+            scored_lines.append(f"  MISS  {fname}")
            scored_lines.append(f"          expected: {_proj_label(ec.project_id)}")
            continue
        pid_ok = actual["project_id"] == ec.project_id
@@ -101,20 +160,41 @@ def _score_step1(
        if pid_ok and domains_ok:
            matched += 1
-            details.append(f"  OK   {ec.file}: project={actual['project_id']}, domains={actual['domains']}")
+            scored_lines.append(f"  OK    {fname}")
            scored_lines.append(f"          project: {_proj_label(actual['project_id'])}")
            scored_lines.append(f"          domains: {actual['domains']}")
        else:
-            parts: list[str] = []
+            scored_lines.append(f"  FAIL  {fname}")
            if not pid_ok:
-                parts.append(f"project expected={ec.project_id} got={actual['project_id']}")
+                scored_lines.append(f"          project: {_proj_label(actual['project_id'])}  (expected: {_proj_label(ec.project_id)})")
            else:
                scored_lines.append(f"          project: {_proj_label(actual['project_id'])}")
            if not domains_ok:
-                parts.append(f"domains expected={ec.domains} got={actual['domains']}")
+                scored_lines.append(f"          domains: {actual['domains']}  (expected: {ec.domains})")
-            details.append(f"  FAIL {ec.file}: {'; '.join(parts)}")
+            else:
                scored_lines.append(f"          domains: {actual['domains']}")
    # Show unscored files
    for r in results:
        if r["file"] not in expected_files:
            fname = _short_file(r["file"])
            proj = _proj_label(r["project_id"], r.get("new_project_name"))
            info_lines.append(f"  ·     {fname}")
            info_lines.append(f"          project: {proj}  |  domains: {r['domains']}")
    precision = matched / total if total > 0 else 0.0
-    recall = precision  # in step1, precision == recall (same denominator)
+    recall = precision
-    f1 = precision  # same
+    f1 = precision
-    reasoning = "\n".join(details)
+
-    return precision, recall, f1, reasoning
+    parts: list[str] = []
    if scored_lines:
        parts.append(f"Scored ({matched}/{total}):")
        parts.extend(scored_lines)
    if info_lines:
        parts.append(f"\nOther files ({len(info_lines) // 2}):")
        parts.extend(info_lines)
    return precision, recall, f1, "\n".join(parts)
 # ── Step 2 runner ─────────────────────────────────────────────────────────
@@ -438,26 +518,28 @@ def print_results(results: list[EvalScores]) -> None:
        print("\nNo eval results.")
        return
-    print("\n" + "=" * 95)
+    W = 90
    print("\n" + "=" * W)
    print(f"{'Fixture':<25} {'Mode':<6} {'Model':<25} {'P':>6} {'R':>6} {'F1':>6} {'FA':>6} {'LLM':>6}")
-    print("-" * 95)
+    print("-" * W)
    for s in results:
        llm_str = f"{s.llm_judge_score:.2f}" if s.llm_judge_score is not None else "  --"
        fa_str = f"{s.field_accuracy:.2f}" if s.field_scores else "  --"
        print(
            f"{s.fixture_name:<25} {s.prompt_variant:<6} {s.model:<25} "
            f"{s.precision:>6.2f} {s.recall:>6.2f} {s.f1:>6.2f} "
-            f"{s.field_accuracy:>6.2f} {llm_str:>6}"
+            f"{fa_str:>6} {llm_str:>6}"
        )
-    print("=" * 95)
+    print("=" * W)
    print()
    print("=" * 90)
    # If LLM judge reasoning is available, print it
    for s in results:
        if s.llm_judge_reasoning:
-            print(f"\n[{s.model} / {s.prompt_variant}] LLM Judge: {s.llm_judge_reasoning}")
+            print(f"\n{'─' * W}")
            print(f"  {s.fixture_name}  |  {s.model}  |  {s.prompt_variant}")
            print(f"{'─' * W}")
            print(s.llm_judge_reasoning)
    print()