Scoring is only meaningful for LLM-backed steps. Preprocess tests are deterministic Python, so scores add no value. Kept only for detect tests. - test_preprocess: drop _lf_score call, simplify _run_assertions return type - cases.yaml: remove score_name from all op=preprocess entries Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
172 lines
6.0 KiB
Python
172 lines
6.0 KiB
Python
"""Tests for the preprocessor system (Step 1 — Local Agent V2).
|
|
|
|
Fixtures are driven by:
|
|
tests/fixtures/preprocessors/cases.yaml — test case definitions
|
|
tests/fixtures/preprocessors/data/ — input files (HTML, txt, ...)
|
|
|
|
Run:
|
|
pytest tests/test_preprocessors.py -v
|
|
|
|
# Only detection tests
|
|
pytest tests/test_preprocessors.py -v -k detect
|
|
|
|
# Only preprocess tests
|
|
pytest tests/test_preprocessors.py -v -k preprocess
|
|
|
|
Langfuse scores are sent for detect tests only (preprocess tests are not scored),
and only when LANGFUSE_SECRET_KEY / LANGFUSE_PUBLIC_KEY are set.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
import pytest
|
|
import yaml
|
|
|
|
from app.core.langfuse_client import get_langfuse
|
|
from app.core.preprocessors import detect_content_type, preprocess
|
|
|
|
# ── Paths ──────────────────────────────────────────────────────────────
|
|
|
|
# Root of the preprocessor fixtures, located next to this test module.
_FIXTURES_DIR = Path(__file__).parent.joinpath("fixtures", "preprocessors")
# Input documents referenced by the ``file`` field of a case.
_DATA_DIR = _FIXTURES_DIR.joinpath("data")
# YAML file holding the test case definitions.
_CASES_FILE = _FIXTURES_DIR.joinpath("cases.yaml")
|
|
|
|
# ── Content generators ─────────────────────────────────────────────────
|
|
|
|
# Synthetic inputs, looked up by the ``generate`` field of a case instead of
# being read from a fixture file.
_GENERATORS: dict[str, str] = dict(
    # High ratio of non-printable chars → triggers "unknown" heuristic
    binary_noise="some\x00\x01\x02\x03\x04\x05content" * 20,
)
|
|
|
|
|
|
def _load_cases() -> list[dict]:
    """Parse the cases file and return the list stored under its ``cases`` key."""
    with _CASES_FILE.open(encoding="utf-8") as stream:
        document = yaml.safe_load(stream)
    return document["cases"]
|
|
|
|
|
|
def _read_content(case: dict) -> str:
    """Return the raw input text for a test case.

    A case either names a fixture file (``file``, read as UTF-8 relative to
    the data directory) or a synthetic generator (``generate``); the generator
    takes precedence when both are present.

    Raises:
        ValueError: if ``generate`` names a generator that is not registered.
    """
    if "generate" not in case:
        return (_DATA_DIR / case["file"]).read_text(encoding="utf-8")

    generator_name = case["generate"]
    if generator_name in _GENERATORS:
        return _GENERATORS[generator_name]
    raise ValueError(f"Unknown generator '{generator_name}' in case {case['id']}")
|
|
|
|
|
|
# ── Langfuse helper ───────────────────────────────────────────────────
|
|
|
|
def _lf_score(score_name: str, value: float, comment: str = "") -> None:
    """Send one numeric score to Langfuse on a freshly created trace.

    Does nothing when ``get_langfuse()`` returns a falsy value. Otherwise a
    trace named ``eval-<score_name>`` is created, the score is attached to it
    and the client is flushed before returning.
    """
    client = get_langfuse()
    if not client:
        return

    trace_id = client.trace(name=f"eval-{score_name}").id
    client.score(
        trace_id=trace_id,
        name=score_name,
        value=value,
        data_type="NUMERIC",
        comment=comment,
    )
    client.flush()
|
|
|
|
|
|
# ── Assertion engine ──────────────────────────────────────────────────
|
|
|
|
def _as_list(value: Any) -> list:
    """Normalise a YAML scalar-or-sequence value into a list (falsy → empty)."""
    if not value:
        return []
    if isinstance(value, str):
        return [value]
    return list(value)


def _run_assertions(assertions: dict[str, Any], result: Any, raw: str) -> list[str]:
    """Run all assertions declared in the YAML case. Returns failure messages.

    Supported assertion keys (all optional):
        no_html_tags: truthy → ``clean_text`` must not match ``<...>``.
        min_length: minimum ``len(clean_text)``.
        compression_ratio_lt: ``len(clean_text) / len(raw)`` must be strictly
            below this value; skipped when ``raw`` is empty.
        metadata_keys: key or list of keys that must map to a truthy value in
            ``result.metadata``.
        contains / not_contains: substring or list of substrings that must /
            must not occur in ``clean_text``.
        content_type: expected ``result.content_type``.

    Args:
        assertions: the ``assertions`` mapping of a case.
        result: object exposing ``clean_text``, ``metadata`` and
            ``content_type``; attributes are read only by the checks that
            need them.
        raw: the unprocessed input, used for the compression ratio.

    Returns:
        One message per failed assertion, in declaration order above; an
        empty list when everything passed.
    """
    failures: list[str] = []

    if assertions.get("no_html_tags") and re.search(r"<[^>]+>", result.clean_text):
        failures.append("clean_text still contains HTML tags")

    min_len = assertions.get("min_length")
    if min_len is not None and len(result.clean_text) < min_len:
        failures.append(
            f"clean_text too short: {len(result.clean_text)} < {min_len}"
        )

    ratio_lt = assertions.get("compression_ratio_lt")
    if ratio_lt is not None and len(raw) > 0:  # empty input has no ratio
        ratio = len(result.clean_text) / len(raw)
        if ratio >= ratio_lt:
            failures.append(f"compression ratio {ratio:.2f} >= {ratio_lt}")

    # Accept a single key as well as a list, like contains / not_contains do;
    # iterating a bare string would otherwise check each character as a key.
    for key in _as_list(assertions.get("metadata_keys")):
        # A key whose value is present but falsy (e.g. "") counts as missing.
        if not result.metadata.get(key):
            failures.append(f"metadata missing key '{key}' (got {result.metadata})")

    for item in _as_list(assertions.get("contains")):
        if item not in result.clean_text:
            failures.append(f"clean_text missing expected substring: {item!r}")

    for item in _as_list(assertions.get("not_contains")):
        if item in result.clean_text:
            failures.append(f"clean_text contains forbidden substring: {item!r}")

    expected_ct = assertions.get("content_type")
    if expected_ct and result.content_type != expected_ct:
        failures.append(
            f"content_type mismatch: expected {expected_ct!r}, got {result.content_type!r}"
        )

    return failures
|
|
|
|
|
|
# ── Parametrized: detect ──────────────────────────────────────────────
|
|
|
|
# Collected at import time so the cases can drive the parametrization below.
_detect_cases = [entry for entry in _load_cases() if entry["op"] == "detect"]


@pytest.mark.parametrize(
    "case",
    _detect_cases,
    ids=[entry["id"] for entry in _detect_cases],
)
def test_detect(case: dict) -> None:
    """Check content-type detection for one ``op: detect`` case.

    The outcome is also reported through ``_lf_score`` (1.0 on match, 0.0
    otherwise) before the assertion runs, so failing cases are scored too.
    """
    content = _read_content(case)
    detected = detect_content_type(case["input_filename"], content)
    expected = case["expected_content_type"]

    _lf_score(
        case["score_name"],
        float(detected == expected),
        f"got={detected}, expected={expected}",
    )

    assert detected == expected, (
        f"[{case['id']}] {case['description']}: "
        f"expected content_type={expected!r}, got {detected!r}"
    )
|
|
|
|
|
|
# ── Parametrized: preprocess ──────────────────────────────────────────
|
|
|
|
# Collected at import time so the cases can drive the parametrization below.
_preprocess_cases = [entry for entry in _load_cases() if entry["op"] == "preprocess"]


@pytest.mark.parametrize(
    "case",
    _preprocess_cases,
    ids=[entry["id"] for entry in _preprocess_cases],
)
def test_preprocess(case: dict) -> None:
    """Preprocess one ``op: preprocess`` case and check its declared assertions.

    All assertions are evaluated before failing, so the message lists every
    failed check at once. No score is sent for these cases.
    """
    raw = _read_content(case)
    result = preprocess(case["input_content_type"], raw)
    failures = _run_assertions(case.get("assertions", {}), result, raw)

    # The message is only built on failure, one bullet per failed check.
    assert not failures, (
        f"[{case['id']}] {case['description']} — {len(failures)} assertion(s) failed:\n"
        + "\n".join(f" • {message}" for message in failures)
    )
|