- cases.yaml: 10 test cases con schema dichiarativo (op, assertions) - data/: 7 file reali (email_action.html, email_thread.html, email_single.html, email_heavy.html, generic_page.html, notes.txt, fallback.txt) - test_preprocessors.py: parametrize da YAML via test_detect / test_preprocess; assertion engine generico (no_html_tags, min_length, compression_ratio, metadata_keys, contains, not_contains, content_type) - requirements.txt: add PyYAML Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
179 lines
6.2 KiB
Python
179 lines
6.2 KiB
Python
"""Tests for the preprocessor system (Step 1 — Local Agent V2).
|
||
|
||
Fixtures are driven by:
|
||
tests/fixtures/preprocessors/cases.yaml — test case definitions
|
||
tests/fixtures/preprocessors/data/ — input files (HTML, txt, ...)
|
||
|
||
Run:
|
||
pytest tests/test_preprocessors.py -v
|
||
|
||
# Only detection tests
|
||
pytest tests/test_preprocessors.py -v -k detect
|
||
|
||
# Only preprocess tests
|
||
pytest tests/test_preprocessors.py -v -k preprocess
|
||
|
||
Langfuse scores are sent when LANGFUSE_SECRET_KEY / LANGFUSE_PUBLIC_KEY are set.
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import re
|
||
from pathlib import Path
|
||
from typing import Any
|
||
|
||
import pytest
|
||
import yaml
|
||
|
||
from app.core.langfuse_client import get_langfuse
|
||
from app.core.preprocessors import detect_content_type, preprocess
|
||
|
||
# ── Paths ──────────────────────────────────────────────────────────────
|
||
|
||
# Fixture layout, resolved relative to the directory containing this module.
_FIXTURES_DIR = Path(__file__).parent.joinpath("fixtures", "preprocessors")
# Input files that individual cases point at through their "file" key.
_DATA_DIR = _FIXTURES_DIR.joinpath("data")
# Declarative test case definitions.
_CASES_FILE = _FIXTURES_DIR.joinpath("cases.yaml")
|
||
|
||
# ── Content generators ─────────────────────────────────────────────────
|
||
|
||
# Synthetic inputs, keyed by the name a case gives in its "generate" field,
# for cases that do not read their content from a fixture file.
_GENERATORS: dict[str, str] = {
    # Each 17-character chunk carries six control characters (\x00-\x05), so
    # the share of non-printable characters is high; this is meant to trigger
    # the "unknown" content-type heuristic.
    "binary_noise": "some\x00\x01\x02\x03\x04\x05content" * 20,
}
|
||
|
||
|
||
def _load_cases() -> list[dict]:
    """Parse the YAML cases file and return the value under its ``cases`` key."""
    with _CASES_FILE.open(encoding="utf-8") as stream:
        document = yaml.safe_load(stream)
    return document["cases"]
|
||
|
||
|
||
def _read_content(case: dict) -> str:
    """Return the raw input text for one test case.

    A case that has a ``generate`` key takes its text from the matching
    ``_GENERATORS`` entry. Any other case is read as UTF-8 from the path in
    its ``file`` key, taken relative to ``_DATA_DIR``.

    Raises:
        ValueError: If ``generate`` names a generator that is not registered.
    """
    if "generate" not in case:
        return (_DATA_DIR / case["file"]).read_text(encoding="utf-8")

    name = case["generate"]
    if name in _GENERATORS:
        return _GENERATORS[name]
    raise ValueError(f"Unknown generator '{name}' in case {case['id']}")
|
||
|
||
|
||
# ── Langfuse helper ───────────────────────────────────────────────────
|
||
|
||
def _lf_score(score_name: str, value: float, comment: str = "") -> None:
    """Send one numeric score to Langfuse, attached to a fresh trace.

    Does nothing when ``get_langfuse()`` returns a falsy value. Otherwise a
    trace named ``eval-<score_name>`` is created, the score is recorded
    against it, and the client is flushed before returning.
    """
    client = get_langfuse()
    if not client:
        return

    eval_trace = client.trace(name=f"eval-{score_name}")
    client.score(
        trace_id=eval_trace.id,
        name=score_name,
        value=value,
        data_type="NUMERIC",
        comment=comment,
    )
    client.flush()
|
||
|
||
|
||
# ── Assertion engine ──────────────────────────────────────────────────
|
||
|
||
def _as_list(value: Any) -> list[Any]:
    """Normalize a YAML scalar-or-sequence value to a list.

    Falsy values (``None``, ``""``, ``[]``) become ``[]``, a single string
    becomes a one-element list, and any other iterable is copied into a list.
    """
    if not value:
        return []
    if isinstance(value, str):
        return [value]
    return list(value)


def _run_assertions(assertions: dict[str, Any], result: Any, raw: str) -> tuple[float, list[str]]:
    """Run all assertions declared in the YAML case.

    Args:
        assertions: Mapping of assertion name to expected value. ``None`` or
            an empty mapping means there is nothing to check. Recognized
            keys: ``no_html_tags``, ``min_length``, ``compression_ratio_lt``,
            ``metadata_keys``, ``contains``, ``not_contains``,
            ``content_type``. Other keys are ignored.
        result: Object exposing ``clean_text``, ``metadata`` and
            ``content_type``; attributes are only read by the assertions
            that need them.
        raw: The unprocessed input text, used as the denominator of the
            compression ratio.

    Returns:
        ``(score, failures)`` where ``score`` is 1.0 when no assertion failed
        and 0.0 otherwise, and ``failures`` holds one message per failure.
    """
    # A bare ``assertions:`` key in YAML loads as None rather than {}.
    assertions = assertions or {}
    failures: list[str] = []

    if assertions.get("no_html_tags"):
        if re.search(r"<[^>]+>", result.clean_text):
            failures.append("clean_text still contains HTML tags")

    min_len = assertions.get("min_length")
    if min_len is not None:
        if len(result.clean_text) < min_len:
            failures.append(
                f"clean_text too short: {len(result.clean_text)} < {min_len}"
            )

    ratio_lt = assertions.get("compression_ratio_lt")
    # The ratio is undefined for empty input, so the check is skipped then.
    if ratio_lt is not None and len(raw) > 0:
        ratio = len(result.clean_text) / len(raw)
        if ratio >= ratio_lt:
            failures.append(f"compression ratio {ratio:.2f} >= {ratio_lt}")

    # Accept a single key name as well as a list, like contains/not_contains;
    # iterating a bare string would otherwise test each character as a key.
    for key in _as_list(assertions.get("metadata_keys")):
        # A key whose value is falsy (empty string, empty list, ...) is
        # reported the same way as an absent key.
        if not result.metadata.get(key):
            failures.append(f"metadata missing key '{key}' (got {result.metadata})")

    for item in _as_list(assertions.get("contains")):
        if item not in result.clean_text:
            failures.append(f"clean_text missing expected substring: {item!r}")

    for item in _as_list(assertions.get("not_contains")):
        if item in result.clean_text:
            failures.append(f"clean_text contains forbidden substring: {item!r}")

    expected_ct = assertions.get("content_type")
    if expected_ct and result.content_type != expected_ct:
        failures.append(
            f"content_type mismatch: expected {expected_ct!r}, got {result.content_type!r}"
        )

    score = 1.0 if not failures else 0.0
    return score, failures
|
||
|
||
|
||
# ── Parametrized: detect ──────────────────────────────────────────────
|
||
|
||
# Cases whose "op" is "detect"; collected once at import time so they can be
# handed to parametrize.
_detect_cases = [case for case in _load_cases() if case["op"] == "detect"]


@pytest.mark.parametrize("case", _detect_cases, ids=[case["id"] for case in _detect_cases])
def test_detect(case: dict) -> None:
    """Check that ``detect_content_type`` returns the type the case expects.

    The outcome is reported through ``_lf_score`` (1.0 on match, 0.0
    otherwise) before the assertion runs, so a failing case is still scored.
    """
    content = _read_content(case)
    detected = detect_content_type(case["input_filename"], content)

    wanted = case["expected_content_type"]
    _lf_score(
        case["score_name"],
        1.0 if detected == wanted else 0.0,
        f"got={detected}, expected={wanted}",
    )

    assert detected == wanted, (
        f"[{case['id']}] {case['description']}: "
        f"expected content_type={wanted!r}, got {detected!r}"
    )
|
||
|
||
|
||
# ── Parametrized: preprocess ──────────────────────────────────────────
|
||
|
||
# Cases whose "op" is "preprocess"; collected once at import time so they can
# be handed to parametrize.
_preprocess_cases = [c for c in _load_cases() if c["op"] == "preprocess"]


@pytest.mark.parametrize(
    "case",
    _preprocess_cases,
    ids=[c["id"] for c in _preprocess_cases],
)
def test_preprocess(case: dict) -> None:
    """Run ``preprocess`` on the case input and check its declared assertions.

    The outcome is reported through ``_lf_score`` before the final assertion
    runs, so a failing case is still scored. The score comment lists the
    failure messages, or the cleaned text length when everything passed.
    """
    raw = _read_content(case)
    result = preprocess(case["input_content_type"], raw)

    # A bare ``assertions:`` key in YAML loads as None, which a .get() default
    # does not replace; normalize it so such a case means "nothing to check".
    assertions = case.get("assertions") or {}
    score, failures = _run_assertions(assertions, result, raw)

    comment = "; ".join(failures) if failures else f"len={len(result.clean_text)}"
    _lf_score(case["score_name"], score, comment)

    assert not failures, (
        f"[{case['id']}] {case['description']} — {len(failures)} assertion(s) failed:\n"
        + "\n".join(f"  • {f}" for f in failures)
    )
|