Files
api/tests/test_preprocessors.py
Roberto Musso bf445ac2ce refactor(tests): YAML-driven fixtures for preprocessor tests
- cases.yaml: 10 test cases con schema dichiarativo (op, assertions)
- data/: 7 file reali (email_action.html, email_thread.html, email_single.html,
  email_heavy.html, generic_page.html, notes.txt, fallback.txt)
- test_preprocessors.py: parametrize da YAML via test_detect / test_preprocess;
  assertion engine generico (no_html_tags, min_length, compression_ratio,
  metadata_keys, contains, not_contains, content_type)
- requirements.txt: add PyYAML

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 10:44:41 +02:00

179 lines
6.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Tests for the preprocessor system (Step 1 — Local Agent V2).
Fixtures are driven by:
    tests/fixtures/preprocessors/cases.yaml — test case definitions
    tests/fixtures/preprocessors/data/ — input files (HTML, txt, ...)

Run:
    pytest tests/test_preprocessors.py -v

    # Only detection tests
    pytest tests/test_preprocessors.py -v -k detect

    # Only preprocess tests
    pytest tests/test_preprocessors.py -v -k preprocess

Langfuse scores are sent when LANGFUSE_SECRET_KEY / LANGFUSE_PUBLIC_KEY are set.
"""
from __future__ import annotations
import re
from pathlib import Path
from typing import Any
import pytest
import yaml
from app.core.langfuse_client import get_langfuse
from app.core.preprocessors import detect_content_type, preprocess
# ── Paths ──────────────────────────────────────────────────────────────
# Every fixture location is resolved relative to this test module's directory.
_FIXTURES_DIR = Path(__file__).parent.joinpath("fixtures", "preprocessors")
_DATA_DIR = _FIXTURES_DIR.joinpath("data")
_CASES_FILE = _FIXTURES_DIR.joinpath("cases.yaml")

# ── Content generators ─────────────────────────────────────────────────
# Synthetic inputs, looked up by the name a case gives under its "generate" key.
_GENERATORS: dict[str, str] = dict(
    # High ratio of non-printable chars → triggers "unknown" heuristic
    binary_noise="some\x00\x01\x02\x03\x04\x05content" * 20,
)
def _load_cases() -> list[dict]:
    """Parse the YAML case file and return its test case definitions.

    Returns:
        The value stored under the top-level ``cases`` key of ``_CASES_FILE``.
    """
    with _CASES_FILE.open(encoding="utf-8") as stream:
        document = yaml.safe_load(stream)
    return document["cases"]
def _read_content(case: dict) -> str:
    """Return the raw input text for one test case.

    A case carrying a ``generate`` key takes its content from ``_GENERATORS``;
    any other case reads ``case["file"]`` from ``_DATA_DIR`` as UTF-8 text.

    Args:
        case: One case mapping loaded from the YAML case file.

    Returns:
        The input content as a string.

    Raises:
        ValueError: If ``case["generate"]`` does not name a known generator.
    """
    if "generate" not in case:
        return (_DATA_DIR / case["file"]).read_text(encoding="utf-8")

    generator_name = case["generate"]
    if generator_name in _GENERATORS:
        return _GENERATORS[generator_name]
    raise ValueError(f"Unknown generator '{generator_name}' in case {case['id']}")
# ── Langfuse helper ───────────────────────────────────────────────────
def _lf_score(score_name: str, value: float, comment: str = "") -> None:
    """Send one numeric score to Langfuse, or do nothing without a client.

    Args:
        score_name: Name of the score; also used to name the trace
            (``eval-<score_name>``).
        value: Numeric score value to record.
        comment: Optional free-text comment stored with the score.
    """
    client = get_langfuse()
    if not client:
        # get_langfuse() returned nothing usable: skip reporting entirely.
        return

    trace = client.trace(name=f"eval-{score_name}")
    score_fields = {
        "trace_id": trace.id,
        "name": score_name,
        "value": value,
        "data_type": "NUMERIC",
        "comment": comment,
    }
    client.score(**score_fields)
    client.flush()
# ── Assertion engine ──────────────────────────────────────────────────
def _as_list(value: Any) -> list:
    """Normalize a YAML scalar-or-list value to a list; falsy values give ``[]``."""
    if not value:
        return []
    if isinstance(value, str):
        return [value]
    return list(value)


def _run_assertions(assertions: dict[str, Any], result: Any, raw: str) -> tuple[float, list[str]]:
    """Run all assertions declared in the YAML case.

    Recognized keys in ``assertions`` (each one is skipped when absent/falsy):
      * ``no_html_tags``: ``clean_text`` must not match ``<[^>]+>``.
      * ``min_length``: minimum ``len(clean_text)``.
      * ``compression_ratio_lt``: ``len(clean_text) / len(raw)`` must be
        strictly below this value (skipped when ``raw`` is empty).
      * ``metadata_keys``: key or list of keys that must have a truthy value
        in ``result.metadata``.
      * ``contains`` / ``not_contains``: substring or list of substrings that
        must / must not occur in ``clean_text``.
      * ``content_type``: expected ``result.content_type``.

    Args:
        assertions: The ``assertions`` mapping of one case.
        result: Object exposing ``clean_text``, ``metadata`` and
            ``content_type``; attributes are only read by the assertions
            that need them.
        raw: The unprocessed input the result was produced from.

    Returns:
        ``(score, failures)``: ``score`` is 1.0 when every assertion passed
        and 0.0 otherwise; ``failures`` holds one message per failed check.
    """
    failures: list[str] = []

    if assertions.get("no_html_tags"):
        if re.search(r"<[^>]+>", result.clean_text):
            failures.append("clean_text still contains HTML tags")

    min_len = assertions.get("min_length")
    if min_len is not None:
        if len(result.clean_text) < min_len:
            failures.append(
                f"clean_text too short: {len(result.clean_text)} < {min_len}"
            )

    ratio_lt = assertions.get("compression_ratio_lt")
    # An empty input has no meaningful ratio, so the check is skipped.
    if ratio_lt is not None and len(raw) > 0:
        ratio = len(result.clean_text) / len(raw)
        if ratio >= ratio_lt:
            failures.append(f"compression ratio {ratio:.2f} >= {ratio_lt}")

    # Normalized like contains/not_contains so that a scalar key is not
    # iterated character by character and an explicit null does not raise.
    for key in _as_list(assertions.get("metadata_keys")):
        # A key that is present but holds an empty value counts as missing.
        if not result.metadata.get(key):
            failures.append(f"metadata missing key '{key}' (got {result.metadata})")

    for item in _as_list(assertions.get("contains")):
        if item not in result.clean_text:
            failures.append(f"clean_text missing expected substring: {item!r}")

    for item in _as_list(assertions.get("not_contains")):
        if item in result.clean_text:
            failures.append(f"clean_text contains forbidden substring: {item!r}")

    expected_ct = assertions.get("content_type")
    if expected_ct and result.content_type != expected_ct:
        failures.append(
            f"content_type mismatch: expected {expected_ct!r}, got {result.content_type!r}"
        )

    score = 1.0 if not failures else 0.0
    return score, failures
# ── Parametrized: detect ──────────────────────────────────────────────
_detect_cases = [case for case in _load_cases() if case["op"] == "detect"]


@pytest.mark.parametrize(
    "case",
    _detect_cases,
    ids=[case["id"] for case in _detect_cases],
)
def test_detect(case: dict) -> None:
    """Check that content-type detection returns the value the case expects.

    The outcome is reported through ``_lf_score`` (1.0 on match, 0.0
    otherwise) before the assertion, so it is reported for failing cases too.
    """
    content = _read_content(case)
    detected = detect_content_type(case["input_filename"], content)
    expected = case["expected_content_type"]

    if detected == expected:
        score = 1.0
    else:
        score = 0.0
    _lf_score(case["score_name"], score, f"got={detected}, expected={expected}")

    assert detected == expected, (
        f"[{case['id']}] {case['description']}: "
        f"expected content_type={expected!r}, got {detected!r}"
    )
# ── Parametrized: preprocess ──────────────────────────────────────────
_preprocess_cases = [c for c in _load_cases() if c["op"] == "preprocess"]


@pytest.mark.parametrize(
    "case",
    _preprocess_cases,
    ids=[c["id"] for c in _preprocess_cases],
)
def test_preprocess(case: dict) -> None:
    """Preprocess the case input and check it against the declared assertions.

    The case's optional ``assertions`` mapping is evaluated by
    ``_run_assertions``. The resulting score is reported through
    ``_lf_score`` before the final assert, so it is reported for failing
    cases too.
    """
    raw = _read_content(case)
    result = preprocess(case["input_content_type"], raw)
    assertions = case.get("assertions", {})
    score, failures = _run_assertions(assertions, result, raw)

    # On success the comment carries the output length instead of failures.
    comment = "; ".join(failures) if failures else f"len={len(result.clean_text)}"
    _lf_score(case["score_name"], score, comment)

    # Separate the description from the failure count and list one failure
    # per indented line, so the report stays readable.
    assert not failures, (
        f"[{case['id']}] {case['description']}: "
        f"{len(failures)} assertion(s) failed:\n"
        + "\n".join(f"  - {failure}" for failure in failures)
    )