YAML: rimosse op/description/score_name/assertions block — ora detect/process come chiave diretta, assertions piatte sullo stesso livello del caso. Runner: eliminato _run_assertions engine, assertions inline in test_preprocess. Riduzione da ~170 a ~75 righe totali tra YAML + test. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
94 lines
3.4 KiB
Python
94 lines
3.4 KiB
Python
"""Tests for the preprocessor system (Step 1 — Local Agent V2).
|
|
|
|
Fixtures: tests/fixtures/preprocessors/cases.yaml + data/
|
|
|
|
Run:
|
|
pytest tests/test_preprocessors.py -v
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
import yaml
|
|
|
|
from app.core.langfuse_client import get_langfuse
|
|
from app.core.preprocessors import detect_content_type, preprocess
|
|
|
|
# Fixture locations, resolved relative to this test module.
_DATA_DIR = Path(__file__).parent.joinpath("fixtures", "preprocessors", "data")
_CASES_FILE = Path(__file__).parent.joinpath("fixtures", "preprocessors", "cases.yaml")

# Synthetic inputs, looked up by the name a case supplies under "generate"
# instead of reading a fixture file from disk.
_GENERATORS = dict(
    binary_noise="some\x00\x01\x02\x03\x04\x05content" * 20,
)
|
|
|
|
|
|
def _cases():
    """Parse the YAML cases file and return the document it contains."""
    yaml_text = _CASES_FILE.read_text(encoding="utf-8")
    return yaml.safe_load(yaml_text)
|
|
|
|
|
|
def _content(case: dict) -> str:
    """Return the raw input text for *case*.

    A case carrying a "generate" key gets the matching entry of
    ``_GENERATORS``; any other case is read as UTF-8 from the file named by
    its "file" key inside ``_DATA_DIR``.
    """
    if "generate" not in case:
        fixture_path = _DATA_DIR / case["file"]
        return fixture_path.read_text(encoding="utf-8")
    return _GENERATORS[case["generate"]]
|
|
|
|
|
|
def _lf_score(name: str, value: float, comment: str = "") -> None:
    """Record a numeric score in Langfuse when a client is available.

    Creates a trace named ``eval-<name>``, attaches the score to that trace
    and flushes the client. Does nothing when ``get_langfuse()`` returns a
    falsy value.
    """
    client = get_langfuse()
    if not client:
        return

    trace = client.trace(name=f"eval-{name}")
    client.score(
        trace_id=trace.id,
        name=name,
        value=value,
        data_type="NUMERIC",
        comment=comment,
    )
    client.flush()
|
|
|
|
|
|
# ── detect ────────────────────────────────────────────────────────────
|
|
|
|
# Only cases that declare a "detect" expectation are fed to test_detect.
_detect = [entry for entry in _cases() if "detect" in entry]
|
|
|
|
|
|
@pytest.mark.parametrize("case", _detect, ids=[entry["id"] for entry in _detect])
def test_detect(case: dict) -> None:
    """Check that ``detect_content_type`` yields the case's "detect" value.

    The filename hint is the case's "filename", falling back to its "file"
    and then to an empty string. The outcome is also reported through
    ``_lf_score`` as 1.0 (match) or 0.0 (mismatch) before the assertion runs.
    """
    name_hint = case.get("filename", case.get("file", ""))
    ct = detect_content_type(name_hint, _content(case))
    expected = case["detect"]

    # Score before asserting so a failing case is still reported.
    score = 1.0 if ct == expected else 0.0
    _lf_score(f"preprocess.detect.{case['id']}", score)

    assert ct == expected, f"[{case['id']}] expected {expected!r}, got {ct!r}"
|
|
|
|
|
|
# ── preprocess ────────────────────────────────────────────────────────
|
|
|
|
# Only cases that declare a "process" key are fed to test_preprocess.
_process = [entry for entry in _cases() if "process" in entry]
|
|
|
|
|
|
def _as_list(value: str | list | None) -> list:
    """Normalize a scalar-or-sequence case field to a list (``None`` -> ``[]``)."""
    if value is None:
        return []
    if isinstance(value, str):
        return [value]
    return list(value)


@pytest.mark.parametrize("case", _process, ids=[c["id"] for c in _process])
def test_preprocess(case: dict) -> None:
    """Run ``preprocess`` on a case and apply the assertions the case declares.

    The case's "process" value is passed as the first argument to
    ``preprocess`` together with the raw content. Every check below is
    optional and only runs when its key is present in the case:

    * ``no_html``      -- truthy: ``clean_text`` must contain no ``<...>`` tag.
    * ``min_chars``    -- minimum length of ``clean_text``.
    * ``ratio_lt``     -- ``len(clean_text) / len(raw)`` must be below this.
    * ``has_meta``     -- metadata key(s) that must map to a truthy value.
    * ``contains``     -- substring(s) that must appear in ``clean_text``.
    * ``excludes``     -- substring(s) that must not appear in ``clean_text``.
    * ``content_type`` -- expected ``result.content_type``.

    ``has_meta``, ``contains`` and ``excludes`` each accept a single string,
    a list of strings, or an explicit null.
    """
    raw = _content(case)
    result = preprocess(case["process"], raw)

    if case.get("no_html"):
        assert not re.search(r"<[^>]+>", result.clean_text), "clean_text contains HTML tags"

    if "min_chars" in case:
        assert len(result.clean_text) >= case["min_chars"], \
            f"clean_text too short: {len(result.clean_text)} < {case['min_chars']}"

    if "ratio_lt" in case:
        # An empty input has no meaningful ratio; fail with a clear message
        # instead of a ZeroDivisionError.
        assert raw, "ratio_lt requires non-empty input content"
        ratio = len(result.clean_text) / len(raw)
        assert ratio < case["ratio_lt"], f"compression ratio {ratio:.2f} >= {case['ratio_lt']}"

    # A bare string is treated as one key, not iterated character by character.
    for key in _as_list(case.get("has_meta")):
        assert result.metadata.get(key), f"metadata missing {key!r} (got {result.metadata})"

    for item in _as_list(case.get("contains")):
        assert item in result.clean_text, f"clean_text missing {item!r}"

    for item in _as_list(case.get("excludes")):
        assert item not in result.clean_text, f"clean_text contains forbidden {item!r}"

    if "content_type" in case:
        assert result.content_type == case["content_type"], \
            f"expected content_type {case['content_type']!r}, got {result.content_type!r}"
|