"""Tests for the preprocessor system (Step 1 — Local Agent V2). Fixtures are driven by: tests/fixtures/preprocessors/cases.yaml — test case definitions tests/fixtures/preprocessors/data/ — input files (HTML, txt, ...) Run: pytest tests/test_preprocessors.py -v # Only detection tests pytest tests/test_preprocessors.py -v -k detect # Only preprocess tests pytest tests/test_preprocessors.py -v -k preprocess Langfuse scores are sent when LANGFUSE_SECRET_KEY / LANGFUSE_PUBLIC_KEY are set. """ from __future__ import annotations import re from pathlib import Path from typing import Any import pytest import yaml from app.core.langfuse_client import get_langfuse from app.core.preprocessors import detect_content_type, preprocess # ── Paths ────────────────────────────────────────────────────────────── _FIXTURES_DIR = Path(__file__).parent / "fixtures" / "preprocessors" _DATA_DIR = _FIXTURES_DIR / "data" _CASES_FILE = _FIXTURES_DIR / "cases.yaml" # ── Content generators ───────────────────────────────────────────────── _GENERATORS: dict[str, str] = { # High ratio of non-printable chars → triggers "unknown" heuristic "binary_noise": "some\x00\x01\x02\x03\x04\x05content" * 20, } def _load_cases() -> list[dict]: with _CASES_FILE.open(encoding="utf-8") as f: return yaml.safe_load(f)["cases"] def _read_content(case: dict) -> str: if "generate" in case: key = case["generate"] if key not in _GENERATORS: raise ValueError(f"Unknown generator '{key}' in case {case['id']}") return _GENERATORS[key] file_path = _DATA_DIR / case["file"] return file_path.read_text(encoding="utf-8") # ── Langfuse helper ─────────────────────────────────────────────────── def _lf_score(score_name: str, value: float, comment: str = "") -> None: lf = get_langfuse() if lf: trace = lf.trace(name=f"eval-{score_name}") lf.score( trace_id=trace.id, name=score_name, value=value, data_type="NUMERIC", comment=comment, ) lf.flush() # ── Assertion engine ────────────────────────────────────────────────── def _run_assertions(assertions: dict[str, Any], result: Any, raw: str) -> list[str]: """Run all assertions declared in the YAML case. Returns failure messages.""" failures: list[str] = [] if assertions.get("no_html_tags"): if re.search(r"<[^>]+>", result.clean_text): failures.append("clean_text still contains HTML tags") min_len = assertions.get("min_length") if min_len is not None: if len(result.clean_text) < min_len: failures.append( f"clean_text too short: {len(result.clean_text)} < {min_len}" ) ratio_lt = assertions.get("compression_ratio_lt") if ratio_lt is not None and len(raw) > 0: ratio = len(result.clean_text) / len(raw) if ratio >= ratio_lt: failures.append(f"compression ratio {ratio:.2f} >= {ratio_lt}") meta_keys = assertions.get("metadata_keys", []) for key in meta_keys: if not result.metadata.get(key): failures.append(f"metadata missing key '{key}' (got {result.metadata})") contains = assertions.get("contains") if contains: items = [contains] if isinstance(contains, str) else contains for item in items: if item not in result.clean_text: failures.append(f"clean_text missing expected substring: {item!r}") not_contains = assertions.get("not_contains") if not_contains: items = [not_contains] if isinstance(not_contains, str) else not_contains for item in items: if item in result.clean_text: failures.append(f"clean_text contains forbidden substring: {item!r}") expected_ct = assertions.get("content_type") if expected_ct and result.content_type != expected_ct: failures.append( f"content_type mismatch: expected {expected_ct!r}, got {result.content_type!r}" ) return failures # ── Parametrized: detect ────────────────────────────────────────────── _detect_cases = [c for c in _load_cases() if c["op"] == "detect"] @pytest.mark.parametrize( "case", _detect_cases, ids=[c["id"] for c in _detect_cases], ) def test_detect(case: dict) -> None: raw = _read_content(case) ct = detect_content_type(case["input_filename"], raw) expected = case["expected_content_type"] score = 1.0 if ct == expected else 0.0 _lf_score(case["score_name"], score, f"got={ct}, expected={expected}") assert ct == expected, ( f"[{case['id']}] {case['description']}: " f"expected content_type={expected!r}, got {ct!r}" ) # ── Parametrized: preprocess ────────────────────────────────────────── _preprocess_cases = [c for c in _load_cases() if c["op"] == "preprocess"] @pytest.mark.parametrize( "case", _preprocess_cases, ids=[c["id"] for c in _preprocess_cases], ) def test_preprocess(case: dict) -> None: raw = _read_content(case) result = preprocess(case["input_content_type"], raw) assertions = case.get("assertions", {}) failures = _run_assertions(assertions, result, raw) assert not failures, ( f"[{case['id']}] {case['description']} — {len(failures)} assertion(s) failed:\n" + "\n".join(f" • {f}" for f in failures) )