"""Tests for the preprocessor system (Step 1 — Local Agent V2). Run: pytest tests/test_preprocessors.py -v pytest tests/test_preprocessors.py -v --preprocess-dir /path/to/folder The folder must contain cases.yaml + data/. """ from __future__ import annotations import re from pathlib import Path import pytest import yaml from app.core.langfuse_client import get_langfuse from app.core.preprocessors import detect_content_type, preprocess _DEFAULT_DIR = Path(__file__).parent / "fixtures" / "preprocessors" _GENERATORS = { "binary_noise": "some\x00\x01\x02\x03\x04\x05content" * 20, } def _fixtures_dir(config) -> Path: override = config.getoption("--preprocess-dir") return Path(override) if override else _DEFAULT_DIR def _load_cases(config) -> list[dict]: return yaml.safe_load((_fixtures_dir(config) / "cases.yaml").read_text(encoding="utf-8")) def _content(case: dict, data_dir: Path) -> str: if "generate" in case: return _GENERATORS[case["generate"]] return (data_dir / case["file"]).read_text(encoding="utf-8") def _lf_score(name: str, value: float) -> None: lf = get_langfuse() if lf: trace = lf.trace(name=f"eval-{name}") lf.score(trace_id=trace.id, name=name, value=value, data_type="NUMERIC") lf.flush() # ── parametrize at collection time via pytest hook ──────────────────── def pytest_generate_tests(metafunc): if "preprocess_case" not in metafunc.fixturenames: return cases = _load_cases(metafunc.config) test_name = metafunc.function.__name__ if test_name == "test_detect": subset = [c for c in cases if "detect" in c] else: subset = [c for c in cases if "process" in c] metafunc.parametrize("preprocess_case", subset, ids=[c["id"] for c in subset]) # ── detect ──────────────────────────────────────────────────────────── def test_detect(preprocess_case, pytestconfig) -> None: case = preprocess_case data_dir = _fixtures_dir(pytestconfig) / "data" raw = _content(case, data_dir) filename = case.get("file", "") ct = detect_content_type(filename, raw) expected = case["detect"] _lf_score(f"preprocess.detect.{case['id']}", 1.0 if ct == expected else 0.0) assert ct == expected, f"[{case['id']}] expected {expected!r}, got {ct!r}" # ── preprocess ──────────────────────────────────────────────────────── def test_preprocess(preprocess_case, pytestconfig) -> None: case = preprocess_case data_dir = _fixtures_dir(pytestconfig) / "data" raw = _content(case, data_dir) result = preprocess(case["process"], raw) if case.get("no_html"): assert not re.search(r"<[^>]+>", result.clean_text), "clean_text contains HTML tags" if "min_chars" in case: assert len(result.clean_text) >= case["min_chars"], \ f"clean_text too short: {len(result.clean_text)} < {case['min_chars']}" if "ratio_lt" in case: ratio = len(result.clean_text) / len(raw) assert ratio < case["ratio_lt"], f"compression ratio {ratio:.2f} >= {case['ratio_lt']}" for key in case.get("has_meta", []): assert result.metadata.get(key), f"metadata missing {key!r} (got {result.metadata})" for item in ([case["contains"]] if isinstance(case.get("contains"), str) else case.get("contains", [])): assert item in result.clean_text, f"clean_text missing {item!r}" for item in ([case["excludes"]] if isinstance(case.get("excludes"), str) else case.get("excludes", [])): assert item not in result.clean_text, f"clean_text contains forbidden {item!r}" if "content_type" in case: assert result.content_type == case["content_type"], \ f"expected content_type {case['content_type']!r}, got {result.content_type!r}"