Files
api/tests/test_preprocessors.py
Roberto Musso 7ccdad431f feat(i18n): inject user language into AI agent system prompts
- Add _language_instruction() to deep_agent.py, reads language from core memory
- Append language directive to all 4 run_* functions (task/project/checkpoint/note)
- Minor fixes: alembic env, route imports, test cleanup
2026-04-12 00:35:23 +02:00

98 lines
3.6 KiB
Python

"""Tests for the preprocessor system (Step 1 — Local Agent V2).
Run:
pytest tests/test_preprocessors.py -v
pytest tests/test_preprocessors.py -v --preprocess-dir /path/to/folder
The folder must contain cases.yaml + data/.
"""
from __future__ import annotations
import re
from pathlib import Path
import yaml
from app.core.preprocessors import detect_content_type, preprocess
# Fixture root used when the --preprocess-dir option is not given.
_DEFAULT_DIR = Path(__file__).parent.joinpath("fixtures", "preprocessors")

# Synthetic inputs, looked up by the name a case puts in its "generate" field.
_GENERATORS = dict(
    binary_noise="some\x00\x01\x02\x03\x04\x05content" * 20,
)
def _fixtures_dir(config) -> Path:
    """Return the directory that holds ``cases.yaml`` and ``data/``.

    The value of the ``--preprocess-dir`` option on ``config`` is used when
    it is truthy; otherwise the module-level default directory is returned.
    """
    custom_dir = config.getoption("--preprocess-dir")
    if custom_dir:
        return Path(custom_dir)
    return _DEFAULT_DIR
def _load_cases(config) -> list[dict]:
    """Parse ``cases.yaml`` from the active fixtures directory.

    Returns whatever ``yaml.safe_load`` produces for that file; callers in
    this module treat the result as a list of case mappings.
    """
    cases_path = _fixtures_dir(config) / "cases.yaml"
    yaml_text = cases_path.read_text(encoding="utf-8")
    return yaml.safe_load(yaml_text)
def _content(case: dict, data_dir: Path) -> str:
    """Produce the raw input text for one case.

    A case with a ``generate`` key gets the matching synthetic string from
    ``_GENERATORS``; any other case is read as UTF-8 text from ``data_dir``
    using its ``file`` entry.
    """
    if "generate" not in case:
        source_path = data_dir / case["file"]
        return source_path.read_text(encoding="utf-8")
    generator_name = case["generate"]
    return _GENERATORS[generator_name]
# ── parametrize at collection time via pytest hook ────────────────────
def pytest_generate_tests(metafunc):
    """Parametrize ``preprocess_case`` from ``cases.yaml`` at collection time.

    Functions that do not request the ``preprocess_case`` fixture are left
    untouched. ``test_detect`` receives the cases that define a ``detect``
    key; every other requesting function receives the cases that define a
    ``process`` key. Each case's ``id`` becomes the pytest test id.
    """
    if "preprocess_case" in metafunc.fixturenames:
        all_cases = _load_cases(metafunc.config)
        # The requesting test's name decides which case field is mandatory.
        if metafunc.function.__name__ == "test_detect":
            wanted_key = "detect"
        else:
            wanted_key = "process"

        selected = []
        for entry in all_cases:
            if wanted_key in entry:
                selected.append(entry)
        case_ids = [entry["id"] for entry in selected]
        metafunc.parametrize("preprocess_case", selected, ids=case_ids)
# ── detect ────────────────────────────────────────────────────────────
def test_detect(preprocess_case, pytestconfig) -> None:
    """Check that ``detect_content_type`` returns the case's ``detect`` value."""
    fixtures_root = _fixtures_dir(pytestconfig)
    text = _content(preprocess_case, fixtures_root / "data")
    # A case without a "file" entry is detected with an empty filename.
    source_name = preprocess_case.get("file", "")
    detected = detect_content_type(source_name, text)
    wanted = preprocess_case["detect"]
    assert detected == wanted, f"[{preprocess_case['id']}] expected {wanted!r}, got {detected!r}"
# ── preprocess ────────────────────────────────────────────────────────
def _as_list(value) -> list:
    """Normalize an optional scalar-or-list case field to a list.

    ``None`` (missing key or explicit YAML null) becomes ``[]``, a bare
    string becomes a one-element list, and any other iterable is copied.
    """
    if value is None:
        return []
    if isinstance(value, str):
        # A string must not be iterated character by character.
        return [value]
    return list(value)


def test_preprocess(preprocess_case, pytestconfig) -> None:
    """Run ``preprocess`` on one case and check every expectation it declares.

    Optional case keys and the check each one triggers:

    * ``no_html``      -- ``clean_text`` contains no ``<...>`` tag.
    * ``min_chars``    -- ``clean_text`` has at least this many characters.
    * ``ratio_lt``     -- ``len(clean_text) / len(raw)`` is below this value.
    * ``has_meta``     -- each listed key has a truthy value in ``metadata``.
    * ``contains``     -- each listed string occurs in ``clean_text``.
    * ``excludes``     -- each listed string is absent from ``clean_text``.
    * ``content_type`` -- ``result.content_type`` equals this value.

    ``has_meta``, ``contains`` and ``excludes`` each accept either a single
    string or a list of strings.
    """
    case = preprocess_case
    data_dir = _fixtures_dir(pytestconfig) / "data"
    raw = _content(case, data_dir)
    result = preprocess(case["process"], raw)
    clean_text = result.clean_text

    if case.get("no_html"):
        assert not re.search(r"<[^>]+>", clean_text), "clean_text contains HTML tags"

    if "min_chars" in case:
        assert len(clean_text) >= case["min_chars"], (
            f"clean_text too short: {len(clean_text)} < {case['min_chars']}"
        )

    if "ratio_lt" in case:
        # Fail with a readable message instead of ZeroDivisionError on empty input.
        assert raw, "ratio_lt cannot be checked against empty input"
        ratio = len(clean_text) / len(raw)
        assert ratio < case["ratio_lt"], f"compression ratio {ratio:.2f} >= {case['ratio_lt']}"

    for key in _as_list(case.get("has_meta")):
        assert result.metadata.get(key), f"metadata missing {key!r} (got {result.metadata})"

    for item in _as_list(case.get("contains")):
        assert item in clean_text, f"clean_text missing {item!r}"

    for item in _as_list(case.get("excludes")):
        assert item not in clean_text, f"clean_text contains forbidden {item!r}"

    if "content_type" in case:
        assert result.content_type == case["content_type"], (
            f"expected content_type {case['content_type']!r}, got {result.content_type!r}"
        )