diff --git a/requirements.txt b/requirements.txt
index 6a7b5a6..ff06d05 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -35,4 +35,5 @@ cryptography>=42.0.0
langfuse>=2.0.0
beautifulsoup4>=4.12.0
lxml>=5.0.0
+PyYAML>=6.0.0
ruff>=0.8.0
diff --git a/tests/fixtures/preprocessors/cases.yaml b/tests/fixtures/preprocessors/cases.yaml
new file mode 100644
index 0000000..f40e84b
--- /dev/null
+++ b/tests/fixtures/preprocessors/cases.yaml
@@ -0,0 +1,127 @@
+# Preprocessor test cases — Step 1 (Local Agent V2)
+#
+# Schema per case:
+#   id: "1.N"
+#   description: str
+#   score_name: str                  # name of the score sent to Langfuse
+#
+# Content source (one of the two):
+#   file: str                        # file name under data/, read as UTF-8 text
+#   generate: binary_noise           # content generated by the test runner (for binary tests)
+#
+# For op=detect:
+#   op: detect
+#   input_filename: str              # filename passed to detect_content_type
+#   expected_content_type: str
+#
+# For op=preprocess:
+#   op: preprocess
+#   input_content_type: str          # content_type passed to preprocess()
+#   assertions:
+#     no_html_tags: bool
+#     min_length: int
+#     compression_ratio_lt: float    # len(clean) / len(raw) < threshold
+#     metadata_keys: [str, ...]      # keys that must have a non-empty value in metadata
+#     contains: str | [str, ...]     # substring(s) that must appear in clean_text
+#     not_contains: str | [str, ...] # substring(s) that must not appear in clean_text
+#     content_type: str              # expected value of result.content_type
+
+cases:
+
+ # ── Detection tests ────────────────────────────────────────────────
+
+ - id: "1.1"
+ description: "Detect email HTML"
+ score_name: preprocess.detect_email
+ file: email_action.html
+ op: detect
+ input_filename: email_export.html
+ expected_content_type: email_html
+
+ - id: "1.2"
+ description: "Detect generic HTML"
+ score_name: preprocess.detect_generic
+ file: generic_page.html
+ op: detect
+ input_filename: index.html
+ expected_content_type: generic_html
+
+ - id: "1.3"
+ description: "Detect plain text"
+ score_name: preprocess.detect_text
+ file: notes.txt
+ op: detect
+ input_filename: notes.txt
+ expected_content_type: plain_text
+
+ - id: "1.4"
+ description: "Detect unknown (binary-like content)"
+ score_name: preprocess.detect_unknown
+ generate: binary_noise
+ op: detect
+ input_filename: archive.xyz
+ expected_content_type: unknown
+
+ # ── Preprocess tests ───────────────────────────────────────────────
+
+ - id: "1.5"
+ description: "Email: strip HTML tags"
+ score_name: preprocess.email_strip
+ file: email_action.html
+ op: preprocess
+ input_content_type: email_html
+ assertions:
+ no_html_tags: true
+ min_length: 50
+ compression_ratio_lt: 0.8
+
+ - id: "1.6"
+ description: "Email: extract metadata (Subject + From)"
+ score_name: preprocess.email_metadata
+ file: email_action.html
+ op: preprocess
+ input_content_type: email_html
+ assertions:
+ metadata_keys: [subject, from]
+
+ - id: "1.7"
+ description: "Email: split thread — solo ultimo messaggio"
+ score_name: preprocess.email_thread
+ file: email_thread.html
+ op: preprocess
+ input_content_type: email_html
+ assertions:
+ contains: "Sure, I'll handle the deploy"
+ not_contains: "Let's plan the deploy"
+
+ - id: "1.8"
+ description: "Email: singolo messaggio senza thread"
+ score_name: preprocess.email_single
+ file: email_single.html
+ op: preprocess
+ input_content_type: email_html
+ assertions:
+ contains: "deploy is done"
+
+ - id: "1.9"
+ description: "Email: HTML pesante con table layout"
+ score_name: preprocess.email_heavy_html
+ file: email_heavy.html
+ op: preprocess
+ input_content_type: email_html
+ assertions:
+ no_html_tags: true
+ min_length: 30
+ not_contains:
+ - "border-collapse"
+ - "font-size"
+
+ - id: "1.10"
+ description: "Fallback: file sconosciuto → testo restituito"
+ score_name: preprocess.fallback
+ file: fallback.txt
+ op: preprocess
+ input_content_type: unknown
+ assertions:
+ min_length: 1
+ content_type: unknown
diff --git a/tests/fixtures/preprocessors/data/email_action.html b/tests/fixtures/preprocessors/data/email_action.html
new file mode 100644
index 0000000..6981b1b
--- /dev/null
+++ b/tests/fixtures/preprocessors/data/email_action.html
@@ -0,0 +1,25 @@
+
+
+
+ Fix the login bug
+
+
+
+
+
+
Hi,
+
Please fix the login bug by Friday. It is blocking the release.
+
Priority: high. Let me know if you need anything.
+
Thanks, Boss
+
+
+
diff --git a/tests/fixtures/preprocessors/data/email_heavy.html b/tests/fixtures/preprocessors/data/email_heavy.html
new file mode 100644
index 0000000..1c9efc9
--- /dev/null
+++ b/tests/fixtures/preprocessors/data/email_heavy.html
@@ -0,0 +1,49 @@
+
+
+
+
+
+
+
+
+
+
+ From:
+ newsletter@corp.com
+
+
+ Subject:
+ Q1 Results Update
+
+
+ Date:
+ Apr 7, 2026
+
+
+
+
+
+
+ Dear Team,
+ Q1 results are in. Revenue up 15% year-over-year.
+ Please review the attached report and share any feedback by EOW.
+
+
+
+
+
+
+
+
+
+
diff --git a/tests/fixtures/preprocessors/data/email_single.html b/tests/fixtures/preprocessors/data/email_single.html
new file mode 100644
index 0000000..bc4358d
--- /dev/null
+++ b/tests/fixtures/preprocessors/data/email_single.html
@@ -0,0 +1,8 @@
+
+
+ From: alice@co.com
+ To: team@co.com
+ Subject: Quick update
+ Date: Tue, 7 Apr 2026 10:30:00 +0200
+ The deploy is done. Everything looks good. No issues so far.
+
diff --git a/tests/fixtures/preprocessors/data/email_thread.html b/tests/fixtures/preprocessors/data/email_thread.html
new file mode 100644
index 0000000..0ba94a1
--- /dev/null
+++ b/tests/fixtures/preprocessors/data/email_thread.html
@@ -0,0 +1,24 @@
+
+
+
+
From: alice@co.com
+
Subject: Re: Re: Deploy plan
+
Sure, I'll handle the deploy.
+
+
+ On Mon, Apr 6, 2026 at 3:00 PM, Bob <bob@co.com> wrote:
+
+ From: bob@co.com
+ Can you handle the deploy?
+ On Sun, Apr 5, 2026 at 1:00 PM, Alice <alice@co.com> wrote:
+
+ From: alice@co.com
+ Let's plan the deploy for Monday.
+ On Sat, Apr 4, 2026 at 11:00 AM, Charlie <charlie@co.com> wrote:
+
+ From: charlie@co.com
+ We need to schedule the deploy. What day works?
+
+
+
+
diff --git a/tests/fixtures/preprocessors/data/fallback.txt b/tests/fixtures/preprocessors/data/fallback.txt
new file mode 100644
index 0000000..ce461b9
--- /dev/null
+++ b/tests/fixtures/preprocessors/data/fallback.txt
@@ -0,0 +1,3 @@
+random text content without any structure
+line two with some words
+line three and more content here
diff --git a/tests/fixtures/preprocessors/data/generic_page.html b/tests/fixtures/preprocessors/data/generic_page.html
new file mode 100644
index 0000000..edfe8a3
--- /dev/null
+++ b/tests/fixtures/preprocessors/data/generic_page.html
@@ -0,0 +1,35 @@
+
+
+
+
+ My Web App
+
+
+
+
+ Home
+ About
+ Contact
+
+
+
+
+ This is a generic web page with no email headers.
+ It has navigation, main content, and a footer.
+
+
+ Features
+
+ Fast
+ Reliable
+ Secure
+
+
+
+
+
+
diff --git a/tests/fixtures/preprocessors/data/notes.txt b/tests/fixtures/preprocessors/data/notes.txt
new file mode 100644
index 0000000..4a66216
--- /dev/null
+++ b/tests/fixtures/preprocessors/data/notes.txt
@@ -0,0 +1,15 @@
+Meeting notes - April 7, 2026
+
+Attendees: Alice, Bob, Charlie
+
+Discussion points:
+- Deploy scheduled for Friday
+- Bug fix for login must be completed by Thursday
+- Review Q1 numbers before EOW
+
+Action items:
+- Alice: fix login bug
+- Bob: prepare deploy checklist
+- Charlie: send Q1 report
+
+Next meeting: April 14, 2026
diff --git a/tests/test_preprocessors.py b/tests/test_preprocessors.py
index 83b68cd..00dcff8 100644
--- a/tests/test_preprocessors.py
+++ b/tests/test_preprocessors.py
@@ -1,221 +1,178 @@
-"""Tests for the preprocessor system (Step 1).
+"""Tests for the preprocessor system (Step 1 — Local Agent V2).
-Test IDs map to the plan:
- 1.1 detect_email, 1.2 detect_generic, 1.3 detect_text, 1.4 detect_unknown
- 1.5 email_strip, 1.6 email_metadata, 1.7 email_thread, 1.8 email_single
- 1.9 email_heavy_html, 1.10 fallback
+Fixtures are driven by:
+ tests/fixtures/preprocessors/cases.yaml — test case definitions
+ tests/fixtures/preprocessors/data/ — input files (HTML, txt, ...)
Run:
pytest tests/test_preprocessors.py -v
+ # Only detection tests
+ pytest tests/test_preprocessors.py -v -k detect
+
+ # Only preprocess tests
+ pytest tests/test_preprocessors.py -v -k preprocess
+
Langfuse scores are sent when LANGFUSE_SECRET_KEY / LANGFUSE_PUBLIC_KEY are set.
"""
from __future__ import annotations
+import re
+from pathlib import Path
+from typing import Any
+
import pytest
+import yaml
-from app.core.preprocessors import detect_content_type, preprocess
from app.core.langfuse_client import get_langfuse
+from app.core.preprocessors import detect_content_type, preprocess
-# ── Fixtures ──────────────────────────────────────────────────────────
+# ── Paths ──────────────────────────────────────────────────────────────
+
+# Root of the preprocessor fixtures, resolved relative to this test module.
+_FIXTURES_DIR = Path(__file__).parent / "fixtures" / "preprocessors"
+# Directory holding the input files referenced by each case's ``file`` key.
+_DATA_DIR = _FIXTURES_DIR / "data"
+# YAML file holding the test case definitions.
+_CASES_FILE = _FIXTURES_DIR / "cases.yaml"
+
+# ── Content generators ─────────────────────────────────────────────────
+
+# Maps a case's ``generate`` key to content built in-process instead of being
+# read from the data directory.
+_GENERATORS: dict[str, str] = {
+ # High ratio of non-printable chars → triggers "unknown" heuristic
+ "binary_noise": "some\x00\x01\x02\x03\x04\x05content" * 20,
+}
-@pytest.fixture
-def sample_email_html() -> str:
- return """
-
-
- Fix the login bug
-
-
-
- Subject: Fix the login bug
- From: boss@company.com
- To: dev@company.com
- Date: Mon, 7 Apr 2026 09:00:00 +0200
- Please fix the login bug by Friday. It is blocking the release.
-
-"""
+def _load_cases() -> list[dict]:
+    """Parse the YAML case file and return the list stored under its ``cases`` key."""
+    with _CASES_FILE.open(encoding="utf-8") as stream:
+        document = yaml.safe_load(stream)
+    return document["cases"]
-@pytest.fixture
-def sample_thread_email_html() -> str:
- return """
-
-From: alice@co.com
-Subject: Re: Re: Deploy plan
-Sure, I'll handle the deploy.
-
-On Mon, Apr 6, 2026 at 3:00 PM, Bob <bob@co.com> wrote:
-
-From: bob@co.com
-Can you handle the deploy?
-On Sun, Apr 5, 2026 at 1:00 PM, Alice <alice@co.com> wrote:
-
-From: alice@co.com
-Let's plan the deploy for Monday.
-
-
-"""
+def _read_content(case: dict) -> str:
+    """Return the raw input text for a test case.
+
+    When the case has a ``generate`` key the content is taken from
+    ``_GENERATORS``; otherwise the file named by ``file`` is read from
+    ``_DATA_DIR`` as UTF-8 text. ``generate`` wins if both keys are present.
+
+    Raises:
+        ValueError: if the generator name is unknown, or if the case declares
+            neither ``generate`` nor ``file``.
+    """
+    # Fall back to a placeholder so a malformed case still yields a readable error.
+    case_id = case.get("id", "<no id>")
+    if "generate" in case:
+        key = case["generate"]
+        if key not in _GENERATORS:
+            raise ValueError(f"Unknown generator '{key}' in case {case_id}")
+        return _GENERATORS[key]
+    if "file" not in case:
+        # Without this guard the lookup below fails with a bare KeyError('file')
+        # that does not say which case is broken.
+        raise ValueError(f"Case {case_id} declares neither 'file' nor 'generate'")
+    return (_DATA_DIR / case["file"]).read_text(encoding="utf-8")
-@pytest.fixture
-def sample_heavy_html_email() -> str:
- return """
-
-
-
-
-
- From: newsletter@corp.com
- Subject: Q1 Results Update
- Date: Apr 7, 2026
-
- Dear Team,
- Q1 results are in. Revenue up 15% year-over-year.
- Please review the attached report.
-
-
-
-"""
+# ── Langfuse helper ───────────────────────────────────────────────────
-
-# ── Helper ────────────────────────────────────────────────────────────
-
-def _score(name: str, value: float, comment: str = "") -> None:
+def _lf_score(score_name: str, value: float, comment: str = "") -> None:
lf = get_langfuse()
if lf:
- trace = lf.trace(name=f"eval-{name}")
- lf.score(trace_id=trace.id, name=name, value=value,
- data_type="NUMERIC", comment=comment)
+ trace = lf.trace(name=f"eval-{score_name}")
+ lf.score(
+ trace_id=trace.id,
+ name=score_name,
+ value=value,
+ data_type="NUMERIC",
+ comment=comment,
+ )
lf.flush()
-# ── 1.1 — Detect email HTML ───────────────────────────────────────────
+# ── Assertion engine ──────────────────────────────────────────────────
-def test_detect_email_html(sample_email_html):
- ct = detect_content_type("email_export.html", sample_email_html)
- score = 1.0 if ct == "email_html" else 0.0
- _score("preprocess.detect_email", score)
- assert ct == "email_html", f"Expected 'email_html', got '{ct}'"
+# Assertion keys understood by _run_assertions. Any other key in a YAML case is
+# reported as a failure so that a typo cannot silently disable a check.
+_SUPPORTED_ASSERTIONS = frozenset(
+    {
+        "no_html_tags",
+        "min_length",
+        "compression_ratio_lt",
+        "metadata_keys",
+        "contains",
+        "not_contains",
+        "content_type",
+    }
+)
+
+
+def _as_list(value: Any) -> list[Any]:
+    """Return *value* as a list: a bare string becomes one item, a falsy value no items."""
+    if not value:
+        return []
+    return [value] if isinstance(value, str) else list(value)
+
+
+def _run_assertions(assertions: dict[str, Any], result: Any, raw: str) -> tuple[float, list[str]]:
+    """Evaluate the assertions declared in a YAML case against a preprocess result.
+
+    Args:
+        assertions: Mapping of assertion name to expected value; the supported
+            names are listed in ``_SUPPORTED_ASSERTIONS``.
+        result: Object exposing ``clean_text``, ``metadata`` and ``content_type``.
+        raw: The unprocessed input text, used for the compression ratio.
+
+    Returns:
+        ``(score, failures)`` where ``failures`` holds one message per failed
+        check and ``score`` is 1.0 when it is empty, 0.0 otherwise.
+    """
+    failures: list[str] = []
+
+    # A misspelled key would otherwise be skipped and the case would pass vacuously.
+    unknown = sorted(str(key) for key in assertions if key not in _SUPPORTED_ASSERTIONS)
+    if unknown:
+        failures.append(f"unsupported assertion key(s): {', '.join(unknown)}")
+
+    # NOTE: the pattern flags any <...> run, so angle-bracketed plain text
+    # (e.g. "<bob@co.com>") counts as a tag too.
+    if assertions.get("no_html_tags") and re.search(r"<[^>]+>", result.clean_text):
+        failures.append("clean_text still contains HTML tags")
+
+    min_len = assertions.get("min_length")
+    if min_len is not None and len(result.clean_text) < min_len:
+        failures.append(
+            f"clean_text too short: {len(result.clean_text)} < {min_len}"
+        )
+
+    ratio_lt = assertions.get("compression_ratio_lt")
+    if ratio_lt is not None:
+        if not raw:
+            # The ratio is undefined for empty input; report it instead of
+            # letting the check pass without being evaluated.
+            failures.append("compression ratio undefined: raw input is empty")
+        else:
+            ratio = len(result.clean_text) / len(raw)
+            if ratio >= ratio_lt:
+                failures.append(f"compression ratio {ratio:.2f} >= {ratio_lt}")
+
+    # Accept a single key as well as a list, like contains / not_contains.
+    for key in _as_list(assertions.get("metadata_keys")):
+        if not result.metadata.get(key):
+            failures.append(f"metadata missing key '{key}' (got {result.metadata})")
+
+    for item in _as_list(assertions.get("contains")):
+        if item not in result.clean_text:
+            failures.append(f"clean_text missing expected substring: {item!r}")
+
+    for item in _as_list(assertions.get("not_contains")):
+        if item in result.clean_text:
+            failures.append(f"clean_text contains forbidden substring: {item!r}")
+
+    expected_ct = assertions.get("content_type")
+    if expected_ct and result.content_type != expected_ct:
+        failures.append(
+            f"content_type mismatch: expected {expected_ct!r}, got {result.content_type!r}"
+        )
+
+    score = 1.0 if not failures else 0.0
+    return score, failures
-# ── 1.2 — Detect generic HTML ─────────────────────────────────────────
+# ── Parametrized: detect ──────────────────────────────────────────────
-def test_detect_generic_html():
- generic = """My App
-Home Welcome
"""
- ct = detect_content_type("index.html", generic)
- score = 1.0 if ct == "generic_html" else 0.0
- _score("preprocess.detect_generic", score)
- assert ct == "generic_html", f"Expected 'generic_html', got '{ct}'"
+_detect_cases = [c for c in _load_cases() if c["op"] == "detect"]
-# ── 1.3 — Detect plain text ───────────────────────────────────────────
+@pytest.mark.parametrize(
+ "case",
+ _detect_cases,
+ ids=[c["id"] for c in _detect_cases],
+)
+def test_detect(case: dict) -> None:
+    """Check that detect_content_type returns the case's expected_content_type.
+
+    The 1.0/0.0 outcome is reported through _lf_score before the assertion, so
+    a failing case is still scored.
+    """
+ raw = _read_content(case)
+ ct = detect_content_type(case["input_filename"], raw)
-def test_detect_plain_text():
- ct = detect_content_type("notes.txt", "Just some notes here.\nNo HTML at all.")
- score = 1.0 if ct == "plain_text" else 0.0
- _score("preprocess.detect_text", score)
- assert ct == "plain_text", f"Expected 'plain_text', got '{ct}'"
+ expected = case["expected_content_type"]
+ score = 1.0 if ct == expected else 0.0
+ _lf_score(case["score_name"], score, f"got={ct}, expected={expected}")
+
+ assert ct == expected, (
+ f"[{case['id']}] {case['description']}: "
+ f"expected content_type={expected!r}, got {ct!r}"
+ )
-# ── 1.4 — Detect unknown ──────────────────────────────────────────────
+# ── Parametrized: preprocess ──────────────────────────────────────────
-def test_detect_unknown():
- # Simulate binary-like content with non-printable chars
- binary_like = "some\x00\x01\x02\x03\x04\x05content" * 20
- ct = detect_content_type("archive.xyz", binary_like)
- score = 1.0 if ct == "unknown" else 0.0
- _score("preprocess.detect_unknown", score)
- assert ct == "unknown", f"Expected 'unknown', got '{ct}'"
+_preprocess_cases = [c for c in _load_cases() if c["op"] == "preprocess"]
-# ── 1.5 — Email: strip HTML tags ─────────────────────────────────────
+@pytest.mark.parametrize(
+ "case",
+ _preprocess_cases,
+ ids=[c["id"] for c in _preprocess_cases],
+)
+def test_preprocess(case: dict) -> None:
+    """Run preprocess() on the case input and evaluate the case's assertions.
+
+    The assertions are checked by _run_assertions and the resulting score is
+    reported through _lf_score, with the failure messages (or the clean_text
+    length on success) as the comment.
+    """
+ raw = _read_content(case)
+ result = preprocess(case["input_content_type"], raw)
-def test_email_strip_html(sample_email_html):
- result = preprocess("email_html", sample_email_html)
- has_no_tags = "<" not in result.clean_text
- has_content = len(result.clean_text) > 50
- ratio = len(result.clean_text) / len(sample_email_html)
- score = 1.0 if (has_no_tags and has_content and ratio < 0.8) else 0.0
- _score("preprocess.email_strip", score, f"ratio={ratio:.2f}, len={len(result.clean_text)}")
- assert has_no_tags, "clean_text still contains HTML tags"
- assert has_content, "clean_text is too short"
+ assertions = case.get("assertions", {})
+ score, failures = _run_assertions(assertions, result, raw)
+ comment = "; ".join(failures) if failures else f"len={len(result.clean_text)}"
+ _lf_score(case["score_name"], score, comment)
-# ── 1.6 — Email: extract metadata ────────────────────────────────────
-
-def test_email_extract_metadata(sample_email_html):
- result = preprocess("email_html", sample_email_html)
- has_subject = bool(result.metadata.get("subject"))
- has_from = bool(result.metadata.get("from"))
- score = 1.0 if (has_subject and has_from) else 0.5 if (has_subject or has_from) else 0.0
- _score("preprocess.email_metadata", score,
- f"subject={result.metadata.get('subject')}, from={result.metadata.get('from')}")
- assert has_subject, f"metadata missing 'subject'. Got: {result.metadata}"
- assert has_from, f"metadata missing 'from'. Got: {result.metadata}"
-
-
-# ── 1.7 — Email: split thread ─────────────────────────────────────────
-
-def test_email_split_thread(sample_thread_email_html):
- result = preprocess("email_html", sample_thread_email_html)
- # The latest message is "Sure, I'll handle the deploy."
- # Quoted content from Bob/Alice should not appear in clean_text
- has_latest = "Sure, I'll handle the deploy" in result.clean_text
- lacks_quoted = "Let's plan the deploy" not in result.clean_text
- score = 1.0 if (has_latest and lacks_quoted) else 0.5 if has_latest else 0.0
- _score("preprocess.email_thread", score,
- f"has_latest={has_latest}, lacks_quoted={lacks_quoted}")
- assert has_latest, "Latest message not found in clean_text"
- assert lacks_quoted, "Quoted older message leaked into clean_text"
-
-
-# ── 1.8 — Email: single message (no thread) ──────────────────────────
-
-def test_email_single_message():
- single = """
-From: alice@co.com
-Subject: Quick update
-The deploy is done. Everything looks good.
-"""
- result = preprocess("email_html", single)
- has_body = "deploy is done" in result.clean_text
- score = 1.0 if has_body else 0.0
- _score("preprocess.email_single", score)
- assert has_body, "Body of single message not found in clean_text"
-
-
-# ── 1.9 — Email: heavy HTML (table layout) ───────────────────────────
-
-def test_email_heavy_html(sample_heavy_html_email):
- result = preprocess("email_html", sample_heavy_html_email)
- has_no_tags = "<" not in result.clean_text
- has_content = len(result.clean_text) > 30
- # CSS properties should not appear in clean text
- no_css = "border-collapse" not in result.clean_text and "font-size" not in result.clean_text
- score = 1.0 if (has_no_tags and has_content and no_css) else 0.0
- _score("preprocess.email_heavy_html", score,
- f"no_tags={has_no_tags}, has_content={has_content}, no_css={no_css}")
- assert has_no_tags, "HTML tags found in clean_text"
- assert has_content, "clean_text is empty"
- assert no_css, "CSS properties leaked into clean_text"
-
-
-# ── 1.10 — Fallback: unknown file type ───────────────────────────────
-
-def test_fallback_unknown_content():
- raw = "random text content without any structure\nline two\nline three"
- result = preprocess("unknown", raw)
- has_text = len(result.clean_text) > 0
- score = 1.0 if has_text else 0.0
- _score("preprocess.fallback", score)
- assert has_text, "fallback handler returned empty clean_text"
- assert result.content_type == "unknown"
+ assert not failures, (
+ f"[{case['id']}] {case['description']} — {len(failures)} assertion(s) failed:\n"
+ + "\n".join(f" • {f}" for f in failures)
+ )