diff --git a/requirements.txt b/requirements.txt index 6a7b5a6..ff06d05 100644 --- a/requirements.txt +++ b/requirements.txt @@ -35,4 +35,5 @@ cryptography>=42.0.0 langfuse>=2.0.0 beautifulsoup4>=4.12.0 lxml>=5.0.0 +PyYAML>=6.0.0 ruff>=0.8.0 diff --git a/tests/fixtures/preprocessors/cases.yaml b/tests/fixtures/preprocessors/cases.yaml new file mode 100644 index 0000000..f40e84b --- /dev/null +++ b/tests/fixtures/preprocessors/cases.yaml @@ -0,0 +1,127 @@ +# Preprocessor test cases — Step 1 (Local Agent V2) +# +# Schema per caso: +# id: "1.N" +# description: str +# score_name: str # nome score inviato a Langfuse +# +# Sorgente contenuto (una delle due): +# file: # letto come testo UTF-8 +# generate: binary_noise # contenuto generato dal runner (per test binari) +# +# Per op=detect: +# op: detect +# input_filename: str # filename passato a detect_content_type +# expected_content_type: str +# +# Per op=preprocess: +# op: preprocess +# input_content_type: str # content_type passato a preprocess() +# assertions: +# no_html_tags: bool +# min_length: int +# compression_ratio_lt: float # len(clean) / len(raw) < soglia +# metadata_keys: [str, ...] # chiavi che devono essere in metadata +# contains: str | [str, ...] # substring(s) presenti in clean_text +# not_contains: str | [str, ...] 
# substring(s) assenti da clean_text +# content_type: str # valore atteso di result.content_type + +cases: + + # ── Detection tests ──────────────────────────────────────────────── + + - id: "1.1" + description: "Detect email HTML" + score_name: preprocess.detect_email + file: email_action.html + op: detect + input_filename: email_export.html + expected_content_type: email_html + + - id: "1.2" + description: "Detect generic HTML" + score_name: preprocess.detect_generic + file: generic_page.html + op: detect + input_filename: index.html + expected_content_type: generic_html + + - id: "1.3" + description: "Detect plain text" + score_name: preprocess.detect_text + file: notes.txt + op: detect + input_filename: notes.txt + expected_content_type: plain_text + + - id: "1.4" + description: "Detect unknown (binary-like content)" + score_name: preprocess.detect_unknown + generate: binary_noise + op: detect + input_filename: archive.xyz + expected_content_type: unknown + + # ── Preprocess tests ─────────────────────────────────────────────── + + - id: "1.5" + description: "Email: strip HTML tags" + score_name: preprocess.email_strip + file: email_action.html + op: preprocess + input_content_type: email_html + assertions: + no_html_tags: true + min_length: 50 + compression_ratio_lt: 0.8 + + - id: "1.6" + description: "Email: extract metadata (Subject + From)" + score_name: preprocess.email_metadata + file: email_action.html + op: preprocess + input_content_type: email_html + assertions: + metadata_keys: [subject, from] + + - id: "1.7" + description: "Email: split thread — solo ultimo messaggio" + score_name: preprocess.email_thread + file: email_thread.html + op: preprocess + input_content_type: email_html + assertions: + contains: "Sure, I'll handle the deploy" + not_contains: "Let's plan the deploy" + + - id: "1.8" + description: "Email: singolo messaggio senza thread" + score_name: preprocess.email_single + file: email_single.html + op: preprocess + input_content_type: 
email_html + assertions: + contains: "deploy is done" + + - id: "1.9" + description: "Email: HTML pesante con table layout" + score_name: preprocess.email_heavy_html + file: email_heavy.html + op: preprocess + input_content_type: email_html + assertions: + no_html_tags: true + min_length: 30 + not_contains: + - "border-collapse" + - "font-size" + + - id: "1.10" + description: "Fallback: file sconosciuto → testo restituito" + score_name: preprocess.fallback + file: fallback.txt + op: preprocess + input_content_type: unknown + assertions: + min_length: 1 + content_type: unknown diff --git a/tests/fixtures/preprocessors/data/email_action.html b/tests/fixtures/preprocessors/data/email_action.html new file mode 100644 index 0000000..6981b1b --- /dev/null +++ b/tests/fixtures/preprocessors/data/email_action.html @@ -0,0 +1,25 @@ + + + + Fix the login bug + + + +
+

From: boss@company.com

+

To: dev@company.com

+

Subject: Fix the login bug

+

Date: Mon, 7 Apr 2026 09:00:00 +0200

+
+
+

Hi,

+

Please fix the login bug by Friday. It is blocking the release.

+

Priority: high. Let me know if you need anything.

+

Thanks,
Boss

+
+ + diff --git a/tests/fixtures/preprocessors/data/email_heavy.html b/tests/fixtures/preprocessors/data/email_heavy.html new file mode 100644 index 0000000..1c9efc9 --- /dev/null +++ b/tests/fixtures/preprocessors/data/email_heavy.html @@ -0,0 +1,49 @@ + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + +
Company Internal Update
From:newsletter@corp.com
Subject:Q1 Results Update
Date:Apr 7, 2026
+ + + + +
+

Dear Team,

+

Q1 results are in. Revenue up 15% year-over-year.

+

Please review the attached report and share any feedback by EOW.

+
+
+
+ + diff --git a/tests/fixtures/preprocessors/data/email_single.html b/tests/fixtures/preprocessors/data/email_single.html new file mode 100644 index 0000000..bc4358d --- /dev/null +++ b/tests/fixtures/preprocessors/data/email_single.html @@ -0,0 +1,8 @@ + + +

From: alice@co.com

+

To: team@co.com

+

Subject: Quick update

+

Date: Tue, 7 Apr 2026 10:30:00 +0200

+

The deploy is done. Everything looks good. No issues so far.

+ diff --git a/tests/fixtures/preprocessors/data/email_thread.html b/tests/fixtures/preprocessors/data/email_thread.html new file mode 100644 index 0000000..0ba94a1 --- /dev/null +++ b/tests/fixtures/preprocessors/data/email_thread.html @@ -0,0 +1,24 @@ + + +
+

From: alice@co.com

+

Subject: Re: Re: Deploy plan

+

Sure, I'll handle the deploy.

+
+ +

On Mon, Apr 6, 2026 at 3:00 PM, Bob <bob@co.com> wrote:

+
+

From: bob@co.com

+

Can you handle the deploy?

+

On Sun, Apr 5, 2026 at 1:00 PM, Alice <alice@co.com> wrote:

+
+

From: alice@co.com

+

Let's plan the deploy for Monday.

+

On Sat, Apr 4, 2026 at 11:00 AM, Charlie <charlie@co.com> wrote:

+
+

From: charlie@co.com

+

We need to schedule the deploy. What day works?

+
+
+
+ diff --git a/tests/fixtures/preprocessors/data/fallback.txt b/tests/fixtures/preprocessors/data/fallback.txt new file mode 100644 index 0000000..ce461b9 --- /dev/null +++ b/tests/fixtures/preprocessors/data/fallback.txt @@ -0,0 +1,3 @@ +random text content without any structure +line two with some words +line three and more content here diff --git a/tests/fixtures/preprocessors/data/generic_page.html b/tests/fixtures/preprocessors/data/generic_page.html new file mode 100644 index 0000000..edfe8a3 --- /dev/null +++ b/tests/fixtures/preprocessors/data/generic_page.html @@ -0,0 +1,35 @@ + + + + + My Web App + + + + +
+
+

Welcome to My App

+
+
+

This is a generic web page with no email headers.

+

It has navigation, main content, and a footer.

+
+
+

Features

+
    +
  • Fast
  • +
  • Reliable
  • +
  • Secure
  • +
+
+
+ + + diff --git a/tests/fixtures/preprocessors/data/notes.txt b/tests/fixtures/preprocessors/data/notes.txt new file mode 100644 index 0000000..4a66216 --- /dev/null +++ b/tests/fixtures/preprocessors/data/notes.txt @@ -0,0 +1,15 @@ +Meeting notes - April 7, 2026 + +Attendees: Alice, Bob, Charlie + +Discussion points: +- Deploy scheduled for Friday +- Bug fix for login must be completed by Thursday +- Review Q1 numbers before EOW + +Action items: +- Alice: fix login bug +- Bob: prepare deploy checklist +- Charlie: send Q1 report + +Next meeting: April 14, 2026 diff --git a/tests/test_preprocessors.py b/tests/test_preprocessors.py index 83b68cd..00dcff8 100644 --- a/tests/test_preprocessors.py +++ b/tests/test_preprocessors.py @@ -1,221 +1,178 @@ -"""Tests for the preprocessor system (Step 1). +"""Tests for the preprocessor system (Step 1 — Local Agent V2). -Test IDs map to the plan: - 1.1 detect_email, 1.2 detect_generic, 1.3 detect_text, 1.4 detect_unknown - 1.5 email_strip, 1.6 email_metadata, 1.7 email_thread, 1.8 email_single - 1.9 email_heavy_html, 1.10 fallback +Fixtures are driven by: + tests/fixtures/preprocessors/cases.yaml — test case definitions + tests/fixtures/preprocessors/data/ — input files (HTML, txt, ...) Run: pytest tests/test_preprocessors.py -v + # Only detection tests + pytest tests/test_preprocessors.py -v -k detect + + # Only preprocess tests + pytest tests/test_preprocessors.py -v -k preprocess + Langfuse scores are sent when LANGFUSE_SECRET_KEY / LANGFUSE_PUBLIC_KEY are set. 
""" from __future__ import annotations +import re +from pathlib import Path +from typing import Any + import pytest +import yaml -from app.core.preprocessors import detect_content_type, preprocess from app.core.langfuse_client import get_langfuse +from app.core.preprocessors import detect_content_type, preprocess -# ── Fixtures ────────────────────────────────────────────────────────── +# ── Paths ────────────────────────────────────────────────────────────── + +_FIXTURES_DIR = Path(__file__).parent / "fixtures" / "preprocessors" +_DATA_DIR = _FIXTURES_DIR / "data" +_CASES_FILE = _FIXTURES_DIR / "cases.yaml" + +# ── Content generators ───────────────────────────────────────────────── + +_GENERATORS: dict[str, str] = { + # High ratio of non-printable chars → triggers "unknown" heuristic + "binary_noise": "some\x00\x01\x02\x03\x04\x05content" * 20, +} -@pytest.fixture -def sample_email_html() -> str: - return """ - - - Fix the login bug - - - -

Subject: Fix the login bug

-

From: boss@company.com

-

To: dev@company.com

-

Date: Mon, 7 Apr 2026 09:00:00 +0200

-

Please fix the login bug by Friday. It is blocking the release.

def _load_cases() -> list[dict]:
    """Parse the YAML case file and return the list stored under ``cases``.

    The file at ``_CASES_FILE`` is read as UTF-8 and parsed with
    ``yaml.safe_load``.
    """
    with open(_CASES_FILE, encoding="utf-8") as stream:
        document = yaml.safe_load(stream)
    return document["cases"]

From: alice@co.com

-

Subject: Re: Re: Deploy plan

-

Sure, I'll handle the deploy.

- -

On Mon, Apr 6, 2026 at 3:00 PM, Bob <bob@co.com> wrote:

-
-

From: bob@co.com

-

Can you handle the deploy?

-

On Sun, Apr 5, 2026 at 1:00 PM, Alice <alice@co.com> wrote:

-
-

From: alice@co.com

-

Let's plan the deploy for Monday.

-
-
def _read_content(case: dict) -> str:
    """Return the raw input text for one test case.

    A case either names a generator (``generate``) whose content is looked
    up in ``_GENERATORS``, or a ``file`` that is read as UTF-8 text from
    ``_DATA_DIR``. When both keys are present, ``generate`` wins.

    Raises:
        ValueError: if ``generate`` names a generator that is not registered.
    """
    # File-backed case: nothing to generate, just read the fixture.
    if "generate" not in case:
        return (_DATA_DIR / case["file"]).read_text(encoding="utf-8")

    generator_name = case["generate"]
    if generator_name in _GENERATORS:
        return _GENERATORS[generator_name]
    raise ValueError(f"Unknown generator '{generator_name}' in case {case['id']}")
Company Newsletter
From:newsletter@corp.com
Subject:Q1 Results Update
Date:Apr 7, 2026
-

Dear Team,

-

Q1 results are in. Revenue up 15% year-over-year.

-

Please review the attached report.

-
# ── Langfuse helper ───────────────────────────────────────────────────

def _lf_score(score_name: str, value: float, comment: str = "") -> None:
    """Send one numeric score to Langfuse, if a client is available.

    A new trace named ``eval-<score_name>`` is created and the score is
    attached to it. Nothing happens when ``get_langfuse()`` returns a falsy
    value.
    """
    lf = get_langfuse()
    if lf:
        trace = lf.trace(name=f"eval-{score_name}")
        lf.score(
            trace_id=trace.id,
            name=score_name,
            value=value,
            data_type="NUMERIC",
            comment=comment,
        )
        lf.flush()


# ── Assertion engine ──────────────────────────────────────────────────

# Every assertion key the engine understands. Anything else found in a YAML
# case is reported as a failure, so a misspelled key cannot pass vacuously.
_KNOWN_ASSERTIONS = frozenset(
    {
        "no_html_tags",
        "min_length",
        "compression_ratio_lt",
        "metadata_keys",
        "contains",
        "not_contains",
        "content_type",
    }
)


def _as_list(value: Any) -> Any:
    """Wrap a single string in a list; return any other value unchanged."""
    return [value] if isinstance(value, str) else value


def _run_assertions(assertions: dict[str, Any], result: Any, raw: str) -> tuple[float, list[str]]:
    """Run all assertions declared in the YAML case.

    Args:
        assertions: mapping of assertion name to its parameter, as written
            under ``assertions:`` in the case file.
        result: object exposing ``clean_text``, ``metadata`` and
            ``content_type`` (the value returned by ``preprocess``).
        raw: the unprocessed input text, used for the compression ratio.

    Returns:
        ``(score, failures)`` where ``score`` is ``1.0`` when every assertion
        holds and ``0.0`` otherwise, and ``failures`` lists one message per
        failed assertion (empty on success).
    """
    failures: list[str] = []

    # Surface typos such as "not_contain" instead of silently skipping them.
    for key in assertions:
        if key not in _KNOWN_ASSERTIONS:
            failures.append(f"unknown assertion key {key!r}")

    if assertions.get("no_html_tags"):
        if re.search(r"<[^>]+>", result.clean_text):
            failures.append("clean_text still contains HTML tags")

    min_len = assertions.get("min_length")
    if min_len is not None:
        if len(result.clean_text) < min_len:
            failures.append(
                f"clean_text too short: {len(result.clean_text)} < {min_len}"
            )

    ratio_lt = assertions.get("compression_ratio_lt")
    # The ratio is undefined for empty input, so the check is skipped then.
    if ratio_lt is not None and len(raw) > 0:
        ratio = len(result.clean_text) / len(raw)
        if ratio >= ratio_lt:
            failures.append(f"compression ratio {ratio:.2f} >= {ratio_lt}")

    # A key counts as present only when its value is truthy.
    for key in assertions.get("metadata_keys", []):
        if not result.metadata.get(key):
            failures.append(f"metadata missing key '{key}' (got {result.metadata})")

    contains = assertions.get("contains")
    if contains:
        for item in _as_list(contains):
            if item not in result.clean_text:
                failures.append(f"clean_text missing expected substring: {item!r}")

    not_contains = assertions.get("not_contains")
    if not_contains:
        for item in _as_list(not_contains):
            if item in result.clean_text:
                failures.append(f"clean_text contains forbidden substring: {item!r}")

    expected_ct = assertions.get("content_type")
    if expected_ct and result.content_type != expected_ct:
        failures.append(
            f"content_type mismatch: expected {expected_ct!r}, got {result.content_type!r}"
        )

    score = 1.0 if not failures else 0.0
    return score, failures


# ── Parametrized: detect ──────────────────────────────────────────────

Welcome

# Parse cases.yaml once and group the cases by operation. An unrecognized
# ``op`` is an error: otherwise the case would match neither test below and
# would silently never run.
_CASES_BY_OP: dict[str, list[dict]] = {"detect": [], "preprocess": []}
for _case in _load_cases():
    _op = _case.get("op")
    if _op not in _CASES_BY_OP:
        raise ValueError(f"Unknown op {_op!r} in case {_case.get('id')}")
    _CASES_BY_OP[_op].append(_case)

_detect_cases = _CASES_BY_OP["detect"]


@pytest.mark.parametrize(
    "case",
    _detect_cases,
    ids=[c["id"] for c in _detect_cases],
)
def test_detect(case: dict) -> None:
    """Check that ``detect_content_type`` returns the expected type for a case.

    The binary score (1.0 match / 0.0 mismatch) is reported through
    ``_lf_score`` before the assertion, so failures are recorded as well.
    """
    raw = _read_content(case)
    ct = detect_content_type(case["input_filename"], raw)

    expected = case["expected_content_type"]
    score = 1.0 if ct == expected else 0.0
    _lf_score(case["score_name"], score, f"got={ct}, expected={expected}")

    assert ct == expected, (
        f"[{case['id']}] {case['description']}: "
        f"expected content_type={expected!r}, got {ct!r}"
    )


# ── Parametrized: preprocess ──────────────────────────────────────────

_preprocess_cases = _CASES_BY_OP["preprocess"]


@pytest.mark.parametrize(
    "case",
    _preprocess_cases,
    ids=[c["id"] for c in _preprocess_cases],
)
def test_preprocess(case: dict) -> None:
    """Run ``preprocess`` on a case and evaluate its declared assertions.

    The score and a comment (the failure list, or the clean-text length on
    success) are reported through ``_lf_score`` before the test asserts.
    """
    raw = _read_content(case)
    result = preprocess(case["input_content_type"], raw)

    assertions = case.get("assertions", {})
    score, failures = _run_assertions(assertions, result, raw)

    comment = "; ".join(failures) if failures else f"len={len(result.clean_text)}"
    _lf_score(case["score_name"], score, comment)

    assert not failures, (
        f"[{case['id']}] {case['description']} — {len(failures)} assertion(s) failed:\n"
        + "\n".join(f"  • {f}" for f in failures)
    )