refactor(tests): YAML-driven fixtures for preprocessor tests
- cases.yaml: 10 test cases con schema dichiarativo (op, assertions)
- data/: 7 file reali (email_action.html, email_thread.html, email_single.html, email_heavy.html, generic_page.html, notes.txt, fallback.txt)
- test_preprocessors.py: parametrize da YAML via test_detect / test_preprocess; assertion engine generico (no_html_tags, min_length, compression_ratio_lt, metadata_keys, contains, not_contains, content_type)
- requirements.txt: add PyYAML

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -35,4 +35,5 @@ cryptography>=42.0.0
|
|||||||
langfuse>=2.0.0
|
langfuse>=2.0.0
|
||||||
beautifulsoup4>=4.12.0
|
beautifulsoup4>=4.12.0
|
||||||
lxml>=5.0.0
|
lxml>=5.0.0
|
||||||
|
PyYAML>=6.0.0
|
||||||
ruff>=0.8.0
|
ruff>=0.8.0
|
||||||
|
|||||||
127
tests/fixtures/preprocessors/cases.yaml
vendored
Normal file
127
tests/fixtures/preprocessors/cases.yaml
vendored
Normal file
@@ -0,0 +1,127 @@
|
|||||||
|
# Preprocessor test cases — Step 1 (Local Agent V2)
|
||||||
|
#
|
||||||
|
# Schema per caso:
|
||||||
|
# id: "1.N"
|
||||||
|
# description: str
|
||||||
|
# score_name: str # nome score inviato a Langfuse
|
||||||
|
#
|
||||||
|
# Sorgente contenuto (una delle due):
|
||||||
|
# file: <nome file in data/> # letto come testo UTF-8
|
||||||
|
# generate: binary_noise # contenuto generato dal runner (per test binari)
|
||||||
|
#
|
||||||
|
# Per op=detect:
|
||||||
|
# op: detect
|
||||||
|
# input_filename: str # filename passato a detect_content_type
|
||||||
|
# expected_content_type: str
|
||||||
|
#
|
||||||
|
# Per op=preprocess:
|
||||||
|
# op: preprocess
|
||||||
|
# input_content_type: str # content_type passato a preprocess()
|
||||||
|
# assertions:
|
||||||
|
# no_html_tags: bool
|
||||||
|
# min_length: int
|
||||||
|
# compression_ratio_lt: float # len(clean) / len(raw) < soglia
|
||||||
|
# metadata_keys: [str, ...] # chiavi che devono essere in metadata
|
||||||
|
# contains: str | [str, ...] # substring(s) presenti in clean_text
|
||||||
|
# not_contains: str | [str, ...] # substring(s) assenti da clean_text
|
||||||
|
# content_type: str # valore atteso di result.content_type
|
||||||
|
|
||||||
|
cases:
|
||||||
|
|
||||||
|
# ── Detection tests ────────────────────────────────────────────────
|
||||||
|
|
||||||
|
- id: "1.1"
|
||||||
|
description: "Detect email HTML"
|
||||||
|
score_name: preprocess.detect_email
|
||||||
|
file: email_action.html
|
||||||
|
op: detect
|
||||||
|
input_filename: email_export.html
|
||||||
|
expected_content_type: email_html
|
||||||
|
|
||||||
|
- id: "1.2"
|
||||||
|
description: "Detect generic HTML"
|
||||||
|
score_name: preprocess.detect_generic
|
||||||
|
file: generic_page.html
|
||||||
|
op: detect
|
||||||
|
input_filename: index.html
|
||||||
|
expected_content_type: generic_html
|
||||||
|
|
||||||
|
- id: "1.3"
|
||||||
|
description: "Detect plain text"
|
||||||
|
score_name: preprocess.detect_text
|
||||||
|
file: notes.txt
|
||||||
|
op: detect
|
||||||
|
input_filename: notes.txt
|
||||||
|
expected_content_type: plain_text
|
||||||
|
|
||||||
|
- id: "1.4"
|
||||||
|
description: "Detect unknown (binary-like content)"
|
||||||
|
score_name: preprocess.detect_unknown
|
||||||
|
generate: binary_noise
|
||||||
|
op: detect
|
||||||
|
input_filename: archive.xyz
|
||||||
|
expected_content_type: unknown
|
||||||
|
|
||||||
|
# ── Preprocess tests ───────────────────────────────────────────────
|
||||||
|
|
||||||
|
- id: "1.5"
|
||||||
|
description: "Email: strip HTML tags"
|
||||||
|
score_name: preprocess.email_strip
|
||||||
|
file: email_action.html
|
||||||
|
op: preprocess
|
||||||
|
input_content_type: email_html
|
||||||
|
assertions:
|
||||||
|
no_html_tags: true
|
||||||
|
min_length: 50
|
||||||
|
compression_ratio_lt: 0.8
|
||||||
|
|
||||||
|
- id: "1.6"
|
||||||
|
description: "Email: extract metadata (Subject + From)"
|
||||||
|
score_name: preprocess.email_metadata
|
||||||
|
file: email_action.html
|
||||||
|
op: preprocess
|
||||||
|
input_content_type: email_html
|
||||||
|
assertions:
|
||||||
|
metadata_keys: [subject, from]
|
||||||
|
|
||||||
|
- id: "1.7"
|
||||||
|
description: "Email: split thread — solo ultimo messaggio"
|
||||||
|
score_name: preprocess.email_thread
|
||||||
|
file: email_thread.html
|
||||||
|
op: preprocess
|
||||||
|
input_content_type: email_html
|
||||||
|
assertions:
|
||||||
|
contains: "Sure, I'll handle the deploy"
|
||||||
|
not_contains: "Let's plan the deploy"
|
||||||
|
|
||||||
|
- id: "1.8"
|
||||||
|
description: "Email: singolo messaggio senza thread"
|
||||||
|
score_name: preprocess.email_single
|
||||||
|
file: email_single.html
|
||||||
|
op: preprocess
|
||||||
|
input_content_type: email_html
|
||||||
|
assertions:
|
||||||
|
contains: "deploy is done"
|
||||||
|
|
||||||
|
- id: "1.9"
|
||||||
|
description: "Email: HTML pesante con table layout"
|
||||||
|
score_name: preprocess.email_heavy_html
|
||||||
|
file: email_heavy.html
|
||||||
|
op: preprocess
|
||||||
|
input_content_type: email_html
|
||||||
|
assertions:
|
||||||
|
no_html_tags: true
|
||||||
|
min_length: 30
|
||||||
|
not_contains:
|
||||||
|
- "border-collapse"
|
||||||
|
- "font-size"
|
||||||
|
|
||||||
|
- id: "1.10"
|
||||||
|
description: "Fallback: file sconosciuto → testo restituito"
|
||||||
|
score_name: preprocess.fallback
|
||||||
|
file: fallback.txt
|
||||||
|
op: preprocess
|
||||||
|
input_content_type: unknown
|
||||||
|
assertions:
|
||||||
|
min_length: 1
|
||||||
|
content_type: unknown
|
||||||
25
tests/fixtures/preprocessors/data/email_action.html
vendored
Normal file
25
tests/fixtures/preprocessors/data/email_action.html
vendored
Normal file
@@ -0,0 +1,25 @@
|
|||||||
|
<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title>Fix the login bug</title>
|
||||||
|
<style>
|
||||||
|
body { font-family: Arial, sans-serif; color: #333; margin: 0; padding: 20px; }
|
||||||
|
.header { background: #f5f5f5; padding: 10px; border-bottom: 1px solid #ddd; }
|
||||||
|
.body { padding: 20px; }
|
||||||
|
</style>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<div class="header">
|
||||||
|
<p><strong>From:</strong> boss@company.com</p>
|
||||||
|
<p><strong>To:</strong> dev@company.com</p>
|
||||||
|
<p><strong>Subject:</strong> Fix the login bug</p>
|
||||||
|
<p><strong>Date:</strong> Mon, 7 Apr 2026 09:00:00 +0200</p>
|
||||||
|
</div>
|
||||||
|
<div class="body">
|
||||||
|
<p>Hi,</p>
|
||||||
|
<p>Please fix the login bug by Friday. It is blocking the release.</p>
|
||||||
|
<p>Priority: high. Let me know if you need anything.</p>
|
||||||
|
<p>Thanks,<br>Boss</p>
|
||||||
|
</div>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
49
tests/fixtures/preprocessors/data/email_heavy.html
vendored
Normal file
49
tests/fixtures/preprocessors/data/email_heavy.html
vendored
Normal file
@@ -0,0 +1,49 @@
|
|||||||
|
<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<style>
|
||||||
|
table { border-collapse: collapse; width: 100%; max-width: 600px; margin: 0 auto; }
|
||||||
|
td { padding: 8px 12px; border: 1px solid #dddddd; font-size: 12px; color: #444444; }
|
||||||
|
.header-row { background-color: #003366; color: #ffffff; font-weight: bold; }
|
||||||
|
.label-col { background-color: #f0f0f0; width: 80px; font-weight: bold; }
|
||||||
|
.footer-row { font-size: 10px; color: #999999; text-align: center; }
|
||||||
|
</style>
|
||||||
|
</head>
|
||||||
|
<body bgcolor="#eeeeee">
|
||||||
|
<center>
|
||||||
|
<table cellpadding="0" cellspacing="0">
|
||||||
|
<tr class="header-row">
|
||||||
|
<td colspan="2">Company Internal Update</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td class="label-col">From:</td>
|
||||||
|
<td>newsletter@corp.com</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td class="label-col">Subject:</td>
|
||||||
|
<td>Q1 Results Update</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td class="label-col">Date:</td>
|
||||||
|
<td>Apr 7, 2026</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td colspan="2">
|
||||||
|
<table width="100%" cellpadding="10">
|
||||||
|
<tr>
|
||||||
|
<td>
|
||||||
|
<p style="font-size:14px; font-weight:bold;">Dear Team,</p>
|
||||||
|
<p>Q1 results are in. Revenue up 15% year-over-year.</p>
|
||||||
|
<p>Please review the attached report and share any feedback by EOW.</p>
|
||||||
|
</td>
|
||||||
|
</tr>
|
||||||
|
</table>
|
||||||
|
</td>
|
||||||
|
</tr>
|
||||||
|
<tr class="footer-row">
|
||||||
|
<td colspan="2">Confidential — do not forward outside the company.</td>
|
||||||
|
</tr>
|
||||||
|
</table>
|
||||||
|
</center>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
8
tests/fixtures/preprocessors/data/email_single.html
vendored
Normal file
8
tests/fixtures/preprocessors/data/email_single.html
vendored
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
<!DOCTYPE html>
|
||||||
|
<html><body>
|
||||||
|
<p><strong>From:</strong> alice@co.com</p>
|
||||||
|
<p><strong>To:</strong> team@co.com</p>
|
||||||
|
<p><strong>Subject:</strong> Quick update</p>
|
||||||
|
<p><strong>Date:</strong> Tue, 7 Apr 2026 10:30:00 +0200</p>
|
||||||
|
<p>The deploy is done. Everything looks good. No issues so far.</p>
|
||||||
|
</body></html>
|
||||||
24
tests/fixtures/preprocessors/data/email_thread.html
vendored
Normal file
24
tests/fixtures/preprocessors/data/email_thread.html
vendored
Normal file
@@ -0,0 +1,24 @@
|
|||||||
|
<!DOCTYPE html>
|
||||||
|
<html><body>
|
||||||
|
<div class="message-latest">
|
||||||
|
<p><strong>From:</strong> alice@co.com</p>
|
||||||
|
<p><strong>Subject:</strong> Re: Re: Deploy plan</p>
|
||||||
|
<p>Sure, I'll handle the deploy.</p>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<p>On Mon, Apr 6, 2026 at 3:00 PM, Bob <bob@co.com> wrote:</p>
|
||||||
|
<blockquote>
|
||||||
|
<p>From: bob@co.com</p>
|
||||||
|
<p>Can you handle the deploy?</p>
|
||||||
|
<p>On Sun, Apr 5, 2026 at 1:00 PM, Alice <alice@co.com> wrote:</p>
|
||||||
|
<blockquote>
|
||||||
|
<p>From: alice@co.com</p>
|
||||||
|
<p>Let's plan the deploy for Monday.</p>
|
||||||
|
<p>On Sat, Apr 4, 2026 at 11:00 AM, Charlie <charlie@co.com> wrote:</p>
|
||||||
|
<blockquote>
|
||||||
|
<p>From: charlie@co.com</p>
|
||||||
|
<p>We need to schedule the deploy. What day works?</p>
|
||||||
|
</blockquote>
|
||||||
|
</blockquote>
|
||||||
|
</blockquote>
|
||||||
|
</body></html>
|
||||||
3
tests/fixtures/preprocessors/data/fallback.txt
vendored
Normal file
3
tests/fixtures/preprocessors/data/fallback.txt
vendored
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
random text content without any structure
|
||||||
|
line two with some words
|
||||||
|
line three and more content here
|
||||||
35
tests/fixtures/preprocessors/data/generic_page.html
vendored
Normal file
35
tests/fixtures/preprocessors/data/generic_page.html
vendored
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
<!DOCTYPE html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="UTF-8">
|
||||||
|
<title>My Web App</title>
|
||||||
|
<link rel="stylesheet" href="styles.css">
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<nav>
|
||||||
|
<a href="/">Home</a>
|
||||||
|
<a href="/about">About</a>
|
||||||
|
<a href="/contact">Contact</a>
|
||||||
|
</nav>
|
||||||
|
<main>
|
||||||
|
<header>
|
||||||
|
<h1>Welcome to My App</h1>
|
||||||
|
</header>
|
||||||
|
<article>
|
||||||
|
<p>This is a generic web page with no email headers.</p>
|
||||||
|
<p>It has navigation, main content, and a footer.</p>
|
||||||
|
</article>
|
||||||
|
<section>
|
||||||
|
<h2>Features</h2>
|
||||||
|
<ul>
|
||||||
|
<li>Fast</li>
|
||||||
|
<li>Reliable</li>
|
||||||
|
<li>Secure</li>
|
||||||
|
</ul>
|
||||||
|
</section>
|
||||||
|
</main>
|
||||||
|
<footer>
|
||||||
|
<p>© 2026 My App</p>
|
||||||
|
</footer>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
15
tests/fixtures/preprocessors/data/notes.txt
vendored
Normal file
15
tests/fixtures/preprocessors/data/notes.txt
vendored
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
Meeting notes - April 7, 2026
|
||||||
|
|
||||||
|
Attendees: Alice, Bob, Charlie
|
||||||
|
|
||||||
|
Discussion points:
|
||||||
|
- Deploy scheduled for Friday
|
||||||
|
- Bug fix for login must be completed by Thursday
|
||||||
|
- Review Q1 numbers before EOW
|
||||||
|
|
||||||
|
Action items:
|
||||||
|
- Alice: fix login bug
|
||||||
|
- Bob: prepare deploy checklist
|
||||||
|
- Charlie: send Q1 report
|
||||||
|
|
||||||
|
Next meeting: April 14, 2026
|
||||||
@@ -1,221 +1,178 @@
|
|||||||
"""Tests for the preprocessor system (Step 1).
|
"""Tests for the preprocessor system (Step 1 — Local Agent V2).
|
||||||
|
|
||||||
Test IDs map to the plan:
|
Fixtures are driven by:
|
||||||
1.1 detect_email, 1.2 detect_generic, 1.3 detect_text, 1.4 detect_unknown
|
tests/fixtures/preprocessors/cases.yaml — test case definitions
|
||||||
1.5 email_strip, 1.6 email_metadata, 1.7 email_thread, 1.8 email_single
|
tests/fixtures/preprocessors/data/ — input files (HTML, txt, ...)
|
||||||
1.9 email_heavy_html, 1.10 fallback
|
|
||||||
|
|
||||||
Run:
|
Run:
|
||||||
pytest tests/test_preprocessors.py -v
|
pytest tests/test_preprocessors.py -v
|
||||||
|
|
||||||
|
# Only detection tests
|
||||||
|
pytest tests/test_preprocessors.py -v -k detect
|
||||||
|
|
||||||
|
# Only preprocess tests
|
||||||
|
pytest tests/test_preprocessors.py -v -k preprocess
|
||||||
|
|
||||||
Langfuse scores are sent when LANGFUSE_SECRET_KEY / LANGFUSE_PUBLIC_KEY are set.
|
Langfuse scores are sent when LANGFUSE_SECRET_KEY / LANGFUSE_PUBLIC_KEY are set.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
import yaml
|
||||||
|
|
||||||
from app.core.preprocessors import detect_content_type, preprocess
|
|
||||||
from app.core.langfuse_client import get_langfuse
|
from app.core.langfuse_client import get_langfuse
|
||||||
|
from app.core.preprocessors import detect_content_type, preprocess
|
||||||
|
|
||||||
# ── Fixtures ──────────────────────────────────────────────────────────
|
# ── Paths ──────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
_FIXTURES_DIR = Path(__file__).parent / "fixtures" / "preprocessors"
|
||||||
|
_DATA_DIR = _FIXTURES_DIR / "data"
|
||||||
|
_CASES_FILE = _FIXTURES_DIR / "cases.yaml"
|
||||||
|
|
||||||
|
# ── Content generators ─────────────────────────────────────────────────
|
||||||
|
|
||||||
|
_GENERATORS: dict[str, str] = {
|
||||||
|
# High ratio of non-printable chars → triggers "unknown" heuristic
|
||||||
|
"binary_noise": "some\x00\x01\x02\x03\x04\x05content" * 20,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
def _load_cases() -> list[dict]:
|
||||||
def sample_email_html() -> str:
|
with _CASES_FILE.open(encoding="utf-8") as f:
|
||||||
return """<!DOCTYPE html>
|
return yaml.safe_load(f)["cases"]
|
||||||
<html>
|
|
||||||
<head>
|
|
||||||
<title>Fix the login bug</title>
|
|
||||||
<style>body { font-family: Arial; color: #333; }</style>
|
|
||||||
</head>
|
|
||||||
<body>
|
|
||||||
<p>Subject: Fix the login bug</p>
|
|
||||||
<p>From: boss@company.com</p>
|
|
||||||
<p>To: dev@company.com</p>
|
|
||||||
<p>Date: Mon, 7 Apr 2026 09:00:00 +0200</p>
|
|
||||||
<p>Please fix the login bug by Friday. It is blocking the release.</p>
|
|
||||||
</body>
|
|
||||||
</html>"""
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
def _read_content(case: dict) -> str:
|
||||||
def sample_thread_email_html() -> str:
|
if "generate" in case:
|
||||||
return """<!DOCTYPE html>
|
key = case["generate"]
|
||||||
<html><body>
|
if key not in _GENERATORS:
|
||||||
<p>From: alice@co.com</p>
|
raise ValueError(f"Unknown generator '{key}' in case {case['id']}")
|
||||||
<p>Subject: Re: Re: Deploy plan</p>
|
return _GENERATORS[key]
|
||||||
<p>Sure, I'll handle the deploy.</p>
|
file_path = _DATA_DIR / case["file"]
|
||||||
|
return file_path.read_text(encoding="utf-8")
|
||||||
<p>On Mon, Apr 6, 2026 at 3:00 PM, Bob <bob@co.com> wrote:</p>
|
|
||||||
<blockquote>
|
|
||||||
<p>From: bob@co.com</p>
|
|
||||||
<p>Can you handle the deploy?</p>
|
|
||||||
<p>On Sun, Apr 5, 2026 at 1:00 PM, Alice <alice@co.com> wrote:</p>
|
|
||||||
<blockquote>
|
|
||||||
<p>From: alice@co.com</p>
|
|
||||||
<p>Let's plan the deploy for Monday.</p>
|
|
||||||
</blockquote>
|
|
||||||
</blockquote>
|
|
||||||
</body></html>"""
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
# ── Langfuse helper ───────────────────────────────────────────────────
|
||||||
def sample_heavy_html_email() -> str:
|
|
||||||
return """<!DOCTYPE html>
|
|
||||||
<html><head>
|
|
||||||
<style>
|
|
||||||
table { border-collapse: collapse; width: 100%; }
|
|
||||||
td { padding: 8px; border: 1px solid #ddd; font-size: 12px; }
|
|
||||||
.header { background: #003366; color: white; }
|
|
||||||
.footer { font-size: 10px; color: #999; }
|
|
||||||
</style>
|
|
||||||
</head><body>
|
|
||||||
<table>
|
|
||||||
<tr class="header"><td colspan="2">Company Newsletter</td></tr>
|
|
||||||
<tr><td>From:</td><td>newsletter@corp.com</td></tr>
|
|
||||||
<tr><td>Subject:</td><td>Q1 Results Update</td></tr>
|
|
||||||
<tr><td>Date:</td><td>Apr 7, 2026</td></tr>
|
|
||||||
<tr><td colspan="2">
|
|
||||||
<p>Dear Team,</p>
|
|
||||||
<p>Q1 results are in. Revenue up 15% year-over-year.</p>
|
|
||||||
<p>Please review the attached report.</p>
|
|
||||||
</td></tr>
|
|
||||||
<tr class="footer"><td colspan="2">Confidential — do not forward</td></tr>
|
|
||||||
</table>
|
|
||||||
</body></html>"""
|
|
||||||
|
|
||||||
|
def _lf_score(score_name: str, value: float, comment: str = "") -> None:
|
||||||
# ── Helper ────────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
def _score(name: str, value: float, comment: str = "") -> None:
|
|
||||||
lf = get_langfuse()
|
lf = get_langfuse()
|
||||||
if lf:
|
if lf:
|
||||||
trace = lf.trace(name=f"eval-{name}")
|
trace = lf.trace(name=f"eval-{score_name}")
|
||||||
lf.score(trace_id=trace.id, name=name, value=value,
|
lf.score(
|
||||||
data_type="NUMERIC", comment=comment)
|
trace_id=trace.id,
|
||||||
|
name=score_name,
|
||||||
|
value=value,
|
||||||
|
data_type="NUMERIC",
|
||||||
|
comment=comment,
|
||||||
|
)
|
||||||
lf.flush()
|
lf.flush()
|
||||||
|
|
||||||
|
|
||||||
# ── 1.1 — Detect email HTML ───────────────────────────────────────────
|
# ── Assertion engine ──────────────────────────────────────────────────
|
||||||
|
|
||||||
def test_detect_email_html(sample_email_html):
|
def _run_assertions(assertions: dict[str, Any], result: Any, raw: str) -> tuple[float, list[str]]:
|
||||||
ct = detect_content_type("email_export.html", sample_email_html)
|
"""Run all assertions declared in the YAML case.
|
||||||
score = 1.0 if ct == "email_html" else 0.0
|
|
||||||
_score("preprocess.detect_email", score)
|
Returns (score 0.0–1.0, list of failure messages).
|
||||||
assert ct == "email_html", f"Expected 'email_html', got '{ct}'"
|
"""
|
||||||
|
failures: list[str] = []
|
||||||
|
|
||||||
|
if assertions.get("no_html_tags"):
|
||||||
|
if re.search(r"<[^>]+>", result.clean_text):
|
||||||
|
failures.append("clean_text still contains HTML tags")
|
||||||
|
|
||||||
|
min_len = assertions.get("min_length")
|
||||||
|
if min_len is not None:
|
||||||
|
if len(result.clean_text) < min_len:
|
||||||
|
failures.append(
|
||||||
|
f"clean_text too short: {len(result.clean_text)} < {min_len}"
|
||||||
|
)
|
||||||
|
|
||||||
|
ratio_lt = assertions.get("compression_ratio_lt")
|
||||||
|
if ratio_lt is not None and len(raw) > 0:
|
||||||
|
ratio = len(result.clean_text) / len(raw)
|
||||||
|
if ratio >= ratio_lt:
|
||||||
|
failures.append(f"compression ratio {ratio:.2f} >= {ratio_lt}")
|
||||||
|
|
||||||
|
meta_keys = assertions.get("metadata_keys", [])
|
||||||
|
for key in meta_keys:
|
||||||
|
if not result.metadata.get(key):
|
||||||
|
failures.append(f"metadata missing key '{key}' (got {result.metadata})")
|
||||||
|
|
||||||
|
contains = assertions.get("contains")
|
||||||
|
if contains:
|
||||||
|
items = [contains] if isinstance(contains, str) else contains
|
||||||
|
for item in items:
|
||||||
|
if item not in result.clean_text:
|
||||||
|
failures.append(f"clean_text missing expected substring: {item!r}")
|
||||||
|
|
||||||
|
not_contains = assertions.get("not_contains")
|
||||||
|
if not_contains:
|
||||||
|
items = [not_contains] if isinstance(not_contains, str) else not_contains
|
||||||
|
for item in items:
|
||||||
|
if item in result.clean_text:
|
||||||
|
failures.append(f"clean_text contains forbidden substring: {item!r}")
|
||||||
|
|
||||||
|
expected_ct = assertions.get("content_type")
|
||||||
|
if expected_ct and result.content_type != expected_ct:
|
||||||
|
failures.append(
|
||||||
|
f"content_type mismatch: expected {expected_ct!r}, got {result.content_type!r}"
|
||||||
|
)
|
||||||
|
|
||||||
|
score = 1.0 if not failures else 0.0
|
||||||
|
return score, failures
|
||||||
|
|
||||||
|
|
||||||
# ── 1.2 — Detect generic HTML ─────────────────────────────────────────
|
# ── Parametrized: detect ──────────────────────────────────────────────
|
||||||
|
|
||||||
def test_detect_generic_html():
|
_detect_cases = [c for c in _load_cases() if c["op"] == "detect"]
|
||||||
generic = """<!DOCTYPE html><html><head><title>My App</title></head>
|
|
||||||
<body><nav><a href="/">Home</a></nav><main><p>Welcome</p></main></body></html>"""
|
|
||||||
ct = detect_content_type("index.html", generic)
|
|
||||||
score = 1.0 if ct == "generic_html" else 0.0
|
|
||||||
_score("preprocess.detect_generic", score)
|
|
||||||
assert ct == "generic_html", f"Expected 'generic_html', got '{ct}'"
|
|
||||||
|
|
||||||
|
|
||||||
# ── 1.3 — Detect plain text ───────────────────────────────────────────
|
@pytest.mark.parametrize(
|
||||||
|
"case",
|
||||||
|
_detect_cases,
|
||||||
|
ids=[c["id"] for c in _detect_cases],
|
||||||
|
)
|
||||||
|
def test_detect(case: dict) -> None:
|
||||||
|
raw = _read_content(case)
|
||||||
|
ct = detect_content_type(case["input_filename"], raw)
|
||||||
|
|
||||||
def test_detect_plain_text():
|
expected = case["expected_content_type"]
|
||||||
ct = detect_content_type("notes.txt", "Just some notes here.\nNo HTML at all.")
|
score = 1.0 if ct == expected else 0.0
|
||||||
score = 1.0 if ct == "plain_text" else 0.0
|
_lf_score(case["score_name"], score, f"got={ct}, expected={expected}")
|
||||||
_score("preprocess.detect_text", score)
|
|
||||||
assert ct == "plain_text", f"Expected 'plain_text', got '{ct}'"
|
assert ct == expected, (
|
||||||
|
f"[{case['id']}] {case['description']}: "
|
||||||
|
f"expected content_type={expected!r}, got {ct!r}"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
# ── 1.4 — Detect unknown ──────────────────────────────────────────────
|
# ── Parametrized: preprocess ──────────────────────────────────────────
|
||||||
|
|
||||||
def test_detect_unknown():
|
_preprocess_cases = [c for c in _load_cases() if c["op"] == "preprocess"]
|
||||||
# Simulate binary-like content with non-printable chars
|
|
||||||
binary_like = "some\x00\x01\x02\x03\x04\x05content" * 20
|
|
||||||
ct = detect_content_type("archive.xyz", binary_like)
|
|
||||||
score = 1.0 if ct == "unknown" else 0.0
|
|
||||||
_score("preprocess.detect_unknown", score)
|
|
||||||
assert ct == "unknown", f"Expected 'unknown', got '{ct}'"
|
|
||||||
|
|
||||||
|
|
||||||
# ── 1.5 — Email: strip HTML tags ─────────────────────────────────────
|
@pytest.mark.parametrize(
|
||||||
|
"case",
|
||||||
|
_preprocess_cases,
|
||||||
|
ids=[c["id"] for c in _preprocess_cases],
|
||||||
|
)
|
||||||
|
def test_preprocess(case: dict) -> None:
|
||||||
|
raw = _read_content(case)
|
||||||
|
result = preprocess(case["input_content_type"], raw)
|
||||||
|
|
||||||
def test_email_strip_html(sample_email_html):
|
assertions = case.get("assertions", {})
|
||||||
result = preprocess("email_html", sample_email_html)
|
score, failures = _run_assertions(assertions, result, raw)
|
||||||
has_no_tags = "<" not in result.clean_text
|
|
||||||
has_content = len(result.clean_text) > 50
|
|
||||||
ratio = len(result.clean_text) / len(sample_email_html)
|
|
||||||
score = 1.0 if (has_no_tags and has_content and ratio < 0.8) else 0.0
|
|
||||||
_score("preprocess.email_strip", score, f"ratio={ratio:.2f}, len={len(result.clean_text)}")
|
|
||||||
assert has_no_tags, "clean_text still contains HTML tags"
|
|
||||||
assert has_content, "clean_text is too short"
|
|
||||||
|
|
||||||
|
comment = "; ".join(failures) if failures else f"len={len(result.clean_text)}"
|
||||||
|
_lf_score(case["score_name"], score, comment)
|
||||||
|
|
||||||
# ── 1.6 — Email: extract metadata ────────────────────────────────────
|
assert not failures, (
|
||||||
|
f"[{case['id']}] {case['description']} — {len(failures)} assertion(s) failed:\n"
|
||||||
def test_email_extract_metadata(sample_email_html):
|
+ "\n".join(f" • {f}" for f in failures)
|
||||||
result = preprocess("email_html", sample_email_html)
|
)
|
||||||
has_subject = bool(result.metadata.get("subject"))
|
|
||||||
has_from = bool(result.metadata.get("from"))
|
|
||||||
score = 1.0 if (has_subject and has_from) else 0.5 if (has_subject or has_from) else 0.0
|
|
||||||
_score("preprocess.email_metadata", score,
|
|
||||||
f"subject={result.metadata.get('subject')}, from={result.metadata.get('from')}")
|
|
||||||
assert has_subject, f"metadata missing 'subject'. Got: {result.metadata}"
|
|
||||||
assert has_from, f"metadata missing 'from'. Got: {result.metadata}"
|
|
||||||
|
|
||||||
|
|
||||||
# ── 1.7 — Email: split thread ─────────────────────────────────────────
|
|
||||||
|
|
||||||
def test_email_split_thread(sample_thread_email_html):
|
|
||||||
result = preprocess("email_html", sample_thread_email_html)
|
|
||||||
# The latest message is "Sure, I'll handle the deploy."
|
|
||||||
# Quoted content from Bob/Alice should not appear in clean_text
|
|
||||||
has_latest = "Sure, I'll handle the deploy" in result.clean_text
|
|
||||||
lacks_quoted = "Let's plan the deploy" not in result.clean_text
|
|
||||||
score = 1.0 if (has_latest and lacks_quoted) else 0.5 if has_latest else 0.0
|
|
||||||
_score("preprocess.email_thread", score,
|
|
||||||
f"has_latest={has_latest}, lacks_quoted={lacks_quoted}")
|
|
||||||
assert has_latest, "Latest message not found in clean_text"
|
|
||||||
assert lacks_quoted, "Quoted older message leaked into clean_text"
|
|
||||||
|
|
||||||
|
|
||||||
# ── 1.8 — Email: single message (no thread) ──────────────────────────
|
|
||||||
|
|
||||||
def test_email_single_message():
|
|
||||||
single = """<!DOCTYPE html><html><body>
|
|
||||||
<p>From: alice@co.com</p>
|
|
||||||
<p>Subject: Quick update</p>
|
|
||||||
<p>The deploy is done. Everything looks good.</p>
|
|
||||||
</body></html>"""
|
|
||||||
result = preprocess("email_html", single)
|
|
||||||
has_body = "deploy is done" in result.clean_text
|
|
||||||
score = 1.0 if has_body else 0.0
|
|
||||||
_score("preprocess.email_single", score)
|
|
||||||
assert has_body, "Body of single message not found in clean_text"
|
|
||||||
|
|
||||||
|
|
||||||
# ── 1.9 — Email: heavy HTML (table layout) ───────────────────────────
|
|
||||||
|
|
||||||
def test_email_heavy_html(sample_heavy_html_email):
|
|
||||||
result = preprocess("email_html", sample_heavy_html_email)
|
|
||||||
has_no_tags = "<" not in result.clean_text
|
|
||||||
has_content = len(result.clean_text) > 30
|
|
||||||
# CSS properties should not appear in clean text
|
|
||||||
no_css = "border-collapse" not in result.clean_text and "font-size" not in result.clean_text
|
|
||||||
score = 1.0 if (has_no_tags and has_content and no_css) else 0.0
|
|
||||||
_score("preprocess.email_heavy_html", score,
|
|
||||||
f"no_tags={has_no_tags}, has_content={has_content}, no_css={no_css}")
|
|
||||||
assert has_no_tags, "HTML tags found in clean_text"
|
|
||||||
assert has_content, "clean_text is empty"
|
|
||||||
assert no_css, "CSS properties leaked into clean_text"
|
|
||||||
|
|
||||||
|
|
||||||
# ── 1.10 — Fallback: unknown file type ───────────────────────────────
|
|
||||||
|
|
||||||
def test_fallback_unknown_content():
|
|
||||||
raw = "random text content without any structure\nline two\nline three"
|
|
||||||
result = preprocess("unknown", raw)
|
|
||||||
has_text = len(result.clean_text) > 0
|
|
||||||
score = 1.0 if has_text else 0.0
|
|
||||||
_score("preprocess.fallback", score)
|
|
||||||
assert has_text, "fallback handler returned empty clean_text"
|
|
||||||
assert result.content_type == "unknown"
|
|
||||||
|
|||||||
Reference in New Issue
Block a user