refactor(tests): YAML-driven fixtures for preprocessor tests
- cases.yaml: 10 test cases with a declarative schema (op, assertions) - data/: 7 real files (email_action.html, email_thread.html, email_single.html, email_heavy.html, generic_page.html, notes.txt, fallback.txt) - test_preprocessors.py: parametrized from YAML via test_detect / test_preprocess; generic assertion engine (no_html_tags, min_length, compression_ratio_lt, metadata_keys, contains, not_contains, content_type) - requirements.txt: add PyYAML Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
127
tests/fixtures/preprocessors/cases.yaml
vendored
Normal file
127
tests/fixtures/preprocessors/cases.yaml
vendored
Normal file
@@ -0,0 +1,127 @@
|
||||
# Preprocessor test cases — Step 1 (Local Agent V2)
|
||||
#
|
||||
# Schema for each case:
|
||||
# id: "1.N"
|
||||
# description: str
|
||||
# score_name: str # name of the score sent to Langfuse
|
||||
#
|
||||
# Content source (exactly one of the two):
|
||||
# file: <file name in data/> # read as UTF-8 text
|
||||
# generate: binary_noise # content generated by the runner (for binary tests)
|
||||
#
|
||||
# For op=detect:
|
||||
# op: detect
|
||||
# input_filename: str # filename passed to detect_content_type
|
||||
# expected_content_type: str
|
||||
#
|
||||
# For op=preprocess:
|
||||
# op: preprocess
|
||||
# input_content_type: str # content_type passed to preprocess()
|
||||
# assertions:
|
||||
# no_html_tags: bool
|
||||
# min_length: int
|
||||
# compression_ratio_lt: float # len(clean) / len(raw) < threshold
|
||||
# metadata_keys: [str, ...] # keys that must be present in metadata
|
||||
# contains: str | [str, ...] # substring(s) that must appear in clean_text
|
||||
# not_contains: str | [str, ...] # substring(s) that must not appear in clean_text
|
||||
# content_type: str # expected value of result.content_type
|
||||
|
||||
cases:
|
||||
|
||||
# ── Detection tests ────────────────────────────────────────────────
|
||||
|
||||
- id: "1.1"
|
||||
description: "Detect email HTML"
|
||||
score_name: preprocess.detect_email
|
||||
file: email_action.html
|
||||
op: detect
|
||||
input_filename: email_export.html
|
||||
expected_content_type: email_html
|
||||
|
||||
- id: "1.2"
|
||||
description: "Detect generic HTML"
|
||||
score_name: preprocess.detect_generic
|
||||
file: generic_page.html
|
||||
op: detect
|
||||
input_filename: index.html
|
||||
expected_content_type: generic_html
|
||||
|
||||
- id: "1.3"
|
||||
description: "Detect plain text"
|
||||
score_name: preprocess.detect_text
|
||||
file: notes.txt
|
||||
op: detect
|
||||
input_filename: notes.txt
|
||||
expected_content_type: plain_text
|
||||
|
||||
- id: "1.4"
|
||||
description: "Detect unknown (binary-like content)"
|
||||
score_name: preprocess.detect_unknown
|
||||
generate: binary_noise
|
||||
op: detect
|
||||
input_filename: archive.xyz
|
||||
expected_content_type: unknown
|
||||
|
||||
# ── Preprocess tests ───────────────────────────────────────────────
|
||||
|
||||
- id: "1.5"
|
||||
description: "Email: strip HTML tags"
|
||||
score_name: preprocess.email_strip
|
||||
file: email_action.html
|
||||
op: preprocess
|
||||
input_content_type: email_html
|
||||
assertions:
|
||||
no_html_tags: true
|
||||
min_length: 50
|
||||
compression_ratio_lt: 0.8
|
||||
|
||||
- id: "1.6"
|
||||
description: "Email: extract metadata (Subject + From)"
|
||||
score_name: preprocess.email_metadata
|
||||
file: email_action.html
|
||||
op: preprocess
|
||||
input_content_type: email_html
|
||||
assertions:
|
||||
metadata_keys: [subject, from]
|
||||
|
||||
- id: "1.7"
|
||||
description: "Email: split thread — solo ultimo messaggio"
|
||||
score_name: preprocess.email_thread
|
||||
file: email_thread.html
|
||||
op: preprocess
|
||||
input_content_type: email_html
|
||||
assertions:
|
||||
contains: "Sure, I'll handle the deploy"
|
||||
not_contains: "Let's plan the deploy"
|
||||
|
||||
- id: "1.8"
|
||||
description: "Email: singolo messaggio senza thread"
|
||||
score_name: preprocess.email_single
|
||||
file: email_single.html
|
||||
op: preprocess
|
||||
input_content_type: email_html
|
||||
assertions:
|
||||
contains: "deploy is done"
|
||||
|
||||
- id: "1.9"
|
||||
description: "Email: HTML pesante con table layout"
|
||||
score_name: preprocess.email_heavy_html
|
||||
file: email_heavy.html
|
||||
op: preprocess
|
||||
input_content_type: email_html
|
||||
assertions:
|
||||
no_html_tags: true
|
||||
min_length: 30
|
||||
not_contains:
|
||||
- "border-collapse"
|
||||
- "font-size"
|
||||
|
||||
- id: "1.10"
|
||||
description: "Fallback: file sconosciuto → testo restituito"
|
||||
score_name: preprocess.fallback
|
||||
file: fallback.txt
|
||||
op: preprocess
|
||||
input_content_type: unknown
|
||||
assertions:
|
||||
min_length: 1
|
||||
content_type: unknown
|
||||
25
tests/fixtures/preprocessors/data/email_action.html
vendored
Normal file
25
tests/fixtures/preprocessors/data/email_action.html
vendored
Normal file
@@ -0,0 +1,25 @@
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<title>Fix the login bug</title>
|
||||
<style>
|
||||
body { font-family: Arial, sans-serif; color: #333; margin: 0; padding: 20px; }
|
||||
.header { background: #f5f5f5; padding: 10px; border-bottom: 1px solid #ddd; }
|
||||
.body { padding: 20px; }
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<div class="header">
|
||||
<p><strong>From:</strong> boss@company.com</p>
|
||||
<p><strong>To:</strong> dev@company.com</p>
|
||||
<p><strong>Subject:</strong> Fix the login bug</p>
|
||||
<p><strong>Date:</strong> Tue, 7 Apr 2026 09:00:00 +0200</p>
|
||||
</div>
|
||||
<div class="body">
|
||||
<p>Hi,</p>
|
||||
<p>Please fix the login bug by Friday. It is blocking the release.</p>
|
||||
<p>Priority: high. Let me know if you need anything.</p>
|
||||
<p>Thanks,<br>Boss</p>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
49
tests/fixtures/preprocessors/data/email_heavy.html
vendored
Normal file
49
tests/fixtures/preprocessors/data/email_heavy.html
vendored
Normal file
@@ -0,0 +1,49 @@
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<style>
|
||||
table { border-collapse: collapse; width: 100%; max-width: 600px; margin: 0 auto; }
|
||||
td { padding: 8px 12px; border: 1px solid #dddddd; font-size: 12px; color: #444444; }
|
||||
.header-row { background-color: #003366; color: #ffffff; font-weight: bold; }
|
||||
.label-col { background-color: #f0f0f0; width: 80px; font-weight: bold; }
|
||||
.footer-row { font-size: 10px; color: #999999; text-align: center; }
|
||||
</style>
|
||||
</head>
|
||||
<body bgcolor="#eeeeee">
|
||||
<center>
|
||||
<table cellpadding="0" cellspacing="0">
|
||||
<tr class="header-row">
|
||||
<td colspan="2">Company Internal Update</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="label-col">From:</td>
|
||||
<td>newsletter@corp.com</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="label-col">Subject:</td>
|
||||
<td>Q1 Results Update</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="label-col">Date:</td>
|
||||
<td>Apr 7, 2026</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td colspan="2">
|
||||
<table width="100%" cellpadding="10">
|
||||
<tr>
|
||||
<td>
|
||||
<p style="font-size:14px; font-weight:bold;">Dear Team,</p>
|
||||
<p>Q1 results are in. Revenue up 15% year-over-year.</p>
|
||||
<p>Please review the attached report and share any feedback by EOW.</p>
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
</td>
|
||||
</tr>
|
||||
<tr class="footer-row">
|
||||
<td colspan="2">Confidential — do not forward outside the company.</td>
|
||||
</tr>
|
||||
</table>
|
||||
</center>
|
||||
</body>
|
||||
</html>
|
||||
8
tests/fixtures/preprocessors/data/email_single.html
vendored
Normal file
8
tests/fixtures/preprocessors/data/email_single.html
vendored
Normal file
@@ -0,0 +1,8 @@
|
||||
<!DOCTYPE html>
|
||||
<html><body>
|
||||
<p><strong>From:</strong> alice@co.com</p>
|
||||
<p><strong>To:</strong> team@co.com</p>
|
||||
<p><strong>Subject:</strong> Quick update</p>
|
||||
<p><strong>Date:</strong> Tue, 7 Apr 2026 10:30:00 +0200</p>
|
||||
<p>The deploy is done. Everything looks good. No issues so far.</p>
|
||||
</body></html>
|
||||
24
tests/fixtures/preprocessors/data/email_thread.html
vendored
Normal file
24
tests/fixtures/preprocessors/data/email_thread.html
vendored
Normal file
@@ -0,0 +1,24 @@
|
||||
<!DOCTYPE html>
|
||||
<html><body>
|
||||
<div class="message-latest">
|
||||
<p><strong>From:</strong> alice@co.com</p>
|
||||
<p><strong>Subject:</strong> Re: Re: Deploy plan</p>
|
||||
<p>Sure, I'll handle the deploy.</p>
|
||||
</div>
|
||||
|
||||
<p>On Mon, Apr 6, 2026 at 3:00 PM, Bob &lt;bob@co.com&gt; wrote:</p>
|
||||
<blockquote>
|
||||
<p>From: bob@co.com</p>
|
||||
<p>Can you handle the deploy?</p>
|
||||
<p>On Sun, Apr 5, 2026 at 1:00 PM, Alice &lt;alice@co.com&gt; wrote:</p>
|
||||
<blockquote>
|
||||
<p>From: alice@co.com</p>
|
||||
<p>Let's plan the deploy for Monday.</p>
|
||||
<p>On Sat, Apr 4, 2026 at 11:00 AM, Charlie &lt;charlie@co.com&gt; wrote:</p>
|
||||
<blockquote>
|
||||
<p>From: charlie@co.com</p>
|
||||
<p>We need to schedule the deploy. What day works?</p>
|
||||
</blockquote>
|
||||
</blockquote>
|
||||
</blockquote>
|
||||
</body></html>
|
||||
3
tests/fixtures/preprocessors/data/fallback.txt
vendored
Normal file
3
tests/fixtures/preprocessors/data/fallback.txt
vendored
Normal file
@@ -0,0 +1,3 @@
|
||||
random text content without any structure
|
||||
line two with some words
|
||||
line three and more content here
|
||||
35
tests/fixtures/preprocessors/data/generic_page.html
vendored
Normal file
35
tests/fixtures/preprocessors/data/generic_page.html
vendored
Normal file
@@ -0,0 +1,35 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<title>My Web App</title>
|
||||
<link rel="stylesheet" href="styles.css">
|
||||
</head>
|
||||
<body>
|
||||
<nav>
|
||||
<a href="/">Home</a>
|
||||
<a href="/about">About</a>
|
||||
<a href="/contact">Contact</a>
|
||||
</nav>
|
||||
<main>
|
||||
<header>
|
||||
<h1>Welcome to My App</h1>
|
||||
</header>
|
||||
<article>
|
||||
<p>This is a generic web page with no email headers.</p>
|
||||
<p>It has navigation, main content, and a footer.</p>
|
||||
</article>
|
||||
<section>
|
||||
<h2>Features</h2>
|
||||
<ul>
|
||||
<li>Fast</li>
|
||||
<li>Reliable</li>
|
||||
<li>Secure</li>
|
||||
</ul>
|
||||
</section>
|
||||
</main>
|
||||
<footer>
|
||||
<p>© 2026 My App</p>
|
||||
</footer>
|
||||
</body>
|
||||
</html>
|
||||
15
tests/fixtures/preprocessors/data/notes.txt
vendored
Normal file
15
tests/fixtures/preprocessors/data/notes.txt
vendored
Normal file
@@ -0,0 +1,15 @@
|
||||
Meeting notes - April 7, 2026
|
||||
|
||||
Attendees: Alice, Bob, Charlie
|
||||
|
||||
Discussion points:
|
||||
- Deploy scheduled for Friday
|
||||
- Bug fix for login must be completed by Thursday
|
||||
- Review Q1 numbers before EOW
|
||||
|
||||
Action items:
|
||||
- Alice: fix login bug
|
||||
- Bob: prepare deploy checklist
|
||||
- Charlie: send Q1 report
|
||||
|
||||
Next meeting: April 14, 2026
|
||||
Reference in New Issue
Block a user