refactor(tests): simplify YAML fixture schema and test runner

YAML: rimossi i campi op/description/score_name e il blocco assertions — ora
detect/process sono chiavi dirette del caso e le assertions sono piatte, allo
stesso livello del caso.

Runner: eliminato _run_assertions engine, assertions inline in test_preprocess.
Riduzione da ~170 a ~75 righe totali tra YAML + test.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Roberto Musso
2026-04-07 11:30:38 +02:00
parent 3cc32569d9
commit dcd14220ca
2 changed files with 106 additions and 234 deletions

View File

@@ -1,121 +1,71 @@
# Preprocessor test cases — Step 1 (Local Agent V2)
# Preprocessor test cases
#
# Schema per caso:
# id: "1.N"
# description: str
# score_name: str # nome score inviato a Langfuse
# detect: <expected_type> → chiama detect_content_type(filename, content)
# process: <content_type> → chiama preprocess(content_type, content)
#
# Sorgente contenuto (una delle due):
# file: <nome file in data/> # letto come testo UTF-8
# generate: binary_noise # contenuto generato dal runner (per test binari)
# Sorgente: file: <nome in data/> oppure generate: binary_noise
# filename: nome file passato a detect (default: valore di file:; con generate:
# non c'è un valore di default, quindi va indicato — vedi caso 1.4)
#
# Per op=detect:
# op: detect
# input_filename: str # filename passato a detect_content_type
# expected_content_type: str
#
# Per op=preprocess:
# op: preprocess
# input_content_type: str # content_type passato a preprocess()
# assertions:
# no_html_tags: bool
# min_length: int
# compression_ratio_lt: float # len(clean) / len(raw) < soglia
# metadata_keys: [str, ...] # chiavi che devono essere in metadata
# contains: str | [str, ...] # substring(s) presenti in clean_text
# not_contains: str | [str, ...] # substring(s) assenti da clean_text
# content_type: str # valore atteso di result.content_type
# Assertions piatte (solo per process):
# no_html: true → clean_text senza tag HTML
# min_chars: N → len(clean_text) >= N
# ratio_lt: F → len(clean) / len(raw) < F
# has_meta: [k, ...] → chiavi presenti in metadata
# contains: str | [str] → substring(s) presenti in clean_text
# excludes: str | [str] → substring(s) assenti da clean_text
# content_type: str → result.content_type == questo valore
cases:
- id: "1.1"
file: email_action.html
filename: email_export.html
detect: email_html
# ── Detection tests ────────────────────────────────────────────────
- id: "1.2"
file: generic_page.html
filename: index.html
detect: generic_html
- id: "1.1"
description: "Detect email HTML"
score_name: preprocess.detect_email
file: email_action.html
op: detect
input_filename: email_export.html
expected_content_type: email_html
- id: "1.3"
file: notes.txt
detect: plain_text
- id: "1.2"
description: "Detect generic HTML"
score_name: preprocess.detect_generic
file: generic_page.html
op: detect
input_filename: index.html
expected_content_type: generic_html
- id: "1.4"
generate: binary_noise
filename: archive.xyz
detect: unknown
- id: "1.3"
description: "Detect plain text"
score_name: preprocess.detect_text
file: notes.txt
op: detect
input_filename: notes.txt
expected_content_type: plain_text
- id: "1.5"
file: email_action.html
process: email_html
no_html: true
min_chars: 50
ratio_lt: 0.8
- id: "1.4"
description: "Detect unknown (binary-like content)"
score_name: preprocess.detect_unknown
generate: binary_noise
op: detect
input_filename: archive.xyz
expected_content_type: unknown
- id: "1.6"
file: email_action.html
process: email_html
has_meta: [subject, from]
# ── Preprocess tests ───────────────────────────────────────────────
- id: "1.7"
file: email_thread.html
process: email_html
contains: "Sure, I'll handle the deploy"
excludes: "Let's plan the deploy"
- id: "1.5"
description: "Email: strip HTML tags"
file: email_action.html
op: preprocess
input_content_type: email_html
assertions:
no_html_tags: true
min_length: 50
compression_ratio_lt: 0.8
- id: "1.8"
file: email_single.html
process: email_html
contains: "deploy is done"
- id: "1.6"
description: "Email: extract metadata (Subject + From)"
file: email_action.html
op: preprocess
input_content_type: email_html
assertions:
metadata_keys: [subject, from]
- id: "1.9"
file: email_heavy.html
process: email_html
no_html: true
min_chars: 30
excludes: [border-collapse, font-size]
- id: "1.7"
description: "Email: split thread — solo ultimo messaggio"
file: email_thread.html
op: preprocess
input_content_type: email_html
assertions:
contains: "Sure, I'll handle the deploy"
not_contains: "Let's plan the deploy"
- id: "1.8"
description: "Email: singolo messaggio senza thread"
file: email_single.html
op: preprocess
input_content_type: email_html
assertions:
contains: "deploy is done"
- id: "1.9"
description: "Email: HTML pesante con table layout"
file: email_heavy.html
op: preprocess
input_content_type: email_html
assertions:
no_html_tags: true
min_length: 30
not_contains:
- "border-collapse"
- "font-size"
- id: "1.10"
description: "Fallback: file sconosciuto → testo restituito"
file: fallback.txt
op: preprocess
input_content_type: unknown
assertions:
min_length: 1
content_type: unknown
- id: "1.10"
file: fallback.txt
process: unknown
min_chars: 1
content_type: unknown