- cases.yaml: 10 test cases con schema dichiarativo (op, assertions) - data/: 7 file reali (email_action.html, email_thread.html, email_single.html, email_heavy.html, generic_page.html, notes.txt, fallback.txt) - test_preprocessors.py: parametrize da YAML via test_detect / test_preprocess; assertion engine generico (no_html_tags, min_length, compression_ratio, metadata_keys, contains, not_contains, content_type) - requirements.txt: add PyYAML Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
128 lines
3.8 KiB
YAML
128 lines
3.8 KiB
YAML
# Preprocessor test cases — Step 1 (Local Agent V2)
#
# Schema for each case:
#   id: "1.N"
#   description: str
#   score_name: str                # name of the score sent to Langfuse
#
#   Content source (one of the two):
#     file: <file name in data/>   # read as UTF-8 text
#     generate: binary_noise       # content generated by the runner (for binary tests)
#
#   For op=detect:
#     op: detect
#     input_filename: str          # filename passed to detect_content_type
#     expected_content_type: str
#
#   For op=preprocess:
#     op: preprocess
#     input_content_type: str      # content_type passed to preprocess()
#     assertions:
#       no_html_tags: bool
#       min_length: int
#       compression_ratio_lt: float      # len(clean) / len(raw) < threshold
#       metadata_keys: [str, ...]        # keys that must be present in metadata
#       contains: str | [str, ...]       # substring(s) present in clean_text
#       not_contains: str | [str, ...]   # substring(s) absent from clean_text
#       content_type: str                # expected value of result.content_type

# List of test cases. Each entry names a content source (`file` or
# `generate`), an `op` (detect | preprocess), and the fields that op needs.
cases:

  # ── Detection tests ────────────────────────────────────────────────

  - id: "1.1"
    description: "Detect email HTML"
    score_name: preprocess.detect_email
    file: email_action.html
    op: detect
    input_filename: email_export.html
    expected_content_type: email_html

  - id: "1.2"
    description: "Detect generic HTML"
    score_name: preprocess.detect_generic
    file: generic_page.html
    op: detect
    input_filename: index.html
    expected_content_type: generic_html

  - id: "1.3"
    description: "Detect plain text"
    score_name: preprocess.detect_text
    file: notes.txt
    op: detect
    input_filename: notes.txt
    expected_content_type: plain_text

  # No fixture file here: the content comes from the `binary_noise` generator.
  - id: "1.4"
    description: "Detect unknown (binary-like content)"
    score_name: preprocess.detect_unknown
    generate: binary_noise
    op: detect
    input_filename: archive.xyz
    expected_content_type: unknown

  # ── Preprocess tests ───────────────────────────────────────────────

  - id: "1.5"
    description: "Email: strip HTML tags"
    score_name: preprocess.email_strip
    file: email_action.html
    op: preprocess
    input_content_type: email_html
    assertions:
      no_html_tags: true
      min_length: 50
      compression_ratio_lt: 0.8

  # Same fixture as 1.5, but only the extracted metadata keys are checked.
  - id: "1.6"
    description: "Email: extract metadata (Subject + From)"
    score_name: preprocess.email_metadata
    file: email_action.html
    op: preprocess
    input_content_type: email_html
    assertions:
      metadata_keys: [subject, from]

  # NOTE(review): the two phrases are presumably the latest reply and an
  # earlier quoted message in email_thread.html — confirm against the fixture.
  - id: "1.7"
    description: "Email: split thread — solo ultimo messaggio"
    score_name: preprocess.email_thread
    file: email_thread.html
    op: preprocess
    input_content_type: email_html
    assertions:
      contains: "Sure, I'll handle the deploy"
      not_contains: "Let's plan the deploy"

  - id: "1.8"
    description: "Email: singolo messaggio senza thread"
    score_name: preprocess.email_single
    file: email_single.html
    op: preprocess
    input_content_type: email_html
    assertions:
      contains: "deploy is done"

  # The not_contains entries are CSS property names that must not appear in
  # the cleaned text.
  - id: "1.9"
    description: "Email: HTML pesante con table layout"
    score_name: preprocess.email_heavy_html
    file: email_heavy.html
    op: preprocess
    input_content_type: email_html
    assertions:
      no_html_tags: true
      min_length: 30
      not_contains:
        - "border-collapse"
        - "font-size"

  # The id must stay quoted: an unquoted 1.10 would be parsed as the float 1.1.
  - id: "1.10"
    description: "Fallback: file sconosciuto → testo restituito"
    score_name: preprocess.fallback
    file: fallback.txt
    op: preprocess
    input_content_type: unknown
    assertions:
      min_length: 1
      content_type: unknown