Files
api/tests/fixtures/preprocessors/cases.yaml
Roberto Musso bf445ac2ce refactor(tests): YAML-driven fixtures for preprocessor tests
- cases.yaml: 10 test cases con schema dichiarativo (op, assertions)
- data/: 7 file reali (email_action.html, email_thread.html, email_single.html,
  email_heavy.html, generic_page.html, notes.txt, fallback.txt)
- test_preprocessors.py: parametrize da YAML via test_detect / test_preprocess;
  assertion engine generico (no_html_tags, min_length, compression_ratio,
  metadata_keys, contains, not_contains, content_type)
- requirements.txt: add PyYAML

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 10:44:41 +02:00

128 lines
3.8 KiB
YAML

# Preprocessor test cases — Step 1 (Local Agent V2)
#
# Schema for each case:
#   id: "1.N"
#   description: str
#   score_name: str            # score name sent to Langfuse
#
# Content source (exactly one of the two):
#   file: <file name in data/>   # read as UTF-8 text
#   generate: binary_noise       # content generated by the runner (for binary tests)
#
# For op=detect:
#   op: detect
#   input_filename: str          # filename passed to detect_content_type
#   expected_content_type: str
#
# For op=preprocess:
#   op: preprocess
#   input_content_type: str      # content_type passed to preprocess()
#   assertions:
#     no_html_tags: bool
#     min_length: int
#     compression_ratio_lt: float      # len(clean) / len(raw) < threshold
#     metadata_keys: [str, ...]        # keys that must be present in metadata
#     contains: str | [str, ...]       # substring(s) that must appear in clean_text
#     not_contains: str | [str, ...]   # substring(s) that must be absent from clean_text
#     content_type: str                # expected value of result.content_type
# List of preprocessor test cases. Each entry is one mapping that follows the
# schema in the header comment: common fields, a content source (`file` or
# `generate`), and the fields specific to its `op`.
cases:
  # ── Detection tests ────────────────────────────────────────────────
  # Each case passes `input_filename` to the detector and expects
  # `expected_content_type` back.
  - id: "1.1"
    description: "Detect email HTML"
    score_name: preprocess.detect_email
    file: email_action.html
    op: detect
    input_filename: email_export.html
    expected_content_type: email_html

  - id: "1.2"
    description: "Detect generic HTML"
    score_name: preprocess.detect_generic
    file: generic_page.html
    op: detect
    input_filename: index.html
    expected_content_type: generic_html

  - id: "1.3"
    description: "Detect plain text"
    score_name: preprocess.detect_text
    file: notes.txt
    op: detect
    input_filename: notes.txt
    expected_content_type: plain_text

  # Uses runner-generated content (`generate`) instead of a file in data/.
  - id: "1.4"
    description: "Detect unknown (binary-like content)"
    score_name: preprocess.detect_unknown
    generate: binary_noise
    op: detect
    input_filename: archive.xyz
    expected_content_type: unknown

  # ── Preprocess tests ───────────────────────────────────────────────
  # Each case preprocesses the content as `input_content_type` and checks
  # every entry under `assertions`.
  - id: "1.5"
    description: "Email: strip HTML tags"
    score_name: preprocess.email_strip
    file: email_action.html
    op: preprocess
    input_content_type: email_html
    assertions:
      no_html_tags: true
      min_length: 50
      compression_ratio_lt: 0.8

  - id: "1.6"
    description: "Email: extract metadata (Subject + From)"
    score_name: preprocess.email_metadata
    file: email_action.html
    op: preprocess
    input_content_type: email_html
    assertions:
      metadata_keys: [subject, from]

  # Thread splitting: the `contains` text must be kept and the
  # `not_contains` text must be dropped.
  - id: "1.7"
    description: "Email: split thread — solo ultimo messaggio"
    score_name: preprocess.email_thread
    file: email_thread.html
    op: preprocess
    input_content_type: email_html
    assertions:
      contains: "Sure, I'll handle the deploy"
      not_contains: "Let's plan the deploy"

  - id: "1.8"
    description: "Email: singolo messaggio senza thread"
    score_name: preprocess.email_single
    file: email_single.html
    op: preprocess
    input_content_type: email_html
    assertions:
      contains: "deploy is done"

  # The `not_contains` entries are CSS property names that must not appear
  # in the cleaned text.
  - id: "1.9"
    description: "Email: HTML pesante con table layout"
    score_name: preprocess.email_heavy_html
    file: email_heavy.html
    op: preprocess
    input_content_type: email_html
    assertions:
      no_html_tags: true
      min_length: 30
      not_contains:
        - "border-collapse"
        - "font-size"

  # Fallback path: unknown content must still produce non-empty text and
  # keep the `unknown` content type.
  - id: "1.10"
    description: "Fallback: file sconosciuto → testo restituito"
    score_name: preprocess.fallback
    file: fallback.txt
    op: preprocess
    input_content_type: unknown
    assertions:
      min_length: 1
      content_type: unknown