# Preprocessor test cases
#
# Each list item is one test case and uses exactly one of these actions:
#   detect:  → calls detect_content_type(filename, content)
#   process: → calls preprocess(content_type, content)
#
# Input source: file: or generate: binary_noise
#
# Flat assertions (only for process):
#   no_html: true            clean_text contains no HTML tags
#   min_chars: N             len(clean_text) >= N
#   ratio_lt: F              len(clean) / len(raw) < F
#   has_meta: [k, ...]       keys present in metadata
#   contains: str | [str]    substring(s) present in clean_text
#   excludes: str | [str]    substring(s) absent from clean_text
#   content_type: str        result.content_type == this value

# --- Detection cases (detect:) ---

- id: "1.1"
  file: email_action.html
  detect: email_html

- id: "1.2"
  file: generic_page.html
  detect: generic_html

- id: "1.3"
  file: notes.txt
  detect: plain_text

# NOTE(review): this case sets both file: and generate:, although the header
# lists them as alternatives — presumably file: supplies only the filename
# and generate: supplies the content; confirm against the test runner.
- id: "1.4"
  file: archive.xyz
  generate: binary_noise
  detect: unknown

# --- Processing cases (process:) ---

- id: "1.5"
  file: email_action.html
  process: email_html
  no_html: true
  min_chars: 50
  ratio_lt: 0.8

- id: "1.6"
  file: email_action.html
  process: email_html
  has_meta: [subject, from]

- id: "1.7"
  file: email_thread.html
  process: email_html
  contains: "Sure, I'll handle the deploy"
  excludes: "Let's plan the deploy"

- id: "1.8"
  file: email_single.html
  process: email_html
  contains: "deploy is done"

- id: "1.9"
  file: email_heavy.html
  process: email_html
  no_html: true
  min_chars: 30
  excludes: [border-collapse, font-size]

- id: "1.10"
  file: fallback.txt
  process: unknown
  min_chars: 1
  content_type: unknown