---
# Preprocessor test cases — Step 1 (Local Agent V2)
#
# Schema per case:
#   id: "1.N"
#   description: str
#   score_name: str                   # score name sent to Langfuse
#
# Content source (exactly one of the two):
#   file: str                         # read as UTF-8 text
#   generate: binary_noise            # content generated by the runner (for binary tests)
#
# For op=detect:
#   op: detect
#   input_filename: str               # filename passed to detect_content_type
#   expected_content_type: str
#
# For op=preprocess:
#   op: preprocess
#   input_content_type: str           # content_type passed to preprocess()
#   assertions:
#     no_html_tags: bool
#     min_length: int
#     compression_ratio_lt: float     # len(clean) / len(raw) < threshold
#     metadata_keys: [str, ...]       # keys that must be present in metadata
#     contains: str | [str, ...]      # substring(s) present in clean_text
#     not_contains: str | [str, ...]  # substring(s) absent from clean_text
#     content_type: str               # expected value of result.content_type

cases:
  # ── Detection tests ────────────────────────────────────────────────

  - id: "1.1"
    description: "Detect email HTML"
    score_name: preprocess.detect_email
    file: email_action.html
    op: detect
    input_filename: email_export.html
    expected_content_type: email_html

  - id: "1.2"
    description: "Detect generic HTML"
    score_name: preprocess.detect_generic
    file: generic_page.html
    op: detect
    input_filename: index.html
    expected_content_type: generic_html

  - id: "1.3"
    description: "Detect plain text"
    score_name: preprocess.detect_text
    file: notes.txt
    op: detect
    input_filename: notes.txt
    expected_content_type: plain_text

  # No fixture file here: the runner generates the binary-like content itself.
  - id: "1.4"
    description: "Detect unknown (binary-like content)"
    score_name: preprocess.detect_unknown
    generate: binary_noise
    op: detect
    input_filename: archive.xyz
    expected_content_type: unknown

  # ── Preprocess tests ───────────────────────────────────────────────

  - id: "1.5"
    description: "Email: strip HTML tags"
    score_name: preprocess.email_strip
    file: email_action.html
    op: preprocess
    input_content_type: email_html
    assertions:
      no_html_tags: true
      min_length: 50
      compression_ratio_lt: 0.8

  - id: "1.6"
    description: "Email: extract metadata (Subject + From)"
    score_name: preprocess.email_metadata
    file: email_action.html
    op: preprocess
    input_content_type: email_html
    assertions:
      metadata_keys: [subject, from]

  # Only the latest message of the thread should remain in clean_text.
  - id: "1.7"
    description: "Email: split thread — solo ultimo messaggio"
    score_name: preprocess.email_thread
    file: email_thread.html
    op: preprocess
    input_content_type: email_html
    assertions:
      contains: "Sure, I'll handle the deploy"
      not_contains: "Let's plan the deploy"

  - id: "1.8"
    description: "Email: singolo messaggio senza thread"
    score_name: preprocess.email_single
    file: email_single.html
    op: preprocess
    input_content_type: email_html
    assertions:
      contains: "deploy is done"

  # CSS property names must not leak into the cleaned text.
  - id: "1.9"
    description: "Email: HTML pesante con table layout"
    score_name: preprocess.email_heavy_html
    file: email_heavy.html
    op: preprocess
    input_content_type: email_html
    assertions:
      no_html_tags: true
      min_length: 30
      not_contains:
        - "border-collapse"
        - "font-size"

  - id: "1.10"
    description: "Fallback: file sconosciuto → testo restituito"
    score_name: preprocess.fallback
    file: fallback.txt
    op: preprocess
    input_content_type: unknown
    assertions:
      min_length: 1
      content_type: unknown