"""Tests for the preprocessor system (Step 1).

Test IDs map to the plan:
    1.1 detect_email, 1.2 detect_generic, 1.3 detect_text, 1.4 detect_unknown
    1.5 email_strip, 1.6 email_metadata, 1.7 email_thread, 1.8 email_single
    1.9 email_heavy_html, 1.10 fallback

Run:
    pytest tests/test_preprocessors.py -v

Langfuse scores are sent when LANGFUSE_SECRET_KEY / LANGFUSE_PUBLIC_KEY are set.
"""

from __future__ import annotations

import pytest

from app.core.preprocessors import detect_content_type, preprocess
from app.core.langfuse_client import get_langfuse


# ── Fixtures ──────────────────────────────────────────────────────────
# NOTE(review): as they appear in this file, the "HTML" fixture strings below
# contain no markup (no tags, no CSS). Confirm against the canonical copy that
# the tags were not lost in an export step; the tag/CSS checks in 1.5 and 1.9
# are trivially satisfied by the input otherwise.


@pytest.fixture
def sample_email_html() -> str:
    """Return a single email with Subject/From/To/Date headers and a short body."""
    return """ Fix the login bug

Subject: Fix the login bug

From: boss@company.com

To: dev@company.com

Date: Mon, 7 Apr 2026 09:00:00 +0200

Please fix the login bug by Friday. It is blocking the release.

"""


@pytest.fixture
def sample_thread_email_html() -> str:
    """Return a three-message thread: the newest reply followed by two quoted messages."""
    return """

From: alice@co.com

Subject: Re: Re: Deploy plan

Sure, I'll handle the deploy.

On Mon, Apr 6, 2026 at 3:00 PM, Bob <bob@co.com> wrote:

From: bob@co.com

Can you handle the deploy?

On Sun, Apr 5, 2026 at 1:00 PM, Alice <alice@co.com> wrote:

From: alice@co.com

Let's plan the deploy for Monday.

"""


@pytest.fixture
def sample_heavy_html_email() -> str:
    """Return a newsletter-style email used by the heavy-HTML test (1.9)."""
    return """
Company Newsletter
From:newsletter@corp.com
Subject:Q1 Results Update
Date:Apr 7, 2026

Dear Team,

Q1 results are in. Revenue up 15% year-over-year.

Please review the attached report.

"""


# ── Helper ────────────────────────────────────────────────────────────


def _score(name: str, value: float, comment: str = "") -> None:
    """Report one numeric evaluation score to Langfuse.

    Does nothing when ``get_langfuse()`` returns a falsy client.

    Args:
        name: Score name; the trace is created as ``eval-<name>``.
        value: Numeric score value.
        comment: Optional free-text note attached to the score.
    """
    lf = get_langfuse()
    if not lf:
        return
    trace = lf.trace(name=f"eval-{name}")
    lf.score(
        trace_id=trace.id,
        name=name,
        value=value,
        data_type="NUMERIC",
        comment=comment,
    )
    lf.flush()


# ── 1.1 — Detect email HTML ───────────────────────────────────────────


def test_detect_email_html(sample_email_html):
    """An email export with an .html filename is classified as 'email_html'."""
    ct = detect_content_type("email_export.html", sample_email_html)
    score = 1.0 if ct == "email_html" else 0.0
    _score("preprocess.detect_email", score)
    assert ct == "email_html", f"Expected 'email_html', got '{ct}'"


# ── 1.2 — Detect generic HTML ─────────────────────────────────────────


def test_detect_generic_html():
    """A non-email page named index.html is classified as 'generic_html'."""
    generic = """My App

Welcome

"""
    ct = detect_content_type("index.html", generic)
    score = 1.0 if ct == "generic_html" else 0.0
    _score("preprocess.detect_generic", score)
    assert ct == "generic_html", f"Expected 'generic_html', got '{ct}'"


# ── 1.3 — Detect plain text ───────────────────────────────────────────


def test_detect_plain_text():
    """A .txt file without markup is classified as 'plain_text'."""
    ct = detect_content_type("notes.txt", "Just some notes here.\nNo HTML at all.")
    score = 1.0 if ct == "plain_text" else 0.0
    _score("preprocess.detect_text", score)
    assert ct == "plain_text", f"Expected 'plain_text', got '{ct}'"


# ── 1.4 — Detect unknown ──────────────────────────────────────────────


def test_detect_unknown():
    """Binary-like content with an unrecognised extension is classified as 'unknown'."""
    # Simulate binary-like content with non-printable chars
    binary_like = "some\x00\x01\x02\x03\x04\x05content" * 20
    ct = detect_content_type("archive.xyz", binary_like)
    score = 1.0 if ct == "unknown" else 0.0
    _score("preprocess.detect_unknown", score)
    assert ct == "unknown", f"Expected 'unknown', got '{ct}'"


# ── 1.5 — Email: strip HTML tags ─────────────────────────────────────


def test_email_strip_html(sample_email_html):
    """clean_text contains no '<' and is longer than 50 characters."""
    result = preprocess("email_html", sample_email_html)
    has_no_tags = "<" not in result.clean_text
    has_content = len(result.clean_text) > 50
    # The size ratio only influences the reported score; it is not asserted.
    ratio = len(result.clean_text) / len(sample_email_html)
    score = 1.0 if (has_no_tags and has_content and ratio < 0.8) else 0.0
    _score(
        "preprocess.email_strip",
        score,
        f"ratio={ratio:.2f}, len={len(result.clean_text)}",
    )
    assert has_no_tags, "clean_text still contains HTML tags"
    assert has_content, "clean_text is too short"


# ── 1.6 — Email: extract metadata ────────────────────────────────────


def test_email_extract_metadata(sample_email_html):
    """metadata exposes non-empty 'subject' and 'from' entries."""
    result = preprocess("email_html", sample_email_html)
    has_subject = bool(result.metadata.get("subject"))
    has_from = bool(result.metadata.get("from"))
    # Full credit for both fields, half credit for exactly one of them.
    if has_subject and has_from:
        score = 1.0
    elif has_subject or has_from:
        score = 0.5
    else:
        score = 0.0
    # Kept on one logical line: a raw line break inside a single-quoted
    # f-string is a syntax error.
    _score(
        "preprocess.email_metadata",
        score,
        f"subject={result.metadata.get('subject')}, from={result.metadata.get('from')}",
    )
    assert has_subject, f"metadata missing 'subject'. Got: {result.metadata}"
    assert has_from, f"metadata missing 'from'. Got: {result.metadata}"


# ── 1.7 — Email: split thread ─────────────────────────────────────────


def test_email_split_thread(sample_thread_email_html):
    """Only the newest message of a thread survives in clean_text."""
    result = preprocess("email_html", sample_thread_email_html)
    # The latest message is "Sure, I'll handle the deploy."
    # Quoted content from Bob/Alice should not appear in clean_text
    has_latest = "Sure, I'll handle the deploy" in result.clean_text
    lacks_quoted = "Let's plan the deploy" not in result.clean_text
    # Half credit when the latest message is present but quoted text leaked.
    if has_latest and lacks_quoted:
        score = 1.0
    elif has_latest:
        score = 0.5
    else:
        score = 0.0
    _score(
        "preprocess.email_thread",
        score,
        f"has_latest={has_latest}, lacks_quoted={lacks_quoted}",
    )
    assert has_latest, "Latest message not found in clean_text"
    assert lacks_quoted, "Quoted older message leaked into clean_text"


# ── 1.8 — Email: single message (no thread) ──────────────────────────


def test_email_single_message():
    """The body of a single, unthreaded message is kept in clean_text."""
    single = """

From: alice@co.com

Subject: Quick update

The deploy is done. Everything looks good.

"""
    result = preprocess("email_html", single)
    has_body = "deploy is done" in result.clean_text
    score = 1.0 if has_body else 0.0
    _score("preprocess.email_single", score)
    assert has_body, "Body of single message not found in clean_text"


# ── 1.9 — Email: heavy HTML (table layout) ───────────────────────────


def test_email_heavy_html(sample_heavy_html_email):
    """clean_text has no '<', no CSS property names, and more than 30 characters."""
    result = preprocess("email_html", sample_heavy_html_email)
    has_no_tags = "<" not in result.clean_text
    has_content = len(result.clean_text) > 30
    # CSS properties should not appear in clean text
    no_css = (
        "border-collapse" not in result.clean_text
        and "font-size" not in result.clean_text
    )
    score = 1.0 if (has_no_tags and has_content and no_css) else 0.0
    _score(
        "preprocess.email_heavy_html",
        score,
        f"no_tags={has_no_tags}, has_content={has_content}, no_css={no_css}",
    )
    assert has_no_tags, "HTML tags found in clean_text"
    assert has_content, "clean_text is empty"
    assert no_css, "CSS properties leaked into clean_text"


# ── 1.10 — Fallback: unknown file type ───────────────────────────────


def test_fallback_unknown_content():
    """The 'unknown' handler returns non-empty clean_text and keeps the content type."""
    raw = "random text content without any structure\nline two\nline three"
    result = preprocess("unknown", raw)
    has_text = len(result.clean_text) > 0
    score = 1.0 if has_text else 0.0
    _score("preprocess.fallback", score)
    assert has_text, "fallback handler returned empty clean_text"
    assert result.content_type == "unknown"