# api/tests/test_preprocessors.py

"""Tests for the preprocessor system (Step 1).

Test IDs map to the plan:
  1.1 detect_email, 1.2 detect_generic, 1.3 detect_text, 1.4 detect_unknown
  1.5 email_strip, 1.6 email_metadata, 1.7 email_thread, 1.8 email_single
  1.9 email_heavy_html, 1.10 fallback

Run:
    pytest tests/test_preprocessors.py -v

Langfuse scores are sent when LANGFUSE_SECRET_KEY / LANGFUSE_PUBLIC_KEY are set.
"""

from __future__ import annotations

import pytest

from app.core.preprocessors import detect_content_type, preprocess
from app.core.langfuse_client import get_langfuse

# ── Fixtures ──────────────────────────────────────────────────────────


@pytest.fixture
def sample_email_html() -> str:
    """Return a single-message email export as an HTML document.

    The head carries a ``<title>`` and a ``<style>`` block; the body holds
    Subject/From/To/Date paragraphs followed by one paragraph of message text.
    """
    email_document = """<!DOCTYPE html>
<html>
<head>
  <title>Fix the login bug</title>
  <style>body { font-family: Arial; color: #333; }</style>
</head>
<body>
  <p>Subject: Fix the login bug</p>
  <p>From: boss@company.com</p>
  <p>To: dev@company.com</p>
  <p>Date: Mon, 7 Apr 2026 09:00:00 +0200</p>
  <p>Please fix the login bug by Friday. It is blocking the release.</p>
</body>
</html>"""
    return email_document


@pytest.fixture
def sample_thread_email_html() -> str:
    """Return an HTML email thread with the newest reply on top.

    Below the latest message sit two nested ``<blockquote>`` levels, each
    introduced by an "On <date>, <sender> wrote:" attribution line.
    """
    thread_document = """<!DOCTYPE html>
<html><body>
<p>From: alice@co.com</p>
<p>Subject: Re: Re: Deploy plan</p>
<p>Sure, I'll handle the deploy.</p>

<p>On Mon, Apr 6, 2026 at 3:00 PM, Bob &lt;bob@co.com&gt; wrote:</p>
<blockquote>
<p>From: bob@co.com</p>
<p>Can you handle the deploy?</p>
<p>On Sun, Apr 5, 2026 at 1:00 PM, Alice &lt;alice@co.com&gt; wrote:</p>
<blockquote>
<p>From: alice@co.com</p>
<p>Let's plan the deploy for Monday.</p>
</blockquote>
</blockquote>
</body></html>"""
    return thread_document


@pytest.fixture
def sample_heavy_html_email() -> str:
    """Return a table-layout HTML email with a multi-rule ``<style>`` block.

    Headers (From/Subject/Date) and the message paragraphs all live inside
    table cells, with styled header and footer rows around them.
    """
    newsletter_document = """<!DOCTYPE html>
<html><head>
<style>
  table { border-collapse: collapse; width: 100%; }
  td { padding: 8px; border: 1px solid #ddd; font-size: 12px; }
  .header { background: #003366; color: white; }
  .footer { font-size: 10px; color: #999; }
</style>
</head><body>
<table>
  <tr class="header"><td colspan="2">Company Newsletter</td></tr>
  <tr><td>From:</td><td>newsletter@corp.com</td></tr>
  <tr><td>Subject:</td><td>Q1 Results Update</td></tr>
  <tr><td>Date:</td><td>Apr 7, 2026</td></tr>
  <tr><td colspan="2">
    <p>Dear Team,</p>
    <p>Q1 results are in. Revenue up 15% year-over-year.</p>
    <p>Please review the attached report.</p>
  </td></tr>
  <tr class="footer"><td colspan="2">Confidential — do not forward</td></tr>
</table>
</body></html>"""
    return newsletter_document


# ── Helper ────────────────────────────────────────────────────────────

def _score(name: str, value: float, comment: str = "") -> None:
    """Report one numeric evaluation score to Langfuse.

    A new trace named ``eval-<name>`` is created, the score is attached to
    it, and the client is flushed. Nothing happens when ``get_langfuse()``
    returns a falsy value.
    """
    client = get_langfuse()
    if not client:
        return

    eval_trace = client.trace(name=f"eval-{name}")
    client.score(
        trace_id=eval_trace.id,
        name=name,
        value=value,
        data_type="NUMERIC",
        comment=comment,
    )
    client.flush()


# ── 1.1 — Detect email HTML ───────────────────────────────────────────

def test_detect_email_html(sample_email_html):
    """1.1 — the sample email export must be detected as ``email_html``."""
    detected = detect_content_type("email_export.html", sample_email_html)
    is_email = detected == "email_html"
    _score("preprocess.detect_email", 1.0 if is_email else 0.0)
    assert is_email, f"Expected 'email_html', got '{detected}'"


# ── 1.2 — Detect generic HTML ─────────────────────────────────────────

def test_detect_generic_html():
    """1.2 — an ordinary web page must be detected as ``generic_html``."""
    web_page = """<!DOCTYPE html><html><head><title>My App</title></head>
<body><nav><a href="/">Home</a></nav><main><p>Welcome</p></main></body></html>"""
    detected = detect_content_type("index.html", web_page)
    is_generic = detected == "generic_html"
    _score("preprocess.detect_generic", 1.0 if is_generic else 0.0)
    assert is_generic, f"Expected 'generic_html', got '{detected}'"


# ── 1.3 — Detect plain text ───────────────────────────────────────────

def test_detect_plain_text():
    """1.3 — markup-free text in a ``.txt`` file must be ``plain_text``."""
    notes = "Just some notes here.\nNo HTML at all."
    detected = detect_content_type("notes.txt", notes)
    is_plain = detected == "plain_text"
    _score("preprocess.detect_text", 1.0 if is_plain else 0.0)
    assert is_plain, f"Expected 'plain_text', got '{detected}'"


# ── 1.4 — Detect unknown ──────────────────────────────────────────────

def test_detect_unknown():
    """1.4 — binary-looking content must be detected as ``unknown``."""
    # Repeated chunk mixing printable text with NUL/control characters.
    chunk = "some\x00\x01\x02\x03\x04\x05content"
    payload = chunk * 20
    detected = detect_content_type("archive.xyz", payload)
    is_unknown = detected == "unknown"
    _score("preprocess.detect_unknown", 1.0 if is_unknown else 0.0)
    assert is_unknown, f"Expected 'unknown', got '{detected}'"


# ── 1.5 — Email: strip HTML tags ─────────────────────────────────────

def test_email_strip_html(sample_email_html):
    """1.5 — preprocessing an email must strip the HTML markup.

    Three conditions feed the Langfuse score: no ``<`` left in the clean
    text, more than 50 characters of content, and a clean/raw length ratio
    below 0.8. All three are asserted so that a reported score of 1.0 and a
    passing test always mean the same thing.
    """
    result = preprocess("email_html", sample_email_html)
    clean_text = result.clean_text

    has_no_tags = "<" not in clean_text
    has_content = len(clean_text) > 50
    ratio = len(clean_text) / len(sample_email_html)
    is_reduced = ratio < 0.8

    score = 1.0 if (has_no_tags and has_content and is_reduced) else 0.0
    _score("preprocess.email_strip", score, f"ratio={ratio:.2f}, len={len(clean_text)}")

    assert has_no_tags, "clean_text still contains HTML tags"
    assert has_content, "clean_text is too short"
    # Previously only scored, never asserted: the test could pass with score 0.0.
    assert is_reduced, f"clean_text is not sufficiently smaller than the raw HTML (ratio={ratio:.2f})"


# ── 1.6 — Email: extract metadata ────────────────────────────────────

def test_email_extract_metadata(sample_email_html):
    """1.6 — preprocessing an email must populate ``subject`` and ``from``.

    The score is 1.0 when both metadata fields are truthy, 0.5 when exactly
    one is, and 0.0 when neither is.
    """
    result = preprocess("email_html", sample_email_html)
    metadata = result.metadata
    subject = metadata.get("subject")
    sender = metadata.get("from")

    has_subject = bool(subject)
    has_from = bool(sender)

    if has_subject and has_from:
        score = 1.0
    elif has_subject or has_from:
        score = 0.5
    else:
        score = 0.0

    _score("preprocess.email_metadata", score, f"subject={subject}, from={sender}")
    assert has_subject, f"metadata missing 'subject'. Got: {metadata}"
    assert has_from, f"metadata missing 'from'. Got: {metadata}"


# ── 1.7 — Email: split thread ─────────────────────────────────────────

def test_email_split_thread(sample_thread_email_html):
    """1.7 — only the latest message of a thread may reach ``clean_text``.

    The fixture's newest reply is "Sure, I'll handle the deploy."; beneath it
    are two quoted messages (one from Bob, one from Alice). Both quoted
    bodies are checked, so a splitter that drops only the innermost quote
    no longer passes.
    """
    result = preprocess("email_html", sample_thread_email_html)
    clean_text = result.clean_text

    # One snippet per quoted level of the fixture (Bob's, then Alice's).
    quoted_snippets = ("Can you handle the deploy", "Let's plan the deploy")
    leaked = [snippet for snippet in quoted_snippets if snippet in clean_text]

    has_latest = "Sure, I'll handle the deploy" in clean_text
    lacks_quoted = not leaked

    score = 1.0 if (has_latest and lacks_quoted) else 0.5 if has_latest else 0.0
    _score("preprocess.email_thread", score,
           f"has_latest={has_latest}, lacks_quoted={lacks_quoted}")
    assert has_latest, "Latest message not found in clean_text"
    assert lacks_quoted, f"Quoted older message leaked into clean_text: {leaked}"


# ── 1.8 — Email: single message (no thread) ──────────────────────────

def test_email_single_message():
    """1.8 — an email without quoted replies must keep its body text."""
    message_html = """<!DOCTYPE html><html><body>
<p>From: alice@co.com</p>
<p>Subject: Quick update</p>
<p>The deploy is done. Everything looks good.</p>
</body></html>"""
    processed = preprocess("email_html", message_html)
    body_found = "deploy is done" in processed.clean_text
    _score("preprocess.email_single", 1.0 if body_found else 0.0)
    assert body_found, "Body of single message not found in clean_text"


# ── 1.9 — Email: heavy HTML (table layout) ───────────────────────────

def test_email_heavy_html(sample_heavy_html_email):
    """1.9 — a table-layout email must be reduced to tag-free, CSS-free text.

    Checks that no ``<`` remains, that more than 30 characters of text
    survive, and that the CSS property names used in the fixture's
    ``<style>`` block do not show up in the clean text.
    """
    result = preprocess("email_html", sample_heavy_html_email)
    clean_text = result.clean_text

    has_no_tags = "<" not in clean_text
    has_content = len(clean_text) > 30
    # CSS properties should not appear in clean text
    no_css = "border-collapse" not in clean_text and "font-size" not in clean_text

    score = 1.0 if (has_no_tags and has_content and no_css) else 0.0
    _score("preprocess.email_heavy_html", score,
           f"no_tags={has_no_tags}, has_content={has_content}, no_css={no_css}")
    assert has_no_tags, "HTML tags found in clean_text"
    # The threshold is 30 characters, so "too short" (not "empty") is the accurate message.
    assert has_content, "clean_text is too short"
    assert no_css, "CSS properties leaked into clean_text"


# ── 1.10 — Fallback: unknown file type ───────────────────────────────

def test_fallback_unknown_content():
    """1.10 — the ``unknown`` handler must still return non-empty text.

    Also verifies that the result reports ``unknown`` as its content type.
    """
    raw = "random text content without any structure\nline two\nline three"
    processed = preprocess("unknown", raw)
    text_length = len(processed.clean_text)
    has_text = text_length > 0
    _score("preprocess.fallback", 1.0 if has_text else 0.0)
    assert has_text, "fallback handler returned empty clean_text"
    assert processed.content_type == "unknown"