"""
ct = detect_content_type("index.html", generic)
score = 1.0 if ct == "generic_html" else 0.0
_score("preprocess.detect_generic", score)
assert ct == "generic_html", f"Expected 'generic_html', got '{ct}'"
# ── 1.3 — Detect plain text ───────────────────────────────────────────
def test_detect_plain_text():
    """A ``.txt`` file name with tag-free text is detected as ``plain_text``.

    ``_score`` is called with 1.0 on a match and 0.0 otherwise *before*
    the assertion, so it runs even when the assertion fails.
    """
    body = "Just some notes here.\nNo HTML at all."
    ct = detect_content_type("notes.txt", body)

    if ct == "plain_text":
        score = 1.0
    else:
        score = 0.0
    _score("preprocess.detect_text", score)

    assert ct == "plain_text", f"Expected 'plain_text', got '{ct}'"
# ── 1.4 — Detect unknown ──────────────────────────────────────────────
def test_detect_unknown():
    """Control-character-laden content under an ``.xyz`` name is ``unknown``.

    ``_score`` receives 1.0 when the detected type is ``"unknown"`` and
    0.0 otherwise; the assertion follows the scoring call.
    """
    # NUL and other non-printable characters make the payload look binary.
    payload = "some\x00\x01\x02\x03\x04\x05content" * 20
    ct = detect_content_type("archive.xyz", payload)

    score = 0.0
    if ct == "unknown":
        score = 1.0
    _score("preprocess.detect_unknown", score)

    assert ct == "unknown", f"Expected 'unknown', got '{ct}'"
# ── 1.5 — Email: strip HTML tags ─────────────────────────────────────
def test_email_strip_html(sample_email_html):
    """Preprocessing an HTML email yields tag-free text of useful length.

    Three signals are computed from ``result.clean_text``:

    * no ``<`` character remains,
    * the text is longer than 50 characters,
    * the text is shorter than 80% of the raw input.

    All three feed the score passed to ``_score``; only the first two are
    asserted.
    """
    result = preprocess("email_html", sample_email_html)
    cleaned = result.clean_text

    has_no_tags = "<" not in cleaned
    has_content = len(cleaned) > 50
    ratio = len(cleaned) / len(sample_email_html)

    score = 0.0
    if has_no_tags and has_content and ratio < 0.8:
        score = 1.0
    detail = f"ratio={ratio:.2f}, len={len(result.clean_text)}"
    _score("preprocess.email_strip", score, detail)

    assert has_no_tags, "clean_text still contains HTML tags"
    assert has_content, "clean_text is too short"
# ── 1.6 — Email: extract metadata ────────────────────────────────────
def test_email_extract_metadata(sample_email_html):
    """Preprocessing an HTML email fills ``subject`` and ``from`` metadata.

    Score passed to ``_score``: 1.0 when both keys hold truthy values,
    0.5 when exactly one does, 0.0 when neither does. Both keys are then
    asserted individually.
    """
    result = preprocess("email_html", sample_email_html)
    metadata = result.metadata

    has_subject = bool(metadata.get("subject"))
    has_from = bool(metadata.get("from"))

    if has_subject and has_from:
        score = 1.0
    elif has_subject or has_from:
        score = 0.5  # partial credit: only one of the two fields was found
    else:
        score = 0.0

    detail = f"subject={result.metadata.get('subject')}, from={result.metadata.get('from')}"
    _score("preprocess.email_metadata", score, detail)

    assert has_subject, f"metadata missing 'subject'. Got: {result.metadata}"
    assert has_from, f"metadata missing 'from'. Got: {result.metadata}"
# ── 1.7 — Email: split thread ─────────────────────────────────────────
def test_email_split_thread(sample_thread_email_html):
    """Only the newest message of a threaded email survives preprocessing.

    Score passed to ``_score``: 1.0 when the latest message is present and
    the quoted text is absent, 0.5 when the latest message is present but
    quoted text leaked through, 0.0 when the latest message is missing.
    """
    result = preprocess("email_html", sample_thread_email_html)
    cleaned = result.clean_text

    # The newest message in the fixture reads "Sure, I'll handle the deploy."
    has_latest = "Sure, I'll handle the deploy" in cleaned
    # Text quoted from the older messages must not show up in clean_text.
    lacks_quoted = "Let's plan the deploy" not in cleaned

    if not has_latest:
        score = 0.0
    elif lacks_quoted:
        score = 1.0
    else:
        score = 0.5

    detail = f"has_latest={has_latest}, lacks_quoted={lacks_quoted}"
    _score("preprocess.email_thread", score, detail)

    assert has_latest, "Latest message not found in clean_text"
    assert lacks_quoted, "Quoted older message leaked into clean_text"
# ── 1.8 — Email: single message (no thread) ──────────────────────────
def test_email_single_message():
    """A lone message (no quoted thread) keeps its body after preprocessing.

    The input is header-style ``From:``/``Subject:`` lines followed by one
    body line, submitted under the ``"email_html"`` content type. The body
    phrase must appear in ``result.clean_text``.
    """
    # The literal's lines sit at column 0 so that no indentation ends up
    # inside the message text.
    single = """
From: alice@co.com
Subject: Quick update
The deploy is done. Everything looks good.
"""
    result = preprocess("email_html", single)

    has_body = "deploy is done" in result.clean_text
    _score("preprocess.email_single", float(has_body))

    assert has_body, "Body of single message not found in clean_text"
# ── 1.9 — Email: heavy HTML (table layout) ───────────────────────────
def test_email_heavy_html(sample_heavy_html_email):
    """Markup-heavy email is reduced to text without tags or CSS remnants.

    Checks on ``result.clean_text``: no ``<`` character, more than 30
    characters long, and neither ``border-collapse`` nor ``font-size``
    present. The score passed to ``_score`` is 1.0 only when all three
    hold; each check is also asserted on its own.
    """
    result = preprocess("email_html", sample_heavy_html_email)
    cleaned = result.clean_text

    has_no_tags = "<" not in cleaned
    has_content = len(cleaned) > 30
    # Styling property names must not survive into the cleaned text.
    css_markers = ("border-collapse", "font-size")
    no_css = not any(marker in cleaned for marker in css_markers)

    if has_no_tags and has_content and no_css:
        score = 1.0
    else:
        score = 0.0

    detail = f"no_tags={has_no_tags}, has_content={has_content}, no_css={no_css}"
    _score("preprocess.email_heavy_html", score, detail)

    assert has_no_tags, "HTML tags found in clean_text"
    assert has_content, "clean_text is empty"
    assert no_css, "CSS properties leaked into clean_text"
# ── 1.10 — Fallback: unknown file type ───────────────────────────────
def test_fallback_unknown_content():
    """Preprocessing under the ``"unknown"`` type still returns some text.

    Unstructured three-line text is passed with content type ``"unknown"``.
    The result must carry non-empty ``clean_text`` and report
    ``content_type == "unknown"``. ``_score`` is called before either
    assertion.
    """
    lines = (
        "random text content without any structure",
        "line two",
        "line three",
    )
    raw = "\n".join(lines)
    result = preprocess("unknown", raw)

    has_text = len(result.clean_text) > 0
    _score("preprocess.fallback", float(has_text))

    assert has_text, "fallback handler returned empty clean_text"
    assert result.content_type == "unknown"