feat: add preprocessor system (Step 1 — Local Agent V2)
- app/core/preprocessors/__init__.py: detect_content_type + preprocess dispatcher - app/core/preprocessors/base.py: PreprocessResult dataclass - app/core/preprocessors/email_html.py: BeautifulSoup HTML stripping, metadata extraction, thread splitting - requirements.txt: add beautifulsoup4 and lxml - tests/test_preprocessors.py: 10 tests with Langfuse scoring (preprocess.* scores) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
104
app/core/preprocessors/__init__.py
Normal file
104
app/core/preprocessors/__init__.py
Normal file
@@ -0,0 +1,104 @@
|
|||||||
|
"""Preprocessor registry: detect content type and dispatch to handlers.
|
||||||
|
|
||||||
|
Public API
|
||||||
|
----------
|
||||||
|
detect_content_type(filename, raw_content) -> str
|
||||||
|
Heuristic detection based on file extension and content patterns.
|
||||||
|
|
||||||
|
preprocess(content_type, raw_content) -> PreprocessResult
|
||||||
|
Dispatch to the appropriate handler.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
from app.core.preprocessors.base import PreprocessResult
|
||||||
|
|
||||||
|
# ── Heuristics ────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
# Patterns that strongly suggest an email HTML file
|
||||||
|
_EMAIL_SIGNALS = re.compile(
    # The leading \b keeps header names from matching inside longer tokens:
    # without it "To:" fires on every ``mailto:`` link and "Date:" on
    # "Update:", which misclassifies ordinary web pages as emails.
    r"\b(Subject:|From:|To:|Date:|Sent:|MIME-Version:|Content-Type:\s*text/html)",
    re.IGNORECASE,
)

# Patterns that suggest a generic HTML page (not an email).
# NOTE: detect_content_type() does not need to consult this pattern, because
# every HTML-family file without email signals is "generic_html" anyway; the
# name is kept so existing importers keep working.
_GENERIC_HTML_SIGNALS = re.compile(
    r"<(nav|main|header|footer|article|section)\b",
    re.IGNORECASE,
)


def detect_content_type(filename: str, raw_content: str) -> str:
    """Return a content-type string for the given file.

    Supported types: ``"email_html"``, ``"generic_html"``,
    ``"plain_text"``, ``"unknown"``.

    Parameters
    ----------
    filename:
        File name or path. Only the extension of the final path component
        is used, so dots in directory names are ignored.
    raw_content:
        Decoded file content. Only the first 4096 characters are inspected.

    Returns
    -------
    str
        * ``"plain_text"``   -- ``.txt`` files, regardless of content.
        * ``"email_html"``   -- HTML-family or extension-less files whose
          head contains an email header signal.
        * ``"generic_html"`` -- any other HTML-family file.
        * ``"unknown"``      -- everything else, including binary-looking
          content.
    """
    # Look at the last path component only: "exports.2026/mail" has no
    # extension, it is not a ".2026/mail" file.
    basename = filename.replace("\\", "/").rsplit("/", 1)[-1]
    ext = basename.rsplit(".", 1)[-1].lower() if "." in basename else ""

    if ext == "txt":
        return "plain_text"

    if ext in ("html", "htm", "eml", "mhtml", "mht"):
        # Prefer email detection over generic HTML. A separate
        # "^(From|To|Subject|Date):" re-check is unnecessary: any such line
        # is already matched by _EMAIL_SIGNALS.
        if _EMAIL_SIGNALS.search(raw_content[:4096]):
            return "email_html"
        return "generic_html"

    # Extension-less files that carry email headers.
    if not ext and _EMAIL_SIGNALS.search(raw_content[:4096]):
        return "email_html"

    # Unrecognised extension: binary-looking and textual content are both
    # reported as "unknown". (The earlier control-character heuristic
    # returned "unknown" on every path, so it had no effect on the result.)
    return "unknown"
|
||||||
|
|
||||||
|
|
||||||
|
# ── Generic fallback handler ──────────────────────────────────────────
|
||||||
|
|
||||||
|
def _preprocess_generic(raw_content: str, content_type: str) -> PreprocessResult:
    """Fallback handler: strip markup and return the remaining text.

    Parameters
    ----------
    raw_content:
        Raw file content; may or may not contain HTML.
    content_type:
        Label copied unchanged into the result.

    Returns
    -------
    PreprocessResult
        ``clean_text`` holds the text without tags and without the contents
        of ``<script>``/``<style>``/``<noscript>`` elements; runs of three
        or more newlines are collapsed to one blank line. ``metadata`` is
        empty.
    """
    try:
        from bs4 import BeautifulSoup
    except ImportError:
        # No BeautifulSoup -- approximate it with regexes.
        import html

        # Drop script/style blocks including their contents, so JS and CSS
        # do not end up in the text, then strip the remaining tags.
        text = re.sub(
            r"(?is)<(script|style|noscript)\b[^>]*>.*?</\1\s*>", "", raw_content
        )
        text = re.sub(r"<[^>]+>", "", text)
        # BeautifulSoup decodes entities; keep the fallback consistent.
        text = html.unescape(text)
    else:
        soup = BeautifulSoup(raw_content, "html.parser")
        # Same noise removal as the email handler (minus <head>).
        for tag in soup(["style", "script", "noscript"]):
            tag.decompose()
        text = soup.get_text(separator="\n")

    text = re.sub(r"\n{3,}", "\n\n", text).strip()
    return PreprocessResult(content_type=content_type, clean_text=text, metadata={})
|
||||||
|
|
||||||
|
|
||||||
|
# ── Dispatch ──────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def preprocess(content_type: str, raw_content: str) -> PreprocessResult:
    """Run the handler that matches *content_type* on *raw_content*.

    ``"email_html"`` goes to the dedicated email handler; every other
    content type (including unknown ones) is handled by the generic
    fallback, which keeps *content_type* as the result label.
    """
    if content_type != "email_html":
        return _preprocess_generic(raw_content, content_type)

    # Imported only when an email is actually dispatched.
    from app.core.preprocessors.email_html import preprocess_email_html as handle_email

    return handle_email(raw_content)
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = ["detect_content_type", "preprocess", "PreprocessResult"]
|
||||||
25
app/core/preprocessors/base.py
Normal file
25
app/core/preprocessors/base.py
Normal file
@@ -0,0 +1,25 @@
|
|||||||
|
"""Base types for the preprocessor system."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class PreprocessResult:
    """Output of a preprocessor handler.

    Attributes
    ----------
    content_type:
        The detected content type (e.g. ``"email_html"``, ``"plain_text"``).
    clean_text:
        Human-readable text stripped of markup/binary noise.
    metadata:
        Dict of extracted metadata (keys vary by handler).
        Common keys: ``subject``, ``from``, ``to``, ``date``, ``filename``.
        Defaults to a new empty dict for each instance.
    """

    content_type: str
    clean_text: str
    # default_factory gives every instance its own dict (no shared default).
    metadata: dict = field(default_factory=dict)
|
||||||
111
app/core/preprocessors/email_html.py
Normal file
111
app/core/preprocessors/email_html.py
Normal file
@@ -0,0 +1,111 @@
|
|||||||
|
"""Preprocessor for email HTML files.
|
||||||
|
|
||||||
|
Handles:
|
||||||
|
- HTML stripping via BeautifulSoup
|
||||||
|
- Metadata extraction (Subject, From, To, Date)
|
||||||
|
- Thread splitting — isolates the latest reply
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
from typing import TYPE_CHECKING
|
||||||
|
|
||||||
|
from app.core.preprocessors.base import PreprocessResult
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# ── Thread split markers ──────────────────────────────────────────────
|
||||||
|
|
||||||
|
# Matches patterns like:
|
||||||
|
# "On Mon, Apr 7, 2026 at 10:00 AM, Alice <alice@co.com> wrote:"
|
||||||
|
# "-----Original Message-----"
|
||||||
|
# "> " (plain-text quote prefix)
|
||||||
|
_THREAD_PATTERNS = [
    # "On <date>, <sender> wrote:" attribution line. After
    # ``get_text(separator="\n")`` the attribution is usually broken across
    # several lines (every inline element, e.g. the linked address, starts a
    # new text node), so the match may span line breaks. It is bounded to
    # 300 characters and may not run into another line that itself starts
    # with "On ", so a reply such as "On it!" above the attribution is kept.
    re.compile(
        r"^On\s(?:(?!\nOn\s).){1,300}?\bwrote\s*:",
        re.IGNORECASE | re.MULTILINE | re.DOTALL,
    ),
    # "-----Original Message-----" / "---- Forwarded message ----" separators.
    re.compile(r"^-{3,}\s*(original message|forwarded message)\s*-{3,}", re.IGNORECASE | re.MULTILINE),
    # Plain-text quote prefix: one or more ">" followed by quoted text.
    re.compile(r"^>{1,}\s+\S", re.MULTILINE),
    # "From: ..." header immediately followed by a "Sent: ..." line.
    re.compile(r"^From:\s+.+\nSent:\s+", re.IGNORECASE | re.MULTILINE),
]
|
||||||
|
|
||||||
|
# ── Metadata patterns (applied on raw HTML / plain fallback) ──────────
|
||||||
|
|
||||||
|
_META_PATTERNS: dict[str, list[re.Pattern]] = {
    # Patterns are tried in order; the first one that yields a value wins.
    # Header-name patterns carry a ``(?<![\w-])`` look-behind so that "To:"
    # does not match inside "Reply-To:" or "mailto:", nor "Date:" inside
    # "Update:".
    "subject": [
        re.compile(r"<title>(.+?)</title>", re.IGNORECASE | re.DOTALL),
        re.compile(r"(?<![\w-])Subject:\s*(.+)", re.IGNORECASE),
    ],
    "from": [
        re.compile(r'<meta[^>]+name=["\']?from["\']?[^>]+content=["\']([^"\']+)["\']', re.IGNORECASE),
        re.compile(r"(?<![\w-])From:\s*(.+)", re.IGNORECASE),
    ],
    "to": [
        re.compile(r'<meta[^>]+name=["\']?to["\']?[^>]+content=["\']([^"\']+)["\']', re.IGNORECASE),
        re.compile(r"(?<![\w-])To:\s*(.+)", re.IGNORECASE),
    ],
    "date": [
        re.compile(r'<meta[^>]+name=["\']?date["\']?[^>]+content=["\']([^"\']+)["\']', re.IGNORECASE),
        re.compile(r"(?<![\w-])Date:\s*(.+)", re.IGNORECASE),
        re.compile(r"(?<![\w-])Sent:\s*(.+)", re.IGNORECASE),
    ],
}


def _extract_metadata(raw_html: str, text: str) -> dict:
    """Extract Subject/From/To/Date from raw HTML or plain text.

    Parameters
    ----------
    raw_html:
        The unprocessed document; the source for ``<title>``/``<meta>``
        based patterns and the fallback for header patterns.
    text:
        Markup-free text of the same document.

    Returns
    -------
    dict
        Maps each key of ``_META_PATTERNS`` for which a non-empty value was
        found to that value, with surrounding/internal whitespace
        normalised. Keys without a match are absent.
    """
    import html

    metadata: dict[str, str] = {}
    for key, patterns in _META_PATTERNS.items():
        for pat in patterns:
            # Search the markup-free text first: matching "From: x</p>" in
            # the raw HTML would drag the trailing tags into the value.
            m = pat.search(text)
            if m:
                value = m.group(1)
            else:
                m = pat.search(raw_html)
                if not m:
                    continue
                # Value came from raw markup: drop tags, decode entities.
                value = html.unescape(re.sub(r"<[^>]+>", "", m.group(1)))
            value = " ".join(value.split())
            if value:
                metadata[key] = value
                break
    return metadata
|
||||||
|
|
||||||
|
|
||||||
|
def _split_thread(text: str) -> str:
    """Cut *text* at the first thread marker and return the part before it.

    Every pattern in ``_THREAD_PATTERNS`` is searched once; the smallest
    match offset is the cut point. If no pattern matches, or the earliest
    match sits at offset 0, the whole text is returned. The result is
    always stripped of surrounding whitespace.
    """
    hits = (pat.search(text) for pat in _THREAD_PATTERNS)
    cut = min((hit.start() for hit in hits if hit), default=None)

    # ``cut`` is None (no marker) or 0 (marker at the very start): keep all.
    if not cut:
        return text.strip()
    return text[:cut].strip()
|
||||||
|
|
||||||
|
|
||||||
|
def preprocess_email_html(raw_content: str) -> PreprocessResult:
    """Turn an email HTML document into a ``PreprocessResult``.

    Steps: parse the markup, drop ``style``/``script``/``head``/``noscript``
    elements, flatten to text, collapse runs of blank lines, extract
    metadata from the raw document and the full text, then keep only the
    part of the text before the first thread marker.

    Raises
    ------
    ImportError
        If ``beautifulsoup4`` is not installed.
    """
    try:
        from bs4 import BeautifulSoup  # lazy import -- optional dep
    except ImportError as exc:
        message = (
            "beautifulsoup4 is required for email_html preprocessing. "
            "Install it with: pip install beautifulsoup4"
        )
        raise ImportError(message) from exc

    # Try the lxml parser first; on any failure use the built-in parser.
    try:
        soup = BeautifulSoup(raw_content, "lxml")
    except Exception:
        soup = BeautifulSoup(raw_content, "html.parser")

    # Remove elements whose content is not message text.
    for noise in soup.find_all(["style", "script", "head", "noscript"]):
        noise.decompose()

    flattened = soup.get_text(separator="\n")
    full_text = re.sub(r"\n{3,}", "\n\n", flattened).strip()

    # Metadata is read from the whole message, before the thread is cut.
    extracted = _extract_metadata(raw_content, full_text)
    newest = _split_thread(full_text)

    return PreprocessResult(content_type="email_html", clean_text=newest, metadata=extracted)
|
||||||
@@ -33,4 +33,6 @@ google-auth-httplib2>=0.2.0
|
|||||||
msal>=1.28.0
|
msal>=1.28.0
|
||||||
cryptography>=42.0.0
|
cryptography>=42.0.0
|
||||||
langfuse>=2.0.0
|
langfuse>=2.0.0
|
||||||
|
beautifulsoup4>=4.12.0
|
||||||
|
lxml>=5.0.0
|
||||||
ruff>=0.8.0
|
ruff>=0.8.0
|
||||||
|
|||||||
221
tests/test_preprocessors.py
Normal file
221
tests/test_preprocessors.py
Normal file
@@ -0,0 +1,221 @@
|
|||||||
|
"""Tests for the preprocessor system (Step 1).
|
||||||
|
|
||||||
|
Test IDs map to the plan:
|
||||||
|
1.1 detect_email, 1.2 detect_generic, 1.3 detect_text, 1.4 detect_unknown
|
||||||
|
1.5 email_strip, 1.6 email_metadata, 1.7 email_thread, 1.8 email_single
|
||||||
|
1.9 email_heavy_html, 1.10 fallback
|
||||||
|
|
||||||
|
Run:
|
||||||
|
pytest tests/test_preprocessors.py -v
|
||||||
|
|
||||||
|
Langfuse scores are sent when LANGFUSE_SECRET_KEY / LANGFUSE_PUBLIC_KEY are set.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from app.core.preprocessors import detect_content_type, preprocess
|
||||||
|
from app.core.langfuse_client import get_langfuse
|
||||||
|
|
||||||
|
# ── Fixtures ──────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def sample_email_html() -> str:
    """Single-message email export with a <title>, a <style> block and
    Subject/From/To/Date lines rendered as paragraphs in the body."""
    return """<!DOCTYPE html>
<html>
<head>
<title>Fix the login bug</title>
<style>body { font-family: Arial; color: #333; }</style>
</head>
<body>
<p>Subject: Fix the login bug</p>
<p>From: boss@company.com</p>
<p>To: dev@company.com</p>
<p>Date: Mon, 7 Apr 2026 09:00:00 +0200</p>
<p>Please fix the login bug by Friday. It is blocking the release.</p>
</body>
</html>"""
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def sample_thread_email_html() -> str:
    """Three-level email thread: Alice's latest reply followed by two
    nested quoted messages, each introduced by an "On ... wrote:" line.

    The sender addresses in the attribution lines are entity-escaped
    (``&lt;bob@co.com&gt;``), as they are in valid HTML. Written as a bare
    ``<bob@co.com>`` they would be parsed as an unknown tag, which removes
    the address from the text and breaks the attribution line in two.
    """
    return """<!DOCTYPE html>
<html><body>
<p>From: alice@co.com</p>
<p>Subject: Re: Re: Deploy plan</p>
<p>Sure, I'll handle the deploy.</p>

<p>On Mon, Apr 6, 2026 at 3:00 PM, Bob &lt;bob@co.com&gt; wrote:</p>
<blockquote>
<p>From: bob@co.com</p>
<p>Can you handle the deploy?</p>
<p>On Sun, Apr 5, 2026 at 1:00 PM, Alice &lt;alice@co.com&gt; wrote:</p>
<blockquote>
<p>From: alice@co.com</p>
<p>Let's plan the deploy for Monday.</p>
</blockquote>
</blockquote>
</body></html>"""
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def sample_heavy_html_email() -> str:
    """Table-layout email with a multi-rule <style> block; From/Subject/Date
    labels and their values sit in separate table cells."""
    return """<!DOCTYPE html>
<html><head>
<style>
table { border-collapse: collapse; width: 100%; }
td { padding: 8px; border: 1px solid #ddd; font-size: 12px; }
.header { background: #003366; color: white; }
.footer { font-size: 10px; color: #999; }
</style>
</head><body>
<table>
<tr class="header"><td colspan="2">Company Newsletter</td></tr>
<tr><td>From:</td><td>newsletter@corp.com</td></tr>
<tr><td>Subject:</td><td>Q1 Results Update</td></tr>
<tr><td>Date:</td><td>Apr 7, 2026</td></tr>
<tr><td colspan="2">
<p>Dear Team,</p>
<p>Q1 results are in. Revenue up 15% year-over-year.</p>
<p>Please review the attached report.</p>
</td></tr>
<tr class="footer"><td colspan="2">Confidential — do not forward</td></tr>
</table>
</body></html>"""
|
||||||
|
|
||||||
|
|
||||||
|
# ── Helper ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def _score(name: str, value: float, comment: str = "") -> None:
    """Record one numeric evaluation score in Langfuse, if a client exists.

    Does nothing when ``get_langfuse()`` returns a falsy value. Otherwise a
    trace named ``eval-<name>`` is created, the score is attached to it and
    the client is flushed.
    """
    client = get_langfuse()
    if not client:
        return

    trace = client.trace(name=f"eval-{name}")
    client.score(
        trace_id=trace.id,
        name=name,
        value=value,
        data_type="NUMERIC",
        comment=comment,
    )
    client.flush()
|
||||||
|
|
||||||
|
|
||||||
|
# ── 1.1 — Detect email HTML ───────────────────────────────────────────
|
||||||
|
|
||||||
|
def test_detect_email_html(sample_email_html):
    """1.1 -- an .html file carrying email headers is detected as email."""
    detected = detect_content_type("email_export.html", sample_email_html)
    # Score first so the result is recorded even when the assertion fails.
    _score("preprocess.detect_email", float(detected == "email_html"))
    assert detected == "email_html", f"Expected 'email_html', got '{detected}'"
|
||||||
|
|
||||||
|
|
||||||
|
# ── 1.2 — Detect generic HTML ─────────────────────────────────────────
|
||||||
|
|
||||||
|
def test_detect_generic_html():
    """1.2 -- a page with nav/main structure is detected as generic HTML."""
    page = """<!DOCTYPE html><html><head><title>My App</title></head>
<body><nav><a href="/">Home</a></nav><main><p>Welcome</p></main></body></html>"""
    detected = detect_content_type("index.html", page)
    # Score first so the result is recorded even when the assertion fails.
    _score("preprocess.detect_generic", float(detected == "generic_html"))
    assert detected == "generic_html", f"Expected 'generic_html', got '{detected}'"
|
||||||
|
|
||||||
|
|
||||||
|
# ── 1.3 — Detect plain text ───────────────────────────────────────────
|
||||||
|
|
||||||
|
def test_detect_plain_text():
    """1.3 -- a .txt file is detected as plain text."""
    detected = detect_content_type("notes.txt", "Just some notes here.\nNo HTML at all.")
    # Score first so the result is recorded even when the assertion fails.
    _score("preprocess.detect_text", float(detected == "plain_text"))
    assert detected == "plain_text", f"Expected 'plain_text', got '{detected}'"
|
||||||
|
|
||||||
|
|
||||||
|
# ── 1.4 — Detect unknown ──────────────────────────────────────────────
|
||||||
|
|
||||||
|
def test_detect_unknown():
    """1.4 -- binary-looking content with an unrecognised extension."""
    # Control characters interleaved with text stand in for binary data.
    payload = "some\x00\x01\x02\x03\x04\x05content" * 20
    detected = detect_content_type("archive.xyz", payload)
    # Score first so the result is recorded even when the assertion fails.
    _score("preprocess.detect_unknown", float(detected == "unknown"))
    assert detected == "unknown", f"Expected 'unknown', got '{detected}'"
|
||||||
|
|
||||||
|
|
||||||
|
# ── 1.5 — Email: strip HTML tags ─────────────────────────────────────
|
||||||
|
|
||||||
|
def test_email_strip_html(sample_email_html):
    """1.5 -- the email handler returns tag-free, non-trivial text."""
    cleaned = preprocess("email_html", sample_email_html).clean_text
    has_no_tags = "<" not in cleaned
    has_content = len(cleaned) > 50
    # Size of the clean text relative to the raw markup; only feeds the score.
    ratio = len(cleaned) / len(sample_email_html)

    if has_no_tags and has_content and ratio < 0.8:
        score = 1.0
    else:
        score = 0.0
    _score("preprocess.email_strip", score, f"ratio={ratio:.2f}, len={len(cleaned)}")

    assert has_no_tags, "clean_text still contains HTML tags"
    assert has_content, "clean_text is too short"
|
||||||
|
|
||||||
|
|
||||||
|
# ── 1.6 — Email: extract metadata ────────────────────────────────────
|
||||||
|
|
||||||
|
def test_email_extract_metadata(sample_email_html):
    """1.6 -- subject and sender are present in the extracted metadata."""
    meta = preprocess("email_html", sample_email_html).metadata
    has_subject = bool(meta.get("subject"))
    has_from = bool(meta.get("from"))

    # Half a point per field found: 1.0 for both, 0.5 for one, 0.0 for none.
    score = (has_subject + has_from) / 2
    _score(
        "preprocess.email_metadata",
        score,
        f"subject={meta.get('subject')}, from={meta.get('from')}",
    )

    assert has_subject, f"metadata missing 'subject'. Got: {meta}"
    assert has_from, f"metadata missing 'from'. Got: {meta}"
|
||||||
|
|
||||||
|
|
||||||
|
# ── 1.7 — Email: split thread ─────────────────────────────────────────
|
||||||
|
|
||||||
|
def test_email_split_thread(sample_thread_email_html):
    """1.7 -- only the newest message of a thread survives preprocessing."""
    cleaned = preprocess("email_html", sample_thread_email_html).clean_text

    # Newest reply must be kept; the oldest quoted message must be gone.
    has_latest = "Sure, I'll handle the deploy" in cleaned
    lacks_quoted = "Let's plan the deploy" not in cleaned

    if not has_latest:
        score = 0.0
    elif lacks_quoted:
        score = 1.0
    else:
        score = 0.5
    _score(
        "preprocess.email_thread",
        score,
        f"has_latest={has_latest}, lacks_quoted={lacks_quoted}",
    )

    assert has_latest, "Latest message not found in clean_text"
    assert lacks_quoted, "Quoted older message leaked into clean_text"
|
||||||
|
|
||||||
|
|
||||||
|
# ── 1.8 — Email: single message (no thread) ──────────────────────────
|
||||||
|
|
||||||
|
def test_email_single_message():
    """1.8 -- an email without any thread markers keeps its body."""
    message = """<!DOCTYPE html><html><body>
<p>From: alice@co.com</p>
<p>Subject: Quick update</p>
<p>The deploy is done. Everything looks good.</p>
</body></html>"""
    cleaned = preprocess("email_html", message).clean_text
    has_body = "deploy is done" in cleaned
    # Score first so the result is recorded even when the assertion fails.
    _score("preprocess.email_single", float(has_body))
    assert has_body, "Body of single message not found in clean_text"
|
||||||
|
|
||||||
|
|
||||||
|
# ── 1.9 — Email: heavy HTML (table layout) ───────────────────────────
|
||||||
|
|
||||||
|
def test_email_heavy_html(sample_heavy_html_email):
    """1.9 -- table-layout email: no tags and no CSS in the clean text."""
    cleaned = preprocess("email_html", sample_heavy_html_email).clean_text
    has_no_tags = "<" not in cleaned
    has_content = len(cleaned) > 30
    # Neither CSS property from the <style> block may show up in the text.
    no_css = not ("border-collapse" in cleaned or "font-size" in cleaned)

    score = float(has_no_tags and has_content and no_css)
    _score(
        "preprocess.email_heavy_html",
        score,
        f"no_tags={has_no_tags}, has_content={has_content}, no_css={no_css}",
    )

    assert has_no_tags, "HTML tags found in clean_text"
    assert has_content, "clean_text is empty"
    assert no_css, "CSS properties leaked into clean_text"
|
||||||
|
|
||||||
|
|
||||||
|
# ── 1.10 — Fallback: unknown file type ───────────────────────────────
|
||||||
|
|
||||||
|
def test_fallback_unknown_content():
    """1.10 -- unknown content goes through the generic fallback handler."""
    raw = "random text content without any structure\nline two\nline three"
    outcome = preprocess("unknown", raw)
    has_text = len(outcome.clean_text) > 0
    # Score first so the result is recorded even when the assertion fails.
    _score("preprocess.fallback", float(has_text))
    assert has_text, "fallback handler returned empty clean_text"
    assert outcome.content_type == "unknown"
|
||||||
Reference in New Issue
Block a user