- app/core/preprocessors/__init__.py: detect_content_type + preprocess dispatcher - app/core/preprocessors/base.py: PreprocessResult dataclass - app/core/preprocessors/email_html.py: BeautifulSoup HTML stripping, metadata extraction, thread splitting - requirements.txt: add beautifulsoup4 and lxml - tests/test_preprocessors.py: 10 tests with Langfuse scoring (preprocess.* scores) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
112 lines
3.7 KiB
Python
112 lines
3.7 KiB
Python
"""Preprocessor for email HTML files.
|
|
|
|
Handles:
|
|
- HTML stripping via BeautifulSoup
|
|
- Metadata extraction (Subject, From, To, Date)
|
|
- Thread splitting — isolates the latest reply
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
from typing import TYPE_CHECKING
|
|
|
|
from app.core.preprocessors.base import PreprocessResult
|
|
|
|
if TYPE_CHECKING:
|
|
pass
|
|
|
|
# ── Thread split markers ──────────────────────────────────────────────
|
|
|
|
# Matches patterns like:
#   "On Mon, Apr 7, 2026 at 10:00 AM, Alice <alice@co.com> wrote:"
#   "-----Original Message-----" / "-----Forwarded Message-----"
#   "> " (plain-text quote prefix)
#   a "From: ..." line immediately followed by a "Sent: ..." line
|
|
# Compiled markers whose first occurrence indicates where quoted or earlier
# thread content begins. Each entry is (regex source, flags); all are
# line-anchored through re.MULTILINE.
_THREAD_PATTERNS = [
    re.compile(source, flags)
    for source, flags in (
        # Reply attribution line ending in "wrote:"
        (r"^On\s+.+wrote\s*:", re.IGNORECASE | re.MULTILINE),
        # Dashed "Original Message" / "Forwarded Message" separator
        (
            r"^-{3,}\s*(original message|forwarded message)\s*-{3,}",
            re.IGNORECASE | re.MULTILINE,
        ),
        # Plain-text quote prefix ("> quoted text"); case is irrelevant here
        (r"^>{1,}\s+\S", re.MULTILINE),
        # "From:" line directly followed by a "Sent:" line
        (r"^From:\s+.+\nSent:\s+", re.IGNORECASE | re.MULTILINE),
    )
]
|
|
|
|
# ── Metadata patterns (applied on raw HTML / plain fallback) ──────────
|
|
|
|
# Field name -> ordered list of candidate patterns; the value is always
# captured in group 1 and the first pattern that matches wins.
#
# HTML-specific patterns (<title>, <meta name="..." content="...">) come first,
# followed by plain "Label: value" header patterns.
#
# The header patterns start with the lookbehind (?<![\w-]) so the label must
# begin a token. Without it the case-insensitive search also matches the tail
# of longer words or headers: "To:" inside "Reply-To:", "Date:" inside
# "Update:", "Sent:" inside "Consent:", "From:" inside "Resent-From:".
#
# The "\s*" after the label is intentionally allowed to span a line break:
# text extracted with one node per line frequently places the value on the
# line after the label.
_META_PATTERNS: dict[str, list[re.Pattern]] = {
    "subject": [
        re.compile(r"<title>(.+?)</title>", re.IGNORECASE | re.DOTALL),
        re.compile(r"(?<![\w-])Subject:\s*(.+)", re.IGNORECASE),
    ],
    "from": [
        re.compile(r'<meta[^>]+name=["\']?from["\']?[^>]+content=["\']([^"\']+)["\']', re.IGNORECASE),
        re.compile(r"(?<![\w-])From:\s*(.+)", re.IGNORECASE),
    ],
    "to": [
        re.compile(r'<meta[^>]+name=["\']?to["\']?[^>]+content=["\']([^"\']+)["\']', re.IGNORECASE),
        re.compile(r"(?<![\w-])To:\s*(.+)", re.IGNORECASE),
    ],
    "date": [
        re.compile(r'<meta[^>]+name=["\']?date["\']?[^>]+content=["\']([^"\']+)["\']', re.IGNORECASE),
        re.compile(r"(?<![\w-])Date:\s*(.+)", re.IGNORECASE),
        re.compile(r"(?<![\w-])Sent:\s*(.+)", re.IGNORECASE),
    ],
}
|
|
|
|
|
|
def _extract_metadata(raw_html: str, text: str) -> dict:
    """Collect the fields defined in ``_META_PATTERNS`` from an email.

    For every field the candidate patterns are tried in their listed order;
    each pattern is searched in ``raw_html`` first and in ``text`` second.
    The first match supplies the value (capture group 1, whitespace-stripped).
    Fields for which nothing matches are left out of the result.

    Args:
        raw_html: The unprocessed email markup.
        text: The plain-text rendering of the same email.

    Returns:
        A dict mapping field name to the extracted string.
    """

    def first_hit(candidates):
        # Pattern order takes priority over source order: a pattern is tried
        # against both sources before moving on to the next pattern.
        for regex in candidates:
            for source in (raw_html, text):
                hit = regex.search(source)
                if hit:
                    return hit
        return None

    found: dict[str, str] = {}
    for name, candidates in _META_PATTERNS.items():
        hit = first_hit(candidates)
        if hit is not None:
            found[name] = hit.group(1).strip()
    return found
|
|
|
|
|
|
def _split_thread(text: str) -> str:
    """Return only the latest message in a threaded email.

    Every pattern in ``_THREAD_PATTERNS`` is searched once; the smallest match
    offset is taken as the start of the quoted/earlier content and everything
    before it is returned, stripped of surrounding whitespace.

    If no marker is found, or if nothing but whitespace precedes the earliest
    marker, the whole text (stripped) is returned instead, so the result is
    never emptied just because the message opens with a marker.

    Args:
        text: Plain-text body of the email.

    Returns:
        The text preceding the earliest thread marker, or the full stripped
        text when that portion would be blank.
    """
    marker_starts = [
        hit.start()
        for hit in (pat.search(text) for pat in _THREAD_PATTERNS)
        if hit
    ]

    if marker_starts:
        latest = text[: min(marker_starts)].strip()
        # A marker at offset 0 or after leading whitespace only leaves no
        # usable "latest" part; fall through and keep the full text.
        if latest:
            return latest
    return text.strip()
|
|
|
|
|
|
def preprocess_email_html(raw_content: str) -> PreprocessResult:
    """Turn an email HTML document into a ``PreprocessResult``.

    Steps:
      1. Parse the markup with BeautifulSoup (``lxml`` parser first,
         ``html.parser`` if constructing the lxml-backed soup raises).
      2. Drop ``style``, ``script``, ``head`` and ``noscript`` elements.
      3. Extract text with one newline between nodes and collapse runs of
         three or more newlines into a single blank line.
      4. Extract metadata from the raw markup and the full cleaned text.
      5. Keep only the part of the text before the first thread marker.

    Args:
        raw_content: The email HTML as a string.

    Returns:
        A ``PreprocessResult`` with ``content_type="email_html"``, the latest
        message as ``clean_text`` and the extracted ``metadata``.

    Raises:
        ImportError: If ``bs4`` cannot be imported.
    """
    try:
        from bs4 import BeautifulSoup  # imported lazily: optional dependency
    except ImportError as exc:
        install_hint = (
            "beautifulsoup4 is required for email_html preprocessing. "
            "Install it with: pip install beautifulsoup4"
        )
        raise ImportError(install_hint) from exc

    # Prefer lxml; any failure to build the soup with it falls back to the
    # parser bundled with the standard library.
    try:
        document = BeautifulSoup(raw_content, "lxml")
    except Exception:
        document = BeautifulSoup(raw_content, "html.parser")

    # Strip elements that carry no message text.
    noise_tags = ["style", "script", "head", "noscript"]
    for element in document.find_all(noise_tags):
        element.decompose()

    extracted = document.get_text(separator="\n")
    # Squash runs of 3+ newlines down to one blank line.
    full_text = re.sub(r"\n{3,}", "\n\n", extracted).strip()

    # Metadata is taken from the raw markup plus the whole thread text,
    # before the thread is cut down to the latest message.
    meta = _extract_metadata(raw_content, full_text)
    newest = _split_thread(full_text)

    return PreprocessResult(
        content_type="email_html",
        clean_text=newest,
        metadata=meta,
    )
|