feat: add preprocessor system (Step 1 — Local Agent V2)
- app/core/preprocessors/__init__.py: detect_content_type + preprocess dispatcher
- app/core/preprocessors/base.py: PreprocessResult dataclass
- app/core/preprocessors/email_html.py: BeautifulSoup HTML stripping, metadata extraction, thread splitting
- requirements.txt: add beautifulsoup4 and lxml
- tests/test_preprocessors.py: 10 tests with Langfuse scoring (preprocess.* scores)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
111
app/core/preprocessors/email_html.py
Normal file
111
app/core/preprocessors/email_html.py
Normal file
@@ -0,0 +1,111 @@
|
||||
"""Preprocessor for email HTML files.
|
||||
|
||||
Handles:
|
||||
- HTML stripping via BeautifulSoup
|
||||
- Metadata extraction (Subject, From, To, Date)
|
||||
- Thread splitting — isolates the latest reply
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from app.core.preprocessors.base import PreprocessResult
|
||||
|
||||
if TYPE_CHECKING:
|
||||
pass
|
||||
|
||||
# ── Thread split markers ──────────────────────────────────────────────
|
||||
|
||||
# Markers that signal the start of quoted / earlier messages in a thread.
# Every pattern is anchored to the start of a line (re.MULTILINE):
#   "On Mon, Apr 7, 2026 at 10:00 AM, Alice <alice@co.com> wrote:"
#   "-----Original Message-----" / "---Forwarded message---"
#   "> quoted text"              (plain-text quote prefix, case-sensitive)
#   "From: ...\nSent: ..."       (a From: line directly followed by Sent:)
_THREAD_PATTERNS = [
    re.compile(source, re.MULTILINE | extra_flags)
    for source, extra_flags in (
        (r"^On\s+.+wrote\s*:", re.IGNORECASE),
        (r"^-{3,}\s*(original message|forwarded message)\s*-{3,}", re.IGNORECASE),
        (r"^>{1,}\s+\S", 0),
        (r"^From:\s+.+\nSent:\s+", re.IGNORECASE),
    )
]
|
||||
|
||||
# ── Metadata patterns (applied on raw HTML / plain fallback) ──────────
|
||||
|
||||
# Candidate patterns per metadata field, tried in order; the first match wins.
# Each pattern exposes the field value as capture group 1.
#
# The plain "Header:" patterns carry a ``(?<![\w-])`` lookbehind so the header
# name must start a token.  Without it the case-insensitive search matched
# inside unrelated text: "To:" in "mailto:", "Date:" in "update:", "Sent:" in
# "consent:", and "To:" in "Reply-To:".
_META_PATTERNS: dict[str, list[re.Pattern[str]]] = {
    "subject": [
        re.compile(r"<title>(.+?)</title>", re.IGNORECASE | re.DOTALL),
        re.compile(r"(?<![\w-])Subject:\s*(.+)", re.IGNORECASE),
    ],
    "from": [
        re.compile(r'<meta[^>]+name=["\']?from["\']?[^>]+content=["\']([^"\']+)["\']', re.IGNORECASE),
        re.compile(r"(?<![\w-])From:\s*(.+)", re.IGNORECASE),
    ],
    "to": [
        re.compile(r'<meta[^>]+name=["\']?to["\']?[^>]+content=["\']([^"\']+)["\']', re.IGNORECASE),
        re.compile(r"(?<![\w-])To:\s*(.+)", re.IGNORECASE),
    ],
    "date": [
        re.compile(r'<meta[^>]+name=["\']?date["\']?[^>]+content=["\']([^"\']+)["\']', re.IGNORECASE),
        re.compile(r"(?<![\w-])Date:\s*(.+)", re.IGNORECASE),
        re.compile(r"(?<![\w-])Sent:\s*(.+)", re.IGNORECASE),
    ],
}
|
||||
|
||||
|
||||
def _extract_metadata(raw_html: str, text: str) -> dict[str, str]:
    """Extract Subject/From/To/Date values from an email.

    For every field in ``_META_PATTERNS`` the patterns are tried in their
    declared order, and the first one that yields a non-empty value wins.

    Each pattern is searched in the stripped ``text`` first and in
    ``raw_html`` second.  Searching the markup first let the plain
    ``Header: value`` patterns capture trailing tags (e.g. ``"Alice</b>"``).
    A pattern that finds nothing in ``text`` still falls back to the raw
    markup, which is how the tag-based patterns can match.

    Args:
        raw_html: The original, unmodified email HTML.
        text: The plain text extracted from ``raw_html``.

    Returns:
        A dict mapping each field name that was found to its stripped value.
        Fields with no non-empty match are omitted.
    """
    metadata: dict[str, str] = {}
    for field, patterns in _META_PATTERNS.items():
        for pat in patterns:
            m = pat.search(text) or pat.search(raw_html)
            if not m:
                continue
            value = m.group(1).strip()
            if value:
                # An empty capture is useless; keep trying later patterns.
                metadata[field] = value
                break
    return metadata
|
||||
|
||||
|
||||
def _split_thread(text: str) -> str:
    """Return only the latest message in a threaded email.

    The text is cut at the earliest position where any pattern in
    ``_THREAD_PATTERNS`` matches, and the part before the cut is returned.

    If no pattern matches, or if nothing but whitespace precedes the earliest
    marker, the whole stripped text is returned so the result is never
    emptied by a marker at the very top.  Previously this fallback only
    applied when the marker sat exactly at offset 0; leading whitespace
    before the marker produced an empty string.

    Args:
        text: Plain-text email body, possibly containing quoted history.

    Returns:
        The stripped text before the first thread marker, or the full
        stripped text when that portion would be empty.
    """
    found = (pat.search(text) for pat in _THREAD_PATTERNS)
    starts = [m.start() for m in found if m]
    if not starts:
        return text.strip()

    latest = text[: min(starts)].strip()
    # A blank head means the message opens with a marker: keep everything.
    return latest or text.strip()
|
||||
|
||||
|
||||
def preprocess_email_html(raw_content: str) -> PreprocessResult:
    """Convert a raw email HTML document into a ``PreprocessResult``.

    Steps: parse the markup, drop ``style``/``script``/``head``/``noscript``
    elements, flatten the rest to newline-separated text, collapse runs of
    three or more newlines to two, extract metadata, and cut the text at the
    first thread marker.

    Args:
        raw_content: The email HTML as a string.

    Returns:
        A ``PreprocessResult`` with ``content_type="email_html"``, the latest
        message as ``clean_text``, and the extracted ``metadata``.

    Raises:
        ImportError: If ``bs4`` cannot be imported.
    """
    # Imported on demand so this module can be loaded without bs4 installed.
    try:
        from bs4 import BeautifulSoup
    except ImportError as exc:
        message = (
            "beautifulsoup4 is required for email_html preprocessing. "
            "Install it with: pip install beautifulsoup4"
        )
        raise ImportError(message) from exc

    # Try the lxml backend first; on any failure use the built-in parser.
    try:
        document = BeautifulSoup(raw_content, "lxml")
    except Exception:
        document = BeautifulSoup(raw_content, "html.parser")

    # Elements whose content is not part of the readable message.
    for noise in document.find_all(["style", "script", "head", "noscript"]):
        noise.decompose()

    flattened = document.get_text(separator="\n")
    flattened = re.sub(r"\n{3,}", "\n\n", flattened).strip()

    # Metadata is taken from the full text, before the thread is cut.
    headers = _extract_metadata(raw_content, flattened)
    newest_reply = _split_thread(flattened)

    return PreprocessResult(
        content_type="email_html",
        clean_text=newest_reply,
        metadata=headers,
    )
|
||||
Reference in New Issue
Block a user