"""Preprocessor for email HTML files. Handles: - HTML stripping via BeautifulSoup - Metadata extraction (Subject, From, To, Date) - Thread splitting — isolates the latest reply """ from __future__ import annotations import re from typing import TYPE_CHECKING from app.core.preprocessors.base import PreprocessResult if TYPE_CHECKING: pass # ── Thread split markers ────────────────────────────────────────────── # Matches patterns like: # "On Mon, Apr 7, 2026 at 10:00 AM, Alice wrote:" # "-----Original Message-----" # "> " (plain-text quote prefix) _THREAD_PATTERNS = [ re.compile(r"^On\s+.+wrote\s*:", re.IGNORECASE | re.MULTILINE), re.compile(r"^-{3,}\s*(original message|forwarded message)\s*-{3,}", re.IGNORECASE | re.MULTILINE), re.compile(r"^>{1,}\s+\S", re.MULTILINE), re.compile(r"^From:\s+.+\nSent:\s+", re.IGNORECASE | re.MULTILINE), ] # ── Metadata patterns (applied on raw HTML / plain fallback) ────────── _META_PATTERNS: dict[str, list[re.Pattern]] = { "subject": [ re.compile(r"(.+?)", re.IGNORECASE | re.DOTALL), re.compile(r"Subject:\s*(.+)", re.IGNORECASE), ], "from": [ re.compile(r']+name=["\']?from["\']?[^>]+content=["\']([^"\']+)["\']', re.IGNORECASE), re.compile(r"From:\s*(.+)", re.IGNORECASE), ], "to": [ re.compile(r']+name=["\']?to["\']?[^>]+content=["\']([^"\']+)["\']', re.IGNORECASE), re.compile(r"To:\s*(.+)", re.IGNORECASE), ], "date": [ re.compile(r']+name=["\']?date["\']?[^>]+content=["\']([^"\']+)["\']', re.IGNORECASE), re.compile(r"Date:\s*(.+)", re.IGNORECASE), re.compile(r"Sent:\s*(.+)", re.IGNORECASE), ], } def _extract_metadata(raw_html: str, text: str) -> dict: """Extract Subject/From/To/Date from raw HTML or plain text.""" metadata: dict[str, str] = {} for field, patterns in _META_PATTERNS.items(): for pat in patterns: m = pat.search(raw_html) or pat.search(text) if m: metadata[field] = m.group(1).strip() break return metadata def _split_thread(text: str) -> str: """Return only the latest message in a threaded email.""" earliest_pos: int | None = None for pat in _THREAD_PATTERNS: m = pat.search(text) if m and (earliest_pos is None or m.start() < earliest_pos): earliest_pos = m.start() if earliest_pos is not None and earliest_pos > 0: return text[:earliest_pos].strip() return text.strip() def preprocess_email_html(raw_content: str) -> PreprocessResult: """Strip HTML, extract metadata, split thread from an email HTML file.""" try: from bs4 import BeautifulSoup # lazy import — optional dep except ImportError as exc: raise ImportError( "beautifulsoup4 is required for email_html preprocessing. " "Install it with: pip install beautifulsoup4" ) from exc # Parse with lxml if available, fall back to html.parser try: soup = BeautifulSoup(raw_content, "lxml") except Exception: soup = BeautifulSoup(raw_content, "html.parser") # Remove noise tags for tag in soup(["style", "script", "head", "noscript"]): tag.decompose() clean_text = soup.get_text(separator="\n") # Collapse excessive blank lines clean_text = re.sub(r"\n{3,}", "\n\n", clean_text).strip() metadata = _extract_metadata(raw_content, clean_text) latest_message = _split_thread(clean_text) return PreprocessResult( content_type="email_html", clean_text=latest_message, metadata=metadata, )