diff --git a/app/core/preprocessors/__init__.py b/app/core/preprocessors/__init__.py
new file mode 100644
index 0000000..3b72e3d
--- /dev/null
+++ b/app/core/preprocessors/__init__.py
@@ -0,0 +1,114 @@
+"""Preprocessor registry: detect content type and dispatch to handlers.
+
+Public API
+----------
+detect_content_type(filename, raw_content) -> str
+    Heuristic detection based on file extension and content patterns.
+
+preprocess(content_type, raw_content) -> PreprocessResult
+    Dispatch to the appropriate handler.
+"""
+
+from __future__ import annotations
+
+import re
+
+from app.core.preprocessors.base import PreprocessResult
+
+# ── Heuristics ────────────────────────────────────────────────────────
+
+# Only this many leading characters are inspected by the content heuristics.
+_SNIFF_CHARS = 4096
+
+# File extensions whose content is sniffed as HTML.
+_HTML_EXTENSIONS = frozenset({"html", "htm", "eml", "mhtml", "mht"})
+
+# Header names that strongly suggest an email HTML file.  A header only counts
+# at the start of a line or directly after a tag, so ``mailto:`` links and
+# running text such as "go to: ..." do not classify a web page as an email.
+_EMAIL_SIGNALS = re.compile(
+    r"(?:^|>)\s*(?:Subject|From|To|Date|Sent|MIME-Version):"
+    r"|Content-Type:\s*text/html",
+    re.IGNORECASE | re.MULTILINE,
+)
+
+# Patterns that suggest a generic HTML page (not an email)
+_GENERIC_HTML_SIGNALS = re.compile(
+    r"<(nav|main|header|footer|article|section)\b",
+    re.IGNORECASE,
+)
+
+
+def detect_content_type(filename: str, raw_content: str) -> str:
+    """Return a content-type string for the given file.
+
+    Supported types: ``"email_html"``, ``"generic_html"``,
+    ``"plain_text"``, ``"unknown"``.
+
+    The extension decides which heuristics run, and only the first
+    ``_SNIFF_CHARS`` characters of *raw_content* are inspected.  Anything
+    that is neither ``.txt`` nor recognizable HTML is ``"unknown"``.
+    """
+    ext = filename.rsplit(".", 1)[-1].lower() if "." in filename else ""
+
+    if ext == "txt":
+        return "plain_text"
+
+    if ext in _HTML_EXTENSIONS:
+        sample = raw_content[:_SNIFF_CHARS]
+        # Prefer email detection over generic HTML
+        if _EMAIL_SIGNALS.search(sample):
+            return "email_html"
+        if _GENERIC_HTML_SIGNALS.search(sample) or "<html" in sample.lower():
+            return "generic_html"
+
+    # Binary-looking and unrecognized content are both reported as "unknown";
+    # preprocess() routes that type to the generic fallback handler.
+    return "unknown"
+
+
+# ── Generic fallback handler ──────────────────────────────────────────
+
+def _preprocess_generic(raw_content: str, content_type: str) -> PreprocessResult:
+    """Return *raw_content* as clean text, stripping HTML markup if needed.
+
+    ``"plain_text"`` is passed through untouched so that literal ``<``/``>``
+    characters (code, maths, quoted addresses) are not mistaken for tags.
+    Every other type is treated as possibly-HTML: ``<script>``/``<style>``
+    bodies are dropped and the remaining tags are stripped.
+    """
+    if content_type == "plain_text":
+        text = raw_content
+    else:
+        try:
+            from bs4 import BeautifulSoup
+        except ImportError:
+            # No BeautifulSoup — drop script/style bodies, then tags, by regex
+            text = re.sub(r"(?is)<(script|style)\b.*?</\1\s*>", "", raw_content)
+            text = re.sub(r"<[^>]+>", "", text)
+        else:
+            soup = BeautifulSoup(raw_content, "html.parser")
+            # Script and style bodies are code, not human-readable text
+            for tag in soup(["style", "script", "noscript"]):
+                tag.decompose()
+            text = soup.get_text(separator="\n")
+
+    text = re.sub(r"\n{3,}", "\n\n", text).strip()
+    return PreprocessResult(content_type=content_type, clean_text=text, metadata={})
+
+
+# ── Dispatch ──────────────────────────────────────────────────────────
+
+def preprocess(content_type: str, raw_content: str) -> PreprocessResult:
+    """Dispatch *raw_content* to the handler registered for *content_type*.
+
+    Falls back to the generic handler for unknown types.
+    """
+    if content_type == "email_html":
+        from app.core.preprocessors.email_html import preprocess_email_html
+        return preprocess_email_html(raw_content)
+
+    return _preprocess_generic(raw_content, content_type)
+
+
+__all__ = ["detect_content_type", "preprocess", "PreprocessResult"]
diff --git a/app/core/preprocessors/base.py b/app/core/preprocessors/base.py
new file mode 100644
index 0000000..904ea0b
--- /dev/null
+++ b/app/core/preprocessors/base.py
@@ -0,0 +1,25 @@
+"""Base types for the preprocessor system."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+
+
+@dataclass
+class PreprocessResult:
+    """Output of a preprocessor handler.
+
+    Attributes
+    ----------
+    content_type:
+        The detected content type (e.g. ``"email_html"``, ``"plain_text"``).
+    clean_text:
+        Human-readable text stripped of markup/binary noise.
+    metadata:
+        Dict of extracted metadata (keys vary by handler).
+        Common keys: ``subject``, ``from``, ``to``, ``date``, ``filename``.
+    """
+
+    content_type: str
+    clean_text: str
+    metadata: dict = field(default_factory=dict)
diff --git a/app/core/preprocessors/email_html.py b/app/core/preprocessors/email_html.py
new file mode 100644
index 0000000..d108cff
--- /dev/null
+++ b/app/core/preprocessors/email_html.py
@@ -0,0 +1,146 @@
+"""Preprocessor for email HTML files.
+
+Handles:
+- HTML stripping via BeautifulSoup
+- Metadata extraction (Subject, From, To, Date)
+- Thread splitting — isolates the latest reply
+"""
+
+from __future__ import annotations
+
+import html
+import re
+
+from app.core.preprocessors.base import PreprocessResult
+
+# ── Thread split markers ──────────────────────────────────────────────
+
+# Matches patterns like:
+#   "On Mon, Apr 7, 2026 at 10:00 AM, Alice wrote:"
+#   "-----Original Message-----"
+#   "> " (plain-text quote prefix)
+#   "From: ..." immediately followed by a "Sent: ..." line
+_THREAD_PATTERNS = [
+    re.compile(r"^On\s+.+wrote\s*:", re.IGNORECASE | re.MULTILINE),
+    re.compile(r"^-{3,}\s*(original message|forwarded message)\s*-{3,}", re.IGNORECASE | re.MULTILINE),
+    re.compile(r"^>{1,}\s+\S", re.MULTILINE),
+    re.compile(r"^From:\s+.+\nSent:\s+", re.IGNORECASE | re.MULTILINE),
+]
+
+# ── Metadata patterns (applied on raw HTML / plain fallback) ──────────
+
+
+def _meta_tag(name: str) -> re.Pattern:
+    """Build a pattern capturing ``content`` of ``<meta name=NAME ...>``."""
+    return re.compile(
+        r'<meta[^>]+name=["\']?' + name + r'["\']?[^>]+content=["\']([^"\']+)["\']',
+        re.IGNORECASE,
+    )
+
+
+def _header_line(name: str) -> re.Pattern:
+    """Build a pattern capturing the value of a ``Name: value`` header line."""
+    return re.compile(r"^[ \t]*" + name + r":\s*(.+)$", re.IGNORECASE | re.MULTILINE)
+
+
+# Applied to the raw HTML: values that live in markup.
+_MARKUP_PATTERNS: dict[str, list[re.Pattern]] = {
+    "subject": [re.compile(r"<title[^>]*>(.+?)</title>", re.IGNORECASE | re.DOTALL)],
+    "from": [_meta_tag("from")],
+    "to": [_meta_tag("to")],
+    "date": [_meta_tag("date")],
+}
+
+# Applied to the tag-free text: ``Name: value`` header lines.  Searching the
+# text (not the raw HTML) keeps trailing markup such as ``</p>`` out of the
+# values, and the line anchor keeps ``To:`` from matching inside ``mailto:``.
+_HEADER_PATTERNS: dict[str, list[re.Pattern]] = {
+    "subject": [_header_line("Subject")],
+    "from": [_header_line("From")],
+    "to": [_header_line("To")],
+    "date": [_header_line("Date"), _header_line("Sent")],
+}
+
+
+def _first_match(patterns: list[re.Pattern], haystack: str) -> str:
+    """Return the first non-empty capture among *patterns*, or ``""``.
+
+    Runs of whitespace in the captured value are collapsed to single spaces.
+    """
+    for pat in patterns:
+        m = pat.search(haystack)
+        if m:
+            value = " ".join(m.group(1).split())
+            if value:
+                return value
+    return ""
+
+
+def _extract_metadata(raw_html: str, text: str) -> dict:
+    """Extract Subject/From/To/Date from raw HTML or plain text.
+
+    Markup-borne values (``<title>``, ``<meta>``) are read from *raw_html*
+    and entity-decoded; ``Name: value`` header lines are read from *text*,
+    which is already tag-free.  Fields without a value are omitted.
+    """
+    metadata: dict[str, str] = {}
+    for key, markup_patterns in _MARKUP_PATTERNS.items():
+        value = html.unescape(_first_match(markup_patterns, raw_html))
+        if not value:
+            value = _first_match(_HEADER_PATTERNS[key], text)
+        if value:
+            metadata[key] = value
+    return metadata
+
+
+def _split_thread(text: str) -> str:
+    """Return only the latest message in a threaded email.
+
+    The text is cut at the earliest thread marker.  A marker at offset 0
+    would leave nothing, so in that case the whole text is kept.
+    """
+    matches = (pat.search(text) for pat in _THREAD_PATTERNS)
+    cut = min((m.start() for m in matches if m), default=0)
+    return (text[:cut] if cut > 0 else text).strip()
+
+
+def preprocess_email_html(raw_content: str) -> PreprocessResult:
+    """Strip HTML, extract metadata, split thread from an email HTML file.
+
+    Raises
+    ------
+    ImportError
+        If ``beautifulsoup4`` is not installed.
+    """
+    try:
+        # lazy import — optional dep
+        from bs4 import BeautifulSoup, FeatureNotFound, ParserRejectedMarkup
+    except ImportError as exc:
+        raise ImportError(
+            "beautifulsoup4 is required for email_html preprocessing. "
+            "Install it with: pip install beautifulsoup4"
+        ) from exc
+
+    # Parse with lxml if available, fall back to html.parser.  Only parser
+    # selection/rejection errors trigger the fallback; anything else is a bug.
+    try:
+        soup = BeautifulSoup(raw_content, "lxml")
+    except (FeatureNotFound, ParserRejectedMarkup):
+        soup = BeautifulSoup(raw_content, "html.parser")
+
+    # Remove noise tags
+    for tag in soup(["style", "script", "head", "noscript"]):
+        tag.decompose()
+
+    clean_text = soup.get_text(separator="\n")
+    # Collapse excessive blank lines
+    clean_text = re.sub(r"\n{3,}", "\n\n", clean_text).strip()
+
+    metadata = _extract_metadata(raw_content, clean_text)
+    latest_message = _split_thread(clean_text)
+
+    return PreprocessResult(
+        content_type="email_html",
+        clean_text=latest_message,
+        metadata=metadata,
+    )
diff --git a/requirements.txt b/requirements.txt
index 023fe42..6a7b5a6 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -33,4 +33,6 @@ google-auth-httplib2>=0.2.0
 msal>=1.28.0
 cryptography>=42.0.0
 langfuse>=2.0.0
+beautifulsoup4>=4.12.0
+lxml>=5.0.0
 ruff>=0.8.0
diff --git a/tests/test_preprocessors.py b/tests/test_preprocessors.py
new file mode 100644
index 0000000..83b68cd
--- /dev/null
+++ b/tests/test_preprocessors.py
@@ -0,0 +1,242 @@
+"""Tests for the preprocessor system (Step 1).
+
+Test IDs map to the plan:
+  1.1 detect_email, 1.2 detect_generic, 1.3 detect_text, 1.4 detect_unknown
+  1.5 email_strip, 1.6 email_metadata, 1.7 email_thread, 1.8 email_single
+  1.9 email_heavy_html, 1.10 fallback
+
+Run:
+    pytest tests/test_preprocessors.py -v
+
+Langfuse scores are sent when LANGFUSE_SECRET_KEY / LANGFUSE_PUBLIC_KEY are set.
+"""
+
+from __future__ import annotations
+
+import pytest
+
+from app.core.preprocessors import detect_content_type, preprocess
+from app.core.langfuse_client import get_langfuse
+
+# ── Fixtures ──────────────────────────────────────────────────────────
+
+
+@pytest.fixture
+def sample_email_html() -> str:
+    return """<html>
+<head>
+  <title>Fix the login bug</title>
+</head>
+<body>
+  <p>Subject: Fix the login bug</p>
+  <p>From: boss@company.com</p>
+  <p>To: dev@company.com</p>
+  <p>Date: Mon, 7 Apr 2026 09:00:00 +0200</p>
+  <p>Please fix the login bug by Friday. It is blocking the release.</p>
+</body>
+</html>"""
+
+
+@pytest.fixture
+def sample_thread_email_html() -> str:
+    return """<html>
+<body>
+  <p>From: alice@co.com</p>
+  <p>Subject: Re: Re: Deploy plan</p>
+  <p>Sure, I'll handle the deploy.</p>
+
+  <p>On Mon, Apr 6, 2026 at 3:00 PM, Bob &lt;bob@co.com&gt; wrote:</p>
+  <blockquote>
+    <p>From: bob@co.com</p>
+    <p>Can you handle the deploy?</p>
+    <p>On Sun, Apr 5, 2026 at 1:00 PM, Alice &lt;alice@co.com&gt; wrote:</p>
+    <blockquote>
+      <p>From: alice@co.com</p>
+      <p>Let's plan the deploy for Monday.</p>
+    </blockquote>
+  </blockquote>
+</body>
+</html>"""
+
+
+@pytest.fixture
+def sample_heavy_html_email() -> str:
+    return """<html>
+<head>
+  <style>
+    table { border-collapse: collapse; width: 600px; }
+    td { font-size: 14px; padding: 8px; }
+  </style>
+</head>
+<body>
+  <table>
+    <tr><td colspan="2">Company Newsletter</td></tr>
+    <tr><td>From:</td><td>newsletter@corp.com</td></tr>
+    <tr><td>Subject:</td><td>Q1 Results Update</td></tr>
+    <tr><td>Date:</td><td>Apr 7, 2026</td></tr>
+    <tr><td colspan="2">
+      <p>Dear Team,</p>
+      <p>Q1 results are in. Revenue up 15% year-over-year.</p>
+      <p>Please review the attached report.</p>
+    </td></tr>
+  </table>
+</body>
+</html>"""
+
+
+# ── Helper ────────────────────────────────────────────────────────────
+
+def _score(name: str, value: float, comment: str = "") -> None:
+    lf = get_langfuse()
+    if lf:
+        trace = lf.trace(name=f"eval-{name}")
+        lf.score(trace_id=trace.id, name=name, value=value,
+                 data_type="NUMERIC", comment=comment)
+        lf.flush()
+
+
+# ── 1.1 — Detect email HTML ───────────────────────────────────────────
+
+def test_detect_email_html(sample_email_html):
+    ct = detect_content_type("email_export.html", sample_email_html)
+    score = 1.0 if ct == "email_html" else 0.0
+    _score("preprocess.detect_email", score)
+    assert ct == "email_html", f"Expected 'email_html', got '{ct}'"
+
+
+# ── 1.2 — Detect generic HTML ─────────────────────────────────────────
+
+def test_detect_generic_html():
+    generic = """<html><head><title>My App</title></head><body>
+<nav><a href="/">Home</a></nav>
+<main><h1>Welcome</h1></main>
+</body></html>"""
+    ct = detect_content_type("index.html", generic)
+    score = 1.0 if ct == "generic_html" else 0.0
+    _score("preprocess.detect_generic", score)
+    assert ct == "generic_html", f"Expected 'generic_html', got '{ct}'"
+
+
+# ── 1.3 — Detect plain text ───────────────────────────────────────────
+
+def test_detect_plain_text():
+    ct = detect_content_type("notes.txt", "Just some notes here.\nNo HTML at all.")
+    score = 1.0 if ct == "plain_text" else 0.0
+    _score("preprocess.detect_text", score)
+    assert ct == "plain_text", f"Expected 'plain_text', got '{ct}'"
+
+
+# ── 1.4 — Detect unknown ──────────────────────────────────────────────
+
+def test_detect_unknown():
+    # Simulate binary-like content with non-printable chars
+    binary_like = "some\x00\x01\x02\x03\x04\x05content" * 20
+    ct = detect_content_type("archive.xyz", binary_like)
+    score = 1.0 if ct == "unknown" else 0.0
+    _score("preprocess.detect_unknown", score)
+    assert ct == "unknown", f"Expected 'unknown', got '{ct}'"
+
+
+# ── 1.5 — Email: strip HTML tags ─────────────────────────────────────
+
+def test_email_strip_html(sample_email_html):
+    result = preprocess("email_html", sample_email_html)
+    has_no_tags = "<" not in result.clean_text
+    has_content = len(result.clean_text) > 50
+    ratio = len(result.clean_text) / len(sample_email_html)
+    score = 1.0 if (has_no_tags and has_content and ratio < 0.8) else 0.0
+    _score("preprocess.email_strip", score, f"ratio={ratio:.2f}, len={len(result.clean_text)}")
+    assert has_no_tags, "clean_text still contains HTML tags"
+    assert has_content, "clean_text is too short"
+
+
+# ── 1.6 — Email: extract metadata ────────────────────────────────────
+
+def test_email_extract_metadata(sample_email_html):
+    result = preprocess("email_html", sample_email_html)
+    has_subject = bool(result.metadata.get("subject"))
+    has_from = bool(result.metadata.get("from"))
+    score = 1.0 if (has_subject and has_from) else 0.5 if (has_subject or has_from) else 0.0
+    _score("preprocess.email_metadata", score,
+           f"subject={result.metadata.get('subject')}, from={result.metadata.get('from')}")
+    assert has_subject, f"metadata missing 'subject'. Got: {result.metadata}"
+    assert has_from, f"metadata missing 'from'. Got: {result.metadata}"
+
+
+# ── 1.7 — Email: split thread ─────────────────────────────────────────
+
+def test_email_split_thread(sample_thread_email_html):
+    result = preprocess("email_html", sample_thread_email_html)
+    # The latest message is "Sure, I'll handle the deploy."
+    # Quoted content from Bob/Alice should not appear in clean_text
+    has_latest = "Sure, I'll handle the deploy" in result.clean_text
+    lacks_quoted = "Let's plan the deploy" not in result.clean_text
+    score = 1.0 if (has_latest and lacks_quoted) else 0.5 if has_latest else 0.0
+    _score("preprocess.email_thread", score,
+           f"has_latest={has_latest}, lacks_quoted={lacks_quoted}")
+    assert has_latest, "Latest message not found in clean_text"
+    assert lacks_quoted, "Quoted older message leaked into clean_text"
+
+
+# ── 1.8 — Email: single message (no thread) ──────────────────────────
+
+def test_email_single_message():
+    single = """<html><body>
+<p>From: alice@co.com</p>
+<p>Subject: Quick update</p>
+<p>The deploy is done. Everything looks good.</p>
+</body></html>"""
+    result = preprocess("email_html", single)
+    has_body = "deploy is done" in result.clean_text
+    score = 1.0 if has_body else 0.0
+    _score("preprocess.email_single", score)
+    assert has_body, "Body of single message not found in clean_text"
+
+
+# ── 1.9 — Email: heavy HTML (table layout) ───────────────────────────
+
+def test_email_heavy_html(sample_heavy_html_email):
+    result = preprocess("email_html", sample_heavy_html_email)
+    has_no_tags = "<" not in result.clean_text
+    has_content = len(result.clean_text) > 30
+    # CSS properties should not appear in clean text
+    no_css = "border-collapse" not in result.clean_text and "font-size" not in result.clean_text
+    score = 1.0 if (has_no_tags and has_content and no_css) else 0.0
+    _score("preprocess.email_heavy_html", score,
+           f"no_tags={has_no_tags}, has_content={has_content}, no_css={no_css}")
+    assert has_no_tags, "HTML tags found in clean_text"
+    assert has_content, "clean_text is empty"
+    assert no_css, "CSS properties leaked into clean_text"
+
+
+# ── 1.10 — Fallback: unknown file type ───────────────────────────────
+
+def test_fallback_unknown_content():
+    raw = "random text content without any structure\nline two\nline three"
+    result = preprocess("unknown", raw)
+    has_text = len(result.clean_text) > 0
+    score = 1.0 if has_text else 0.0
+    _score("preprocess.fallback", score)
+    assert has_text, "fallback handler returned empty clean_text"
+    assert result.content_type == "unknown"
+
+
+# ── Regressions (no Langfuse score) ──────────────────────────────────
+
+def test_detect_generic_html_with_mailto_link():
+    # "mailto:" contains "to:" but is not an email header
+    page = '<html><body><main><a href="mailto:hi@example.com">Contact</a></main></body></html>'
+    ct = detect_content_type("contact.html", page)
+    assert ct == "generic_html", f"Expected 'generic_html', got '{ct}'"
+
+
+def test_email_metadata_values_have_no_markup(sample_email_html):
+    result = preprocess("email_html", sample_email_html)
+    assert result.metadata.get("from") == "boss@company.com"
+    assert not any("<" in value for value in result.metadata.values())
+
+
+def test_plain_text_keeps_angle_brackets():
+    raw = "if x<y and z>w then stop"
+    result = preprocess("plain_text", raw)
+    assert result.clean_text == raw