- app/core/preprocessors/__init__.py: detect_content_type + preprocess dispatcher - app/core/preprocessors/base.py: PreprocessResult dataclass - app/core/preprocessors/email_html.py: BeautifulSoup HTML stripping, metadata extraction, thread splitting - requirements.txt: add beautifulsoup4 and lxml - tests/test_preprocessors.py: 10 tests with Langfuse scoring (preprocess.* scores) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
26 lines
654 B
Python
26 lines
654 B
Python
"""Base types for the preprocessor system."""
|
|
|
|
from __future__ import annotations
|
|
|
|
from dataclasses import dataclass, field
|
|
|
|
|
|
@dataclass
class PreprocessResult:
    """Value object returned by a preprocessor handler.

    Attributes
    ----------
    content_type:
        Label of the content type that was detected, for example
        ``"email_html"`` or ``"plain_text"``.
    clean_text:
        The readable text that remains once markup/binary noise has been
        stripped away.
    metadata:
        Mapping of metadata extracted by the handler; which keys are present
        depends on the handler. Keys commonly seen: ``subject``, ``from``,
        ``to``, ``date``, ``filename``. When omitted, each instance gets its
        own new empty dict.
    """

    # Required fields: no defaults, so they must be supplied on construction.
    content_type: str
    clean_text: str

    # default_factory gives every instance a separate dict rather than one
    # shared mutable default.
    metadata: dict = field(default_factory=dict)
|