# api/app/core/preprocessors/base.py

"""Base types for the preprocessor system."""

from __future__ import annotations

from dataclasses import dataclass, field


@dataclass
class PreprocessResult:
    """Value object returned by a preprocessor handler.

    Bundles the detected content type, the cleaned-up text and any
    metadata the handler extracted along the way.

    Attributes
    ----------
    content_type:
        Label for the kind of content that was detected, for example
        ``"email_html"`` or ``"plain_text"``.
    clean_text:
        The readable text that remains once markup and binary noise
        have been removed.
    metadata:
        Extracted metadata as a dict; which keys are present depends on
        the handler.  Keys commonly seen include ``subject``, ``from``,
        ``to``, ``date`` and ``filename``.  Defaults to an empty dict.
    """

    content_type: str
    clean_text: str
    # A factory (rather than a literal ``{}``) gives every instance its own
    # dict, so mutating one result's metadata never leaks into another.
    metadata: dict = field(default_factory=dict)