fix(scouts): fetch single Gmail message instead of bulk in fetch_content

Replace bulk GmailClient.fetch_messages() + linear search with a direct
service.users().messages().get(format="full") call. Adds _extract_plain_text_body
helper for recursive MIME part walking. Update test to patch _get_gmail_service.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Roberto
2026-05-16 05:39:39 +02:00
parent 11b31e5814
commit 0833db239c
2 changed files with 60 additions and 35 deletions

View File

@@ -16,12 +16,35 @@ from datetime import datetime, timezone
from app.config.settings import settings
from app.integrations import decrypt_token
from app.integrations.gmail import GmailClient
from app.scouts.connectors.base import ItemContent, ItemMetadata, ItemRef
logger = logging.getLogger(__name__)
def _extract_plain_text_body(payload: dict) -> str:
"""Recursively walk a Gmail message payload to find text/plain content."""
import base64
mime_type = payload.get("mimeType", "")
if mime_type == "text/plain":
data = payload.get("body", {}).get("data", "")
if data:
return base64.urlsafe_b64decode(data + "==").decode("utf-8", errors="replace")
return ""
if mime_type.startswith("multipart/"):
for part in payload.get("parts", []):
text = _extract_plain_text_body(part)
if text:
return text
# text/html fallback: strip tags rudimentarily if no text/plain part
if mime_type == "text/html":
data = payload.get("body", {}).get("data", "")
if data:
import re
html = base64.urlsafe_b64decode(data + "==").decode("utf-8", errors="replace")
return re.sub(r"<[^>]+>", " ", html)
return ""
def _get_gmail_service(scout):
"""Return a synchronous Google API client for low-level metadata/history calls."""
from googleapiclient.discovery import build
@@ -118,32 +141,27 @@ class GmailConnector:
# ── fetch_content ─────────────────────────────────────────────────────
async def fetch_content(self, scout, ref: ItemRef) -> ItemContent:
"""Fetch full body text via GmailClient — transient, must not be persisted."""
creds_info = decrypt_token(scout.oauth_token_encrypted)
client = GmailClient(creds_info)
# fetch_messages returns EmailMessage dataclasses with body_text already
# extracted and decoded. We pass an empty filter to avoid narrowing by
# date — callers should only invoke fetch_content for known-new messages.
messages = await client.fetch_messages(filter_config=None, since=None)
"""Fetch full body text for a single message — transient, must not be persisted."""
# Pick the message matching our ref (or fall back to first if only one returned).
email_msg = next(
(m for m in messages if m.id == ref.source_msg_ref),
messages[0] if messages else None,
)
if email_msg is None:
raise ValueError(f"Message {ref.source_msg_ref!r} not found via GmailClient")
def _sync() -> ItemContent:
service = _get_gmail_service(scout)
msg = service.users().messages().get(
userId="me", id=ref.source_msg_ref, format="full",
).execute()
headers = {h["name"]: h["value"] for h in msg.get("payload", {}).get("headers", [])}
body_text = _extract_plain_text_body(msg.get("payload", {}))
return ItemContent(
metadata=ItemMetadata(
subject=headers.get("Subject"),
sender=headers.get("From"),
snippet=msg.get("snippet"),
received_at=None,
),
body_text=body_text,
raw_headers=headers,
)
return ItemContent(
metadata=ItemMetadata(
subject=email_msg.subject,
sender=email_msg.sender,
snippet=None,
received_at=email_msg.date,
),
body_text=email_msg.body_text,
raw_headers={},
)
return await asyncio.to_thread(_sync)
# ── archive ───────────────────────────────────────────────────────────