1 Commits

Author SHA1 Message Date
Roberto
0833db239c fix(scouts): fetch single Gmail message instead of bulk in fetch_content
Replace bulk GmailClient.fetch_messages() + linear search with a direct
service.users().messages().get(format="full") call. Adds _extract_plain_text_body
helper for recursive MIME part walking. Update test to patch _get_gmail_service.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-16 05:39:39 +02:00
2 changed files with 60 additions and 35 deletions

View File

@@ -16,12 +16,35 @@ from datetime import datetime, timezone
from app.config.settings import settings
from app.integrations import decrypt_token
from app.integrations.gmail import GmailClient
from app.scouts.connectors.base import ItemContent, ItemMetadata, ItemRef
logger = logging.getLogger(__name__)
def _extract_plain_text_body(payload: dict) -> str:
"""Recursively walk a Gmail message payload to find text/plain content."""
import base64
mime_type = payload.get("mimeType", "")
if mime_type == "text/plain":
data = payload.get("body", {}).get("data", "")
if data:
return base64.urlsafe_b64decode(data + "==").decode("utf-8", errors="replace")
return ""
if mime_type.startswith("multipart/"):
for part in payload.get("parts", []):
text = _extract_plain_text_body(part)
if text:
return text
# text/html fallback: strip tags rudimentarily if no text/plain part
if mime_type == "text/html":
data = payload.get("body", {}).get("data", "")
if data:
import re
html = base64.urlsafe_b64decode(data + "==").decode("utf-8", errors="replace")
return re.sub(r"<[^>]+>", " ", html)
return ""
def _get_gmail_service(scout):
"""Return a synchronous Google API client for low-level metadata/history calls."""
from googleapiclient.discovery import build
@@ -118,32 +141,27 @@ class GmailConnector:
# ── fetch_content ─────────────────────────────────────────────────────
async def fetch_content(self, scout, ref: ItemRef) -> ItemContent:
"""Fetch full body text via GmailClient — transient, must not be persisted."""
creds_info = decrypt_token(scout.oauth_token_encrypted)
client = GmailClient(creds_info)
# fetch_messages returns EmailMessage dataclasses with body_text already
# extracted and decoded. We pass an empty filter to avoid narrowing by
# date — callers should only invoke fetch_content for known-new messages.
messages = await client.fetch_messages(filter_config=None, since=None)
"""Fetch full body text for a single message — transient, must not be persisted."""
# Pick the message matching our ref (or fall back to first if only one returned).
email_msg = next(
(m for m in messages if m.id == ref.source_msg_ref),
messages[0] if messages else None,
)
if email_msg is None:
raise ValueError(f"Message {ref.source_msg_ref!r} not found via GmailClient")
def _sync() -> ItemContent:
service = _get_gmail_service(scout)
msg = service.users().messages().get(
userId="me", id=ref.source_msg_ref, format="full",
).execute()
headers = {h["name"]: h["value"] for h in msg.get("payload", {}).get("headers", [])}
body_text = _extract_plain_text_body(msg.get("payload", {}))
return ItemContent(
metadata=ItemMetadata(
subject=headers.get("Subject"),
sender=headers.get("From"),
snippet=msg.get("snippet"),
received_at=None,
),
body_text=body_text,
raw_headers=headers,
)
return ItemContent(
metadata=ItemMetadata(
subject=email_msg.subject,
sender=email_msg.sender,
snippet=None,
received_at=email_msg.date,
),
body_text=email_msg.body_text,
raw_headers={},
)
return await asyncio.to_thread(_sync)
# ── archive ───────────────────────────────────────────────────────────

View File

@@ -3,8 +3,7 @@
from __future__ import annotations
import uuid
from datetime import datetime, timezone
from unittest.mock import AsyncMock, MagicMock, patch
from unittest.mock import MagicMock, patch
import pytest
@@ -53,16 +52,24 @@ async def test_fetch_metadata_returns_subject_and_snippet():
@pytest.mark.asyncio
async def test_fetch_content_returns_body_text():
import base64
scout = _make_scout()
conn = GmailConnector()
# decrypt_token is patched because the test doesn't set OAUTH_ENCRYPTION_KEY.
with patch("app.scouts.connectors.gmail.decrypt_token", return_value={}), \
patch("app.scouts.connectors.gmail.GmailClient") as MockClient:
instance = MockClient.return_value
instance.fetch_messages = AsyncMock(return_value=[
MagicMock(id="msg-1", subject="S", sender="a@b", body_text="hello world",
date=datetime.now(tz=timezone.utc), labels=[]),
])
body_data = base64.urlsafe_b64encode(b"hello world").decode()
fake_message = {
"id": "msg-1",
"snippet": "hello world",
"payload": {
"mimeType": "text/plain",
"headers": [
{"name": "Subject", "value": "S"},
{"name": "From", "value": "a@b"},
],
"body": {"data": body_data},
},
}
with patch("app.scouts.connectors.gmail._get_gmail_service") as mock_svc:
mock_svc.return_value.users().messages().get().execute.return_value = fake_message
content = await conn.fetch_content(scout, ItemRef(source_msg_ref="msg-1"))
assert content.body_text == "hello world"
assert content.metadata.subject == "S"