fix(scouts): fetch single Gmail message instead of bulk in fetch_content
Replace the bulk GmailClient.fetch_messages() call + linear search with a direct service.users().messages().get(format="full") call. Add the _extract_plain_text_body helper for recursive MIME part walking. Update the test to patch _get_gmail_service. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -16,12 +16,35 @@ from datetime import datetime, timezone
|
|||||||
|
|
||||||
from app.config.settings import settings
|
from app.config.settings import settings
|
||||||
from app.integrations import decrypt_token
|
from app.integrations import decrypt_token
|
||||||
from app.integrations.gmail import GmailClient
|
|
||||||
from app.scouts.connectors.base import ItemContent, ItemMetadata, ItemRef
|
from app.scouts.connectors.base import ItemContent, ItemMetadata, ItemRef
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_plain_text_body(payload: dict) -> str:
    """Recursively walk a Gmail message payload to find text/plain content.

    The whole MIME tree is searched for a ``text/plain`` part first; only
    when none yields any text is the first ``text/html`` part used, with its
    tags stripped rudimentarily and HTML entities unescaped.

    Args:
        payload: The ``payload`` dict of a Gmail message fetched with
            ``format="full"`` (keys read here: ``mimeType``, ``body.data``,
            ``parts``).

    Returns:
        The decoded body text, or ``""`` when no usable part is found.
    """
    import base64
    import html
    import re

    def _decode(part: dict) -> str:
        # body.data is base64url text; pad to a multiple of four so the
        # decoder accepts it whether or not the padding was stripped.
        data = (part.get("body") or {}).get("data") or ""
        if not data:
            return ""
        padded = data + "=" * (-len(data) % 4)
        return base64.urlsafe_b64decode(padded).decode("utf-8", errors="replace")

    def _find(part: dict, wanted: str) -> str:
        # Depth-first search for the first part of type `wanted` with content.
        # MIME types are case-insensitive, so compare in lower case.
        mime_type = (part.get("mimeType") or "").lower()
        if mime_type == wanted:
            return _decode(part)
        if mime_type.startswith("multipart/"):
            for child in part.get("parts") or []:
                found = _find(child, wanted)
                if found:
                    return found
        return ""

    plain = _find(payload, "text/plain")
    if plain:
        return plain

    # text/html fallback: strip tags rudimentarily if no text/plain part.
    # Unescape only after stripping so "&lt;b&gt;" is not mistaken for a tag.
    markup = _find(payload, "text/html")
    if markup:
        return html.unescape(re.sub(r"<[^>]+>", " ", markup))
    return ""
|
||||||
|
|
||||||
|
|
||||||
def _get_gmail_service(scout):
|
def _get_gmail_service(scout):
|
||||||
"""Return a synchronous Google API client for low-level metadata/history calls."""
|
"""Return a synchronous Google API client for low-level metadata/history calls."""
|
||||||
from googleapiclient.discovery import build
|
from googleapiclient.discovery import build
|
||||||
@@ -118,32 +141,27 @@ class GmailConnector:
|
|||||||
# ── fetch_content ─────────────────────────────────────────────────────
|
# ── fetch_content ─────────────────────────────────────────────────────
|
||||||
|
|
||||||
async def fetch_content(self, scout, ref: ItemRef) -> ItemContent:
    """Fetch full body text for a single message — transient, must not be persisted.

    Issues one ``users().messages().get(format="full")`` request for
    ``ref.source_msg_ref`` and runs it on a worker thread via
    ``asyncio.to_thread`` because the Google API client call is synchronous.

    Args:
        scout: Passed through to ``_get_gmail_service`` to build the client.
        ref: Reference whose ``source_msg_ref`` is the Gmail message id.

    Returns:
        An ``ItemContent`` carrying subject/sender/snippet/received_at
        metadata, the decoded body text, and the headers as a
        name -> value dict.
    """
    from datetime import datetime, timezone

    def _sync() -> ItemContent:
        service = _get_gmail_service(scout)
        msg = service.users().messages().get(
            userId="me", id=ref.source_msg_ref, format="full",
        ).execute()
        payload = msg.get("payload") or {}
        headers = {h["name"]: h["value"] for h in payload.get("headers", [])}
        # Header names are case-insensitive, so look Subject/From up through a
        # lower-cased view while keeping raw_headers exactly as received.
        by_lower = {name.lower(): value for name, value in headers.items()}

        # internalDate is documented as epoch milliseconds; the previous
        # implementation populated received_at, so keep doing so when present.
        # NOTE(review): assumes ItemMetadata.received_at accepts an aware
        # datetime, as the earlier GmailClient-based code passed — confirm.
        received_at = None
        internal_date = msg.get("internalDate")
        if internal_date is not None:
            try:
                received_at = datetime.fromtimestamp(
                    int(internal_date) / 1000, tz=timezone.utc
                )
            except (TypeError, ValueError, OverflowError, OSError):
                received_at = None

        return ItemContent(
            metadata=ItemMetadata(
                subject=by_lower.get("subject"),
                sender=by_lower.get("from"),
                snippet=msg.get("snippet"),
                received_at=received_at,
            ),
            body_text=_extract_plain_text_body(payload),
            raw_headers=headers,
        )

    return await asyncio.to_thread(_sync)
|
|
||||||
|
|
||||||
# ── archive ───────────────────────────────────────────────────────────
|
# ── archive ───────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|||||||
@@ -3,8 +3,7 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import uuid
|
import uuid
|
||||||
from datetime import datetime, timezone
|
from unittest.mock import MagicMock, patch
|
||||||
from unittest.mock import AsyncMock, MagicMock, patch
|
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
@@ -53,16 +52,24 @@ async def test_fetch_metadata_returns_subject_and_snippet():
|
|||||||
|
|
||||||
@pytest.mark.asyncio
async def test_fetch_content_returns_body_text():
    """fetch_content requests only the referenced message and decodes its body."""
    import base64

    scout = _make_scout()
    conn = GmailConnector()
    body_data = base64.urlsafe_b64encode(b"hello world").decode()
    fake_message = {
        "id": "msg-1",
        "snippet": "hello world",
        "payload": {
            "mimeType": "text/plain",
            "headers": [
                {"name": "Subject", "value": "S"},
                {"name": "From", "value": "a@b"},
            ],
            "body": {"data": body_data},
        },
    }
    with patch("app.scouts.connectors.gmail._get_gmail_service") as mock_svc:
        # Wire the chain through .return_value so the setup itself records no
        # calls and the assertion below sees only what fetch_content did.
        get_call = mock_svc.return_value.users.return_value.messages.return_value.get
        get_call.return_value.execute.return_value = fake_message
        content = await conn.fetch_content(scout, ItemRef(source_msg_ref="msg-1"))

    # The point of the change: one direct get for this id, not a bulk fetch.
    get_call.assert_called_once_with(userId="me", id="msg-1", format="full")
    assert content.body_text == "hello world"
    assert content.metadata.subject == "S"
    assert content.metadata.sender == "a@b"
    assert content.metadata.snippet == "hello world"
|
||||||
|
|||||||
Reference in New Issue
Block a user