Add pypdf/python-docx deps, _extract_pdf_text/_extract_docx_text helpers, and summarize_pdf/summarize_docx wrappers that delegate to summarize_text. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
84 lines
3.5 KiB
Python
84 lines
3.5 KiB
Python
"""Folder indexer LLM helpers."""
|
|
from __future__ import annotations
|
|
|
|
from unittest.mock import AsyncMock, patch
|
|
|
|
import pytest
|
|
|
|
from app.core.folder_indexer import summarize_text, summarize_image, IndexResult
|
|
|
|
pytestmark = pytest.mark.asyncio
|
|
|
|
|
|
async def test_summarize_text_returns_summary_and_tokens():
    """Check that summarize_text wraps the mocked LLM reply in an IndexResult.

    The summary must equal the mocked reply content and tokens_used must
    equal the ``total_tokens`` entry of the mocked usage metadata.
    """
    expected_summary = "Kickoff notes covering scope and deadlines."

    llm_reply = AsyncMock()
    llm_reply.content = expected_summary
    llm_reply.usage_metadata = {
        "input_tokens": 320,
        "output_tokens": 18,
        "total_tokens": 338,
    }
    stub_llm = AsyncMock(return_value=llm_reply)

    # Swap out the text LLM entry point so no real model call happens.
    with patch("app.core.folder_indexer._llm_text", new=stub_llm):
        outcome = await summarize_text(
            content="hello world", ext=".md", name="kickoff.md"
        )

    assert isinstance(outcome, IndexResult)
    assert outcome.summary == expected_summary
    assert outcome.tokens_used == 338
|
|
|
|
|
|
async def test_summarize_text_truncates_summary_at_500_chars():
    """Check that a 1000-character LLM reply yields a summary of at most 500 chars."""
    # Mock constructor kwargs set the same attributes as plain assignment would.
    oversized_reply = AsyncMock(
        content="x" * 1000,
        usage_metadata={"total_tokens": 100},
    )
    stub_llm = AsyncMock(return_value=oversized_reply)

    with patch("app.core.folder_indexer._llm_text", new=stub_llm):
        outcome = await summarize_text(content="x", ext=".md", name="x.md")

    summary_length = len(outcome.summary)
    assert summary_length <= 500
|
|
|
|
|
|
async def test_summarize_image_uses_vision_content_blocks():
    """Check that summarize_image sends an image content block to the vision LLM.

    The vision entry point is replaced by a recorder; the test then verifies
    the returned summary and token count, and that the last message handed to
    the recorder has a list ``content`` containing an ``image_url`` block.
    """
    vision_reply = AsyncMock()
    vision_reply.content = "Final logo on white background."
    vision_reply.usage_metadata = {"total_tokens": 500}

    seen = {}

    async def recording_vision(messages):
        # Keep the messages so they can be inspected after the call.
        seen["messages"] = messages
        return vision_reply

    with patch("app.core.folder_indexer._llm_vision", new=recording_vision):
        outcome = await summarize_image(image_b64="iVBORw0KG", mime="image/png")

    assert "Final logo" in outcome.summary
    assert outcome.tokens_used == 500

    # last message contains an image content block
    final_message = seen["messages"][-1]
    if isinstance(final_message.content, list):
        parts = final_message.content
    else:
        parts = []
    has_image_block = any(
        isinstance(part, dict) and part.get("type") == "image_url"
        for part in parts
    )
    assert has_image_block
|
|
|
|
|
|
async def test_summarize_pdf_extracts_then_summarizes(monkeypatch):
    """Check summarize_pdf with a stubbed PdfReader and a stubbed text LLM.

    ``folder_indexer.PdfReader`` is replaced by a factory returning a reader
    with two fake pages, and ``_llm_text`` by a coroutine returning a canned
    reply; the result must carry that reply's summary and token count.
    """
    from app.core import folder_indexer

    # Stand-in for pypdf.PdfReader: each page returns fixed text.
    class StubPage:
        def extract_text(self):
            return "PDF page content with project info."

    class StubReader:
        pages = [StubPage(), StubPage()]

    def make_reader(buf):
        return StubReader()

    monkeypatch.setattr(folder_indexer, "PdfReader", make_reader)

    llm_reply = AsyncMock()
    llm_reply.content = "Project info doc."
    llm_reply.usage_metadata = {"total_tokens": 50}

    async def stub_llm(messages):
        return llm_reply

    with patch("app.core.folder_indexer._llm_text", new=stub_llm):
        outcome = await folder_indexer.summarize_pdf(
            pdf_b64="SGVsbG8=", name="doc.pdf"
        )

    assert "Project info" in outcome.summary
    assert outcome.tokens_used == 50
|
|
|
|
|
|
async def test_summarize_docx_extracts_then_summarizes(monkeypatch):
    """Check summarize_docx with a stubbed DocxDocument and a stubbed text LLM.

    ``folder_indexer.DocxDocument`` is replaced by a factory returning a
    document with two fake paragraphs, and ``_llm_text`` by a coroutine
    returning a canned reply. The result must carry that reply's summary and,
    mirroring the PDF test, its ``total_tokens`` value as ``tokens_used``.
    """
    from app.core import folder_indexer

    class FakePara:
        def __init__(self, t):
            self.text = t

    class FakeDoc:
        paragraphs = [FakePara("Heading"), FakePara("Body paragraph one.")]

    monkeypatch.setattr(folder_indexer, "DocxDocument", lambda buf: FakeDoc())

    mock_resp = AsyncMock()
    mock_resp.content = "Heading and body."
    mock_resp.usage_metadata = {"total_tokens": 30}

    async def fake_llm(messages):
        return mock_resp

    with patch("app.core.folder_indexer._llm_text", new=fake_llm):
        result = await folder_indexer.summarize_docx(
            docx_b64="UEsDBBQ=", name="doc.docx"
        )

    assert result.summary == "Heading and body."
    # The mocked usage metadata was previously set but never verified.
    assert result.tokens_used == 30
|