feat(api): PDF + DOCX extraction in folder indexer
Add pypdf/python-docx deps, _extract_pdf_text/_extract_docx_text helpers, and summarize_pdf/summarize_docx wrappers that delegate to summarize_text.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -51,3 +51,33 @@ async def test_summarize_image_uses_vision_content_blocks():
|
||||
isinstance(p, dict) and p.get("type") == "image_url"
|
||||
for p in (last.content if isinstance(last.content, list) else [])
|
||||
)
|
||||
|
||||
|
||||
async def test_summarize_pdf_extracts_then_summarizes(monkeypatch):
    """Check summarize_pdf surfaces the stubbed LLM summary and token count.

    ``folder_indexer.PdfReader`` is swapped for a fake exposing two pages of
    text, and ``_llm_text`` is patched with a coroutine that returns a canned
    response carrying ``content`` and ``usage_metadata``.
    """
    from app.core import folder_indexer

    class StubPage:
        def extract_text(self):
            return "PDF page content with project info."

    class StubReader:
        # Stands in for pypdf.PdfReader, which returns text from its pages.
        pages = [StubPage(), StubPage()]

    def make_reader(buf):
        # The incoming buffer is ignored; every call yields the same fake.
        return StubReader()

    monkeypatch.setattr(folder_indexer, "PdfReader", make_reader)

    llm_reply = AsyncMock()
    llm_reply.content = "Project info doc."
    llm_reply.usage_metadata = {"total_tokens": 50}

    async def fake_llm(messages):
        return llm_reply

    with patch("app.core.folder_indexer._llm_text", new=fake_llm):
        result = await folder_indexer.summarize_pdf(
            pdf_b64="SGVsbG8=", name="doc.pdf"
        )

    assert "Project info" in result.summary
    assert result.tokens_used == 50
|
||||
|
||||
|
||||
async def test_summarize_docx_extracts_then_summarizes(monkeypatch):
    """Check summarize_docx returns the stubbed LLM response as its summary.

    ``folder_indexer.DocxDocument`` is swapped for a fake exposing two
    paragraphs, and ``_llm_text`` is patched with a coroutine that returns a
    canned response.
    """
    from app.core import folder_indexer

    class StubParagraph:
        def __init__(self, t):
            self.text = t

    class StubDocument:
        # Only the ``paragraphs`` attribute is provided by this fake.
        paragraphs = [
            StubParagraph("Heading"),
            StubParagraph("Body paragraph one."),
        ]

    def make_document(buf):
        # The incoming buffer is ignored; every call yields the same fake.
        return StubDocument()

    monkeypatch.setattr(folder_indexer, "DocxDocument", make_document)

    llm_reply = AsyncMock()
    llm_reply.content = "Heading and body."
    llm_reply.usage_metadata = {"total_tokens": 30}

    async def fake_llm(messages):
        return llm_reply

    with patch("app.core.folder_indexer._llm_text", new=fake_llm):
        result = await folder_indexer.summarize_docx(
            docx_b64="UEsDBBQ=", name="doc.docx"
        )

    assert result.summary == "Heading and body."
|
||||
|
||||
Reference in New Issue
Block a user