feat(api): PDF + DOCX extraction in folder indexer
Add pypdf/python-docx deps, _extract_pdf_text/_extract_docx_text helpers, and summarize_pdf/summarize_docx wrappers that delegate to summarize_text. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,9 +1,13 @@
|
||||
"""Per-file summarisation for project folder integration."""
|
||||
from __future__ import annotations
|
||||
|
||||
import base64
|
||||
import io
|
||||
from dataclasses import dataclass
|
||||
|
||||
from langchain_core.messages import HumanMessage, SystemMessage
|
||||
from pypdf import PdfReader
|
||||
from docx import Document as DocxDocument
|
||||
|
||||
from app.core.langfuse_client import (
|
||||
compile_prompt,
|
||||
@@ -98,3 +102,53 @@ async def summarize_text(*, content: str, ext: str, name: str) -> IndexResult:
|
||||
usage = extract_usage(response)
|
||||
summary = (response.content or "").strip()[:500]
|
||||
return IndexResult(summary=summary, tokens_used=usage.get("total", 0))
|
||||
|
||||
|
||||
def _extract_pdf_text(pdf_b64: str) -> str:
|
||||
buf = io.BytesIO(base64.b64decode(pdf_b64))
|
||||
reader = PdfReader(buf)
|
||||
parts: list[str] = []
|
||||
for page in reader.pages:
|
||||
try:
|
||||
parts.append(page.extract_text() or "")
|
||||
except Exception:
|
||||
continue
|
||||
return "\n".join(parts).strip()
|
||||
|
||||
|
||||
def _extract_docx_text(docx_b64: str) -> str:
|
||||
buf = io.BytesIO(base64.b64decode(docx_b64))
|
||||
doc = DocxDocument(buf)
|
||||
return "\n".join(p.text for p in doc.paragraphs if p.text).strip()
|
||||
|
||||
|
||||
async def summarize_pdf(*, pdf_b64: str, name: str) -> IndexResult:
    """Summarise a PDF by extracting its text and delegating to ``summarize_text``.

    Parameters
    ----------
    pdf_b64:
        Base64-encoded PDF bytes.
    name:
        File name, e.g. ``"report.pdf"``.

    Returns
    -------
    IndexResult
        The result of ``summarize_text`` called with ``ext=".pdf"``. When
        no text could be extracted, a placeholder result with zero tokens
        used is returned and ``summarize_text`` is not called.
    """
    extracted = _extract_pdf_text(pdf_b64)
    if extracted:
        return await summarize_text(content=extracted, ext=".pdf", name=name)
    # Nothing to summarise: skip the summariser call entirely.
    return IndexResult(summary="Could not extract text", tokens_used=0)
|
||||
|
||||
|
||||
async def summarize_docx(*, docx_b64: str, name: str) -> IndexResult:
    """Summarise a DOCX by extracting its text and delegating to ``summarize_text``.

    Parameters
    ----------
    docx_b64:
        Base64-encoded DOCX bytes.
    name:
        File name, e.g. ``"spec.docx"``.

    Returns
    -------
    IndexResult
        The result of ``summarize_text`` called with ``ext=".docx"``. When
        no text could be extracted, a placeholder result with zero tokens
        used is returned and ``summarize_text`` is not called.
    """
    extracted = _extract_docx_text(docx_b64)
    if extracted:
        return await summarize_text(content=extracted, ext=".docx", name=name)
    # Nothing to summarise: skip the summariser call entirely.
    return IndexResult(summary="Could not extract text", tokens_used=0)
|
||||
|
||||
@@ -39,3 +39,5 @@ lxml>=5.0.0
|
||||
PyYAML>=6.0.0
|
||||
apscheduler>=3.10.0
|
||||
ruff>=0.8.0
|
||||
pypdf>=4.0
|
||||
python-docx>=1.1
|
||||
|
||||
@@ -51,3 +51,33 @@ async def test_summarize_image_uses_vision_content_blocks():
|
||||
isinstance(p, dict) and p.get("type") == "image_url"
|
||||
for p in (last.content if isinstance(last.content, list) else [])
|
||||
)
|
||||
|
||||
|
||||
async def test_summarize_pdf_extracts_then_summarizes(monkeypatch):
    """summarize_pdf should summarise the text obtained from the PDF reader's pages."""
    from app.core import folder_indexer

    class FakePage:
        def extract_text(self):
            return "PDF page content with project info."

    class FakeReader:
        pages = [FakePage(), FakePage()]

    # Stand-in for pypdf.PdfReader: hands back two pages with fixed text.
    monkeypatch.setattr(folder_indexer, "PdfReader", lambda buf: FakeReader())

    canned_reply = AsyncMock()
    canned_reply.content = "Project info doc."
    canned_reply.usage_metadata = {"total_tokens": 50}

    async def fake_llm(messages):
        return canned_reply

    with patch("app.core.folder_indexer._llm_text", new=fake_llm):
        result = await folder_indexer.summarize_pdf(
            pdf_b64="SGVsbG8=",
            name="doc.pdf",
        )

    assert "Project info" in result.summary
    assert result.tokens_used == 50
|
||||
|
||||
|
||||
async def test_summarize_docx_extracts_then_summarizes(monkeypatch):
    """summarize_docx should summarise the text obtained from the document's paragraphs."""
    from app.core import folder_indexer

    class FakePara:
        def __init__(self, t):
            self.text = t

    class FakeDoc:
        paragraphs = [FakePara("Heading"), FakePara("Body paragraph one.")]

    # Stand-in for the python-docx loader: hands back two fixed paragraphs.
    monkeypatch.setattr(folder_indexer, "DocxDocument", lambda buf: FakeDoc())

    canned_reply = AsyncMock()
    canned_reply.content = "Heading and body."
    canned_reply.usage_metadata = {"total_tokens": 30}

    async def fake_llm(messages):
        return canned_reply

    with patch("app.core.folder_indexer._llm_text", new=fake_llm):
        result = await folder_indexer.summarize_docx(
            docx_b64="UEsDBBQ=",
            name="doc.docx",
        )

    assert result.summary == "Heading and body."
|
||||
|
||||
Reference in New Issue
Block a user