diff --git a/app/core/folder_indexer.py b/app/core/folder_indexer.py
index f7f863a..4a070db 100644
--- a/app/core/folder_indexer.py
+++ b/app/core/folder_indexer.py
@@ -1,9 +1,17 @@
 """Per-file summarisation for project folder integration."""
 from __future__ import annotations
 
+import asyncio
+import base64
+import io
+import logging
+import zipfile
 from dataclasses import dataclass
 
+from docx import Document as DocxDocument
 from langchain_core.messages import HumanMessage, SystemMessage
+from pypdf import PdfReader
+from pypdf.errors import PdfReadError
 
 from app.core.langfuse_client import (
     compile_prompt,
@@ -98,3 +106,97 @@ async def summarize_text(*, content: str, ext: str, name: str) -> IndexResult:
     usage = extract_usage(response)
     summary = (response.content or "").strip()[:500]
     return IndexResult(summary=summary, tokens_used=usage.get("total", 0))
+
+
+_logger = logging.getLogger(__name__)
+
+# Summary recorded for files that yield no usable text.
+_NO_TEXT_SUMMARY = "Could not extract text"
+
+
+def _extract_pdf_text(pdf_b64: str) -> str:
+    """Return the concatenated page text of a base64-encoded PDF.
+
+    Best effort: an undecodable payload or an unreadable (corrupt or
+    password-protected) document yields ``""``, and a page whose text
+    extraction fails is skipped, so one bad file cannot abort indexing.
+    """
+    try:
+        reader = PdfReader(io.BytesIO(base64.b64decode(pdf_b64)))
+        pages = list(reader.pages)
+    except (ValueError, PdfReadError) as exc:
+        # ValueError covers binascii.Error raised for malformed base64.
+        _logger.warning("Unreadable PDF payload: %s", exc)
+        return ""
+
+    parts: list[str] = []
+    for number, page in enumerate(pages, start=1):
+        try:
+            parts.append(page.extract_text() or "")
+        except Exception:  # pypdf raises assorted errors on damaged pages
+            _logger.warning("Skipping PDF page %d", number, exc_info=True)
+    return "\n".join(parts).strip()
+
+
+def _extract_docx_text(docx_b64: str) -> str:
+    """Return the non-empty paragraph text of a base64-encoded DOCX.
+
+    An undecodable payload or a file that is not a readable Word package
+    yields ``""`` instead of raising.
+    """
+    try:
+        doc = DocxDocument(io.BytesIO(base64.b64decode(docx_b64)))
+    except (ValueError, KeyError, zipfile.BadZipFile) as exc:
+        # ValueError: malformed base64 or a non-Word package;
+        # KeyError / BadZipFile: not a well-formed zip container.
+        _logger.warning("Unreadable DOCX payload: %s", exc)
+        return ""
+    return "\n".join(p.text for p in doc.paragraphs if p.text).strip()
+
+
+async def summarize_pdf(*, pdf_b64: str, name: str) -> IndexResult:
+    """Return a compact summary of a PDF file.
+
+    Parameters
+    ----------
+    pdf_b64:
+        Base64-encoded PDF bytes.
+    name:
+        File name, e.g. ``"report.pdf"``.
+
+    Returns
+    -------
+    IndexResult
+        The summary from ``summarize_text``, or a placeholder with
+        ``tokens_used=0`` when no text could be extracted.
+    """
+    # Parsing is synchronous and CPU-bound; keep it off the event loop.
+    loop = asyncio.get_running_loop()
+    text = await loop.run_in_executor(None, _extract_pdf_text, pdf_b64)
+    if not text:
+        return IndexResult(summary=_NO_TEXT_SUMMARY, tokens_used=0)
+    return await summarize_text(content=text, ext=".pdf", name=name)
+
+
+async def summarize_docx(*, docx_b64: str, name: str) -> IndexResult:
+    """Return a compact summary of a DOCX file.
+
+    Parameters
+    ----------
+    docx_b64:
+        Base64-encoded DOCX bytes.
+    name:
+        File name, e.g. ``"spec.docx"``.
+
+    Returns
+    -------
+    IndexResult
+        The summary from ``summarize_text``, or a placeholder with
+        ``tokens_used=0`` when no text could be extracted.
+    """
+    # Parsing is synchronous and CPU-bound; keep it off the event loop.
+    loop = asyncio.get_running_loop()
+    text = await loop.run_in_executor(None, _extract_docx_text, docx_b64)
+    if not text:
+        return IndexResult(summary=_NO_TEXT_SUMMARY, tokens_used=0)
+    return await summarize_text(content=text, ext=".docx", name=name)
diff --git a/requirements.txt b/requirements.txt
index 6934c7c..9c4c895 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -39,3 +39,5 @@ lxml>=5.0.0
 PyYAML>=6.0.0
 apscheduler>=3.10.0
 ruff>=0.8.0
+pypdf>=4.0
+python-docx>=1.1
diff --git a/tests/test_folder_indexer.py b/tests/test_folder_indexer.py
index ae0f6aa..e3bdb22 100644
--- a/tests/test_folder_indexer.py
+++ b/tests/test_folder_indexer.py
@@ -51,3 +51,90 @@ async def test_summarize_image_uses_vision_content_blocks():
         isinstance(p, dict) and p.get("type") == "image_url"
         for p in (last.content if isinstance(last.content, list) else [])
     )
+
+
+async def test_summarize_pdf_extracts_then_summarizes(monkeypatch):
+    from app.core import folder_indexer
+
+    # Stand in for pypdf.PdfReader: two pages with extractable text.
+    class FakePage:
+        def extract_text(self):
+            return "PDF page content with project info."
+
+    class FakeReader:
+        pages = [FakePage(), FakePage()]
+
+    monkeypatch.setattr(folder_indexer, "PdfReader", lambda buf: FakeReader())
+    mock_resp = AsyncMock()
+    mock_resp.content = "Project info doc."
+    mock_resp.usage_metadata = {"total_tokens": 50}
+
+    async def fake_llm(messages):
+        return mock_resp
+
+    with patch("app.core.folder_indexer._llm_text", new=fake_llm):
+        result = await folder_indexer.summarize_pdf(
+            pdf_b64="SGVsbG8=", name="doc.pdf"
+        )
+    assert "Project info" in result.summary
+    assert result.tokens_used == 50
+
+
+async def test_summarize_pdf_without_text_skips_llm(monkeypatch):
+    from app.core import folder_indexer
+
+    class BlankPage:
+        def extract_text(self):
+            return ""
+
+    class FakeReader:
+        pages = [BlankPage()]
+
+    async def fail_llm(messages):
+        raise AssertionError("LLM must not be called for empty documents")
+
+    monkeypatch.setattr(folder_indexer, "PdfReader", lambda buf: FakeReader())
+    with patch("app.core.folder_indexer._llm_text", new=fail_llm):
+        result = await folder_indexer.summarize_pdf(
+            pdf_b64="SGVsbG8=", name="scan.pdf"
+        )
+    assert result.summary == "Could not extract text"
+    assert result.tokens_used == 0
+
+
+async def test_summarize_pdf_unreadable_file_returns_placeholder(monkeypatch):
+    from app.core import folder_indexer
+
+    def broken_reader(buf):
+        raise folder_indexer.PdfReadError("EOF marker not found")
+
+    monkeypatch.setattr(folder_indexer, "PdfReader", broken_reader)
+    result = await folder_indexer.summarize_pdf(pdf_b64="SGVsbG8=", name="bad.pdf")
+    assert result.summary == "Could not extract text"
+    assert result.tokens_used == 0
+
+
+async def test_summarize_docx_extracts_then_summarizes(monkeypatch):
+    from app.core import folder_indexer
+
+    class FakePara:
+        def __init__(self, text):
+            self.text = text
+
+    class FakeDoc:
+        paragraphs = [FakePara("Heading"), FakePara("Body paragraph one.")]
+
+    monkeypatch.setattr(folder_indexer, "DocxDocument", lambda buf: FakeDoc())
+    mock_resp = AsyncMock()
+    mock_resp.content = "Heading and body."
+    mock_resp.usage_metadata = {"total_tokens": 30}
+
+    async def fake_llm(messages):
+        return mock_resp
+
+    with patch("app.core.folder_indexer._llm_text", new=fake_llm):
+        result = await folder_indexer.summarize_docx(
+            docx_b64="UEsDBBQ=", name="doc.docx"
+        )
+    assert result.summary == "Heading and body."
+    assert result.tokens_used == 30