"""Per-file summarisation for project folder integration.""" from __future__ import annotations import base64 import io from dataclasses import dataclass from langchain_core.messages import HumanMessage, SystemMessage from pypdf import PdfReader from docx import Document as DocxDocument from app.core.langfuse_client import ( compile_prompt, extract_usage, get_langfuse, get_prompt_or_fallback, ) from app.core.llm import get_llm _TEXT_FALLBACK = ( "You are summarising a file for an AI assistant that helps the user manage a project.\n" "Produce a single sentence (<=30 words, <=200 chars) that captures the file's purpose " "and most important detail.\nFile extension: {ext}\nFile name: {name}\nContent (truncated if long):\n{content}" ) _IMAGE_FALLBACK = ( "You are summarising an image attached to a project folder.\n" "Produce a single sentence (<=30 words, <=200 chars) describing what the image shows " "and any obvious purpose (logo, screenshot, diagram, photo of a whiteboard, etc.)." ) _MAX_INPUT_CHARS = 6000 @dataclass class IndexResult: summary: str tokens_used: int async def _llm_text(messages: list) -> object: """Make the LLM call for text summarisation. Defined as a standalone async function so tests can patch it cleanly without needing to mock the LLM object itself. """ llm = get_llm(model="gpt-4o-mini", temperature=0.2) return await llm.ainvoke(messages) async def _llm_vision(messages: list) -> object: """Make the LLM call for vision (image) summarisation. Accepts the message list and returns the response directly, mirroring the ``_llm_text`` caller pattern so tests can patch it at the module level. """ llm = get_llm(model="gpt-4o-mini", temperature=0.2) return await llm.ainvoke(messages) async def summarize_image(*, image_b64: str, mime: str, file_name: str | None = None) -> IndexResult: """Return a compact summary of an image file using vision. Parameters ---------- image_b64: Base64-encoded image bytes. mime: MIME type of the image, e.g. ``"image/png"``. file_name: Optional file name, attached to the Langfuse trace as input metadata. """ template, prompt_obj = get_prompt_or_fallback("folder_file_summary_image", _IMAGE_FALLBACK) messages = [ SystemMessage(content=template), HumanMessage(content=[ {"type": "text", "text": "Summarise this image."}, {"type": "image_url", "image_url": {"url": f"data:{mime};base64,{image_b64}"}}, ]), ] lf = get_langfuse() if lf is not None: with lf.start_as_current_observation( as_type="generation", name="folder-summarize-image", model="gpt-4o-mini", prompt=prompt_obj, input={"file_name": file_name, "mime": mime}, ) as gen: response = await _llm_vision(messages) usage = extract_usage(response) gen.update(output=response.content, usage_details=usage) else: response = await _llm_vision(messages) usage = extract_usage(response) summary = (response.content or "").strip()[:500] return IndexResult(summary=summary, tokens_used=usage.get("total", 0)) async def summarize_text(*, content: str, ext: str, name: str) -> IndexResult: """Return a compact summary of a text file. Parameters ---------- content: Raw text content of the file (will be truncated to _MAX_INPUT_CHARS). ext: File extension including the leading dot, e.g. ``".md"``. name: File name, e.g. ``"kickoff.md"``. """ template, prompt_obj = get_prompt_or_fallback("folder_file_summary_text", _TEXT_FALLBACK) truncated = content[:_MAX_INPUT_CHARS] compiled = compile_prompt(template, prompt_obj, ext=ext, name=name, content=truncated) messages = [ SystemMessage(content=compiled), HumanMessage(content="Summarise this file."), ] lf = get_langfuse() if lf is not None: with lf.start_as_current_observation( as_type="generation", name="folder-summarize-text", model="gpt-4o-mini", prompt=prompt_obj, input={"file_name": name, "ext": ext, "content_chars": len(truncated)}, ) as gen: response = await _llm_text(messages) usage = extract_usage(response) gen.update(output=response.content, usage_details=usage) else: response = await _llm_text(messages) usage = extract_usage(response) summary = (response.content or "").strip()[:500] return IndexResult(summary=summary, tokens_used=usage.get("total", 0)) def _extract_pdf_text(pdf_b64: str) -> str: buf = io.BytesIO(base64.b64decode(pdf_b64)) reader = PdfReader(buf) parts: list[str] = [] for page in reader.pages: try: parts.append(page.extract_text() or "") except Exception: continue return "\n".join(parts).strip() def _extract_docx_text(docx_b64: str) -> str: buf = io.BytesIO(base64.b64decode(docx_b64)) doc = DocxDocument(buf) return "\n".join(p.text for p in doc.paragraphs if p.text).strip() async def summarize_pdf(*, pdf_b64: str, name: str) -> IndexResult: """Return a compact summary of a PDF file. Parameters ---------- pdf_b64: Base64-encoded PDF bytes. name: File name, e.g. ``"report.pdf"``. """ text = _extract_pdf_text(pdf_b64) if not text: return IndexResult(summary="Could not extract text", tokens_used=0) return await summarize_text(content=text, ext=".pdf", name=name) async def summarize_docx(*, docx_b64: str, name: str) -> IndexResult: """Return a compact summary of a DOCX file. Parameters ---------- docx_b64: Base64-encoded DOCX bytes. name: File name, e.g. ``"spec.docx"``. """ text = _extract_docx_text(docx_b64) if not text: return IndexResult(summary="Could not extract text", tokens_used=0) return await summarize_text(content=text, ext=".docx", name=name)