From 822b4cd8b174fcbd7b57d6f25c44e03bf2b77d90 Mon Sep 17 00:00:00 2001
From: Roberto
Date: Tue, 12 May 2026 11:05:43 +0200
Subject: [PATCH] feat(api): folder_indexer.summarize_text via gpt-4o-mini

---
 app/core/folder_indexer.py   | 101 +++++++++++++++++++++++++++++++++++
 tests/test_folder_indexer.py |  39 ++++++++++++++
 2 files changed, 140 insertions(+)
 create mode 100644 app/core/folder_indexer.py
 create mode 100644 tests/test_folder_indexer.py

diff --git a/app/core/folder_indexer.py b/app/core/folder_indexer.py
new file mode 100644
index 0000000..f81c4bf
--- /dev/null
+++ b/app/core/folder_indexer.py
@@ -0,0 +1,101 @@
+"""Per-file summarisation for project folder integration."""
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+from langchain_core.messages import HumanMessage, SystemMessage
+
+from app.core.langfuse_client import (
+    compile_prompt,
+    extract_usage,
+    get_prompt_or_fallback,
+)
+from app.core.llm import get_llm
+
+_TEXT_FALLBACK = (
+    "You are summarising a file for an AI assistant that helps the user manage a project.\n"
+    "Produce a single sentence (<=30 words, <=200 chars) that captures the file's purpose "
+    "and most important detail.\nFile extension: {ext}\nFile name: {name}\nContent (truncated if long):\n{content}"
+)
+_IMAGE_FALLBACK = (
+    "You are summarising an image attached to a project folder.\n"
+    "Produce a single sentence (<=30 words, <=200 chars) describing what the image shows "
+    "and any obvious purpose (logo, screenshot, diagram, photo of a whiteboard, etc.)."
+)
+# Only the first _MAX_INPUT_CHARS characters of a file are interpolated into the prompt.
+_MAX_INPUT_CHARS = 6000
+# Hard cap on the stored summary. The prompt asks for <=200 chars; this only
+# bounds an answer that ignores that instruction.
+_MAX_SUMMARY_CHARS = 500
+
+
+@dataclass
+class IndexResult:
+    """Summary of one file plus the token count reported for the LLM call."""
+
+    summary: str
+    tokens_used: int
+
+
+async def _llm_text(messages: list) -> object:
+    """Make the LLM call for text summarisation.
+
+    Defined as a standalone async function so tests can patch it cleanly
+    without needing to mock the LLM object itself.
+    """
+    llm = get_llm(model="gpt-4o-mini", temperature=0.2)
+    return await llm.ainvoke(messages)
+
+
+def _content_to_text(content: object) -> str:
+    """Flatten a chat-message ``content`` value into plain text.
+
+    LangChain types message content as ``str | list[str | dict]``. List items
+    are plain strings or content blocks; only ``{"type": "text"}`` blocks are
+    kept. Anything else (including ``None``) yields an empty string.
+    """
+    if isinstance(content, str):
+        return content
+    if not isinstance(content, list):
+        return ""
+    parts: list[str] = []
+    for block in content:
+        if isinstance(block, str):
+            parts.append(block)
+        elif isinstance(block, dict) and block.get("type") == "text":
+            parts.append(str(block.get("text") or ""))
+    return "".join(parts)
+
+
+async def summarize_text(*, content: str, ext: str, name: str) -> IndexResult:
+    """Return a compact summary of a text file.
+
+    Parameters
+    ----------
+    content:
+        Raw text content of the file (will be truncated to _MAX_INPUT_CHARS).
+    ext:
+        File extension including the leading dot, e.g. ``".md"``.
+    name:
+        File name, e.g. ``"kickoff.md"``.
+
+    Returns
+    -------
+    IndexResult
+        The stripped model answer capped at _MAX_SUMMARY_CHARS, and the
+        ``"total"`` entry of ``extract_usage(response)`` (0 when missing).
+    """
+    template, prompt_obj = get_prompt_or_fallback("folder_file_summary_text", _TEXT_FALLBACK)
+    truncated = content[:_MAX_INPUT_CHARS]
+    compiled = compile_prompt(template, prompt_obj, ext=ext, name=name, content=truncated)
+    messages = [
+        SystemMessage(content=compiled),
+        HumanMessage(content="Summarise this file."),
+    ]
+    response = await _llm_text(messages)
+    usage = extract_usage(response)
+    # ``content`` may be a list of content blocks rather than a str, so it is
+    # flattened before stripping instead of calling ``.strip()`` on it directly.
+    summary = _content_to_text(response.content).strip()[:_MAX_SUMMARY_CHARS]
+    # ``or 0`` keeps tokens_used an int if the "total" entry is present but None.
+    return IndexResult(summary=summary, tokens_used=usage.get("total", 0) or 0)
diff --git a/tests/test_folder_indexer.py b/tests/test_folder_indexer.py
new file mode 100644
index 0000000..418f80e
--- /dev/null
+++ b/tests/test_folder_indexer.py
@@ -0,0 +1,39 @@
+"""Folder indexer LLM helpers."""
+from __future__ import annotations
+
+from unittest.mock import AsyncMock, patch
+
+import pytest
+
+from app.core.folder_indexer import IndexResult, summarize_text
+
+pytestmark = pytest.mark.asyncio
+
+
+async def test_summarize_text_returns_summary_and_tokens():
+    mock_resp = AsyncMock()
+    mock_resp.content = "Kickoff notes covering scope and deadlines."
+    mock_resp.usage_metadata = {"input_tokens": 320, "output_tokens": 18, "total_tokens": 338}
+    with patch("app.core.folder_indexer._llm_text", new=AsyncMock(return_value=mock_resp)):
+        result = await summarize_text(content="hello world", ext=".md", name="kickoff.md")
+    assert isinstance(result, IndexResult)
+    assert result.summary == "Kickoff notes covering scope and deadlines."
+    assert result.tokens_used == 338
+
+
+async def test_summarize_text_truncates_summary_at_500_chars():
+    mock_resp = AsyncMock()
+    mock_resp.content = "x" * 1000
+    mock_resp.usage_metadata = {"total_tokens": 100}
+    with patch("app.core.folder_indexer._llm_text", new=AsyncMock(return_value=mock_resp)):
+        result = await summarize_text(content="x", ext=".md", name="x.md")
+    assert result.summary == "x" * 500
+
+
+async def test_summarize_text_flattens_list_content():
+    mock_resp = AsyncMock()
+    mock_resp.content = [{"type": "text", "text": "Kickoff "}, "notes."]
+    mock_resp.usage_metadata = {"total_tokens": 12}
+    with patch("app.core.folder_indexer._llm_text", new=AsyncMock(return_value=mock_resp)):
+        result = await summarize_text(content="hello", ext=".md", name="kickoff.md")
+    assert result.summary == "Kickoff notes."