diff --git a/app/core/folder_indexer.py b/app/core/folder_indexer.py
index f81c4bf..f7f863a 100644
--- a/app/core/folder_indexer.py
+++ b/app/core/folder_indexer.py
@@ -41,6 +41,64 @@ async def _llm_text(messages: list) -> object:
     return await llm.ainvoke(messages)
 
 
+async def _llm_vision(messages: list) -> object:
+    """Make the LLM call for vision (image) summarisation.
+
+    Accepts the message list and returns the response directly, mirroring
+    the ``_llm_text`` caller pattern so tests can patch it at the module level.
+    """
+    llm = get_llm(model="gpt-4o-mini", temperature=0.2)
+    return await llm.ainvoke(messages)
+
+
+async def summarize_image(*, image_b64: str, mime: str) -> IndexResult:
+    """Return a compact summary of an image file using vision.
+
+    Builds a system message from the ``folder_file_summary_image`` prompt
+    (or ``_IMAGE_FALLBACK``) plus a human message carrying the image as a
+    ``data:`` URL, and sends both through ``_llm_vision``.
+
+    Parameters
+    ----------
+    image_b64:
+        Base64-encoded image bytes.
+    mime:
+        MIME type of the image, e.g. ``"image/png"``.
+
+    Returns
+    -------
+    IndexResult
+        ``summary`` is the response text, stripped and capped at 500
+        characters; ``tokens_used`` is the ``"total"`` entry reported by
+        ``extract_usage`` (0 when absent).
+    """
+    # Only the template text is needed here; the prompt object is unused.
+    template, _prompt_obj = get_prompt_or_fallback(
+        "folder_file_summary_image", _IMAGE_FALLBACK
+    )
+    messages = [
+        SystemMessage(content=template),
+        HumanMessage(content=[
+            {"type": "text", "text": "Summarise this image."},
+            {"type": "image_url", "image_url": {"url": f"data:{mime};base64,{image_b64}"}},
+        ]),
+    ]
+    response = await _llm_vision(messages)
+    usage = extract_usage(response)
+    content = response.content
+    if isinstance(content, list):
+        # Content may arrive as a list of blocks (plain strings or dicts with
+        # a "text" entry) instead of one string; keep only the text so the
+        # ``.strip()`` below does not fail on a list.
+        content = "".join(
+            part if isinstance(part, str) else str(part.get("text") or "")
+            for part in content
+            if isinstance(part, (str, dict))
+        )
+    summary = (content or "").strip()[:500]
+    return IndexResult(summary=summary, tokens_used=usage.get("total", 0))
+
+
 async def summarize_text(*, content: str, ext: str, name: str) -> IndexResult:
     """Return a compact summary of a text file.
 
diff --git a/tests/test_folder_indexer.py b/tests/test_folder_indexer.py
index 418f80e..ae0f6aa 100644
--- a/tests/test_folder_indexer.py
+++ b/tests/test_folder_indexer.py
@@ -5,7 +5,7 @@ from unittest.mock import AsyncMock, patch
 
 import pytest
 
-from app.core.folder_indexer import summarize_text, IndexResult
+from app.core.folder_indexer import summarize_text, summarize_image, IndexResult
 
 pytestmark = pytest.mark.asyncio
 
@@ -28,3 +28,32 @@ async def test_summarize_text_truncates_summary_at_500_chars():
     with patch("app.core.folder_indexer._llm_text", new=AsyncMock(return_value=mock_resp)):
         result = await summarize_text(content="x", ext=".md", name="x.md")
     assert len(result.summary) <= 500
+
+
+async def test_summarize_image_uses_vision_content_blocks():
+    """summarize_image sends the image as an ``image_url`` data-URL block."""
+    mock_resp = AsyncMock()
+    mock_resp.content = "Final logo on white background."
+    mock_resp.usage_metadata = {"total_tokens": 500}
+    captured = {}
+
+    async def fake_llm_vision(messages):
+        # Record what summarize_image sends instead of calling a real model.
+        captured["messages"] = messages
+        return mock_resp
+
+    with patch("app.core.folder_indexer._llm_vision", new=fake_llm_vision):
+        result = await summarize_image(image_b64="iVBORw0KG", mime="image/png")
+
+    assert "Final logo" in result.summary
+    assert result.tokens_used == 500
+    # The last message must be multimodal: a list of content blocks.
+    last = captured["messages"][-1]
+    assert isinstance(last.content, list)
+    image_urls = [
+        p["image_url"]["url"]
+        for p in last.content
+        if isinstance(p, dict) and p.get("type") == "image_url"
+    ]
+    # Exactly one image block, carrying both the MIME type and the payload.
+    assert image_urls == ["data:image/png;base64,iVBORw0KG"]