diff --git a/app/agents/folder_agent.py b/app/agents/folder_agent.py
index 56b087d..f6542d6 100644
--- a/app/agents/folder_agent.py
+++ b/app/agents/folder_agent.py
@@ -1,10 +1,15 @@
-"""Scoped file-read tool for the project folder feature."""
+"""Scoped file-read and search tools for the project folder feature."""
 from __future__ import annotations
 
 from langchain_core.tools import tool
 
+from app.core.folder_indexer import _extract_docx_text, _extract_pdf_text
 from app.core.ws_context import execute_on_client
 
+# Cap returned slice size to keep tool output under control.
+_MAX_RETURN_CHARS = 50_000
+_MAX_SEARCH_MATCHES = 20
+
 
 def _is_unsafe_path(rel: str) -> bool:
     if not rel:
@@ -19,19 +24,159 @@ def _is_unsafe_path(rel: str) -> bool:
     return ".." in parts
 
 
+async def _fetch_file(project_id: str, relative_path: str, offset: int, length: int) -> dict:
+    """Return the raw Electron tool_result dict for a file read."""
+    return await execute_on_client(
+        action="read_project_folder_file",
+        data={
+            "projectId": project_id,
+            "relativePath": relative_path,
+            "offset": offset,
+            "length": length,
+        },
+    )
+
+
+def _decode(result: dict) -> tuple[str, str, int]:
+    """Decode a tool_result into (text, kind, total_size). For pdf/docx,
+    extracts text from base64. For images, returns a placeholder string.
+    For text, content is already a sliced utf-8 string.
+    """
+    kind = result.get("kind", "text")
+    content = result.get("content", "") or ""
+    total = int(result.get("totalSize", 0) or 0)
+    if kind == "image":
+        return ("[Image file — cannot be navigated as text. See manifest summary.]", kind, total)
+    if kind == "pdf":
+        return (_extract_pdf_text(content), kind, total)
+    if kind == "docx":
+        return (_extract_docx_text(content), kind, total)
+    return (content, kind, total)
+
+
 @tool
-async def read_project_folder_file(project_id: str, relative_path: str) -> str:
-    """Read full content of a file inside the project's linked folder."""
+async def read_project_folder_file(
+    project_id: str,
+    relative_path: str,
+    offset: int = 0,
+    length: int = _MAX_RETURN_CHARS,
+) -> str:
+    """Read a slice of a file inside the project's linked folder.
+
+    Args:
+        project_id: project ID.
+        relative_path: path relative to the linked folder root.
+        offset: char offset to start reading from (0 = beginning). Negative values are treated as 0.
+        length: max chars to return. Default 50000, which is also the upper bound. Use smaller values to save tokens.
+
+    Returns text content slice with a header showing position. Header tells you
+    when more content is available; call again with the suggested next offset.
+
+    For PDF / DOCX files the backend extracts text first, then applies offset/length
+    on the extracted text. For images returns a placeholder; navigate with the
+    manifest summary instead.
+    """
     if _is_unsafe_path(relative_path):
         return "Access denied"
-    result = await execute_on_client(
-        action="read_project_folder_file",
-        data={"projectId": project_id, "relativePath": relative_path},
-    )
-    content = result.get("content", "")
-    if not content:
-        return f"File not found: {relative_path}"
-    return content
+
+    # Clamp caller-supplied bounds: a negative offset would make the slices
+    # below count from the end of the text, and an oversized length would
+    # defeat the _MAX_RETURN_CHARS cap on tool output.
+    offset = max(0, offset)
+    length = max(1, min(length, _MAX_RETURN_CHARS))
+
+    result = await _fetch_file(project_id, relative_path, offset, length)
+    text, kind, total_size = _decode(result)
+
+    if not text and kind in ("missing", "error"):
+        return f"File not found or unreadable: {relative_path}"
+
+    if kind in ("pdf", "docx"):
+        # Backend extracted full text — apply offset/length on chars.
+        # NOTE(review): offset/length were also sent to the client by
+        # _fetch_file; this assumes the client ignores them for pdf/docx and
+        # returns the whole document — confirm against the client handler.
+        sliced = text[offset:offset + length]
+        slice_end = offset + len(sliced)
+        header = (
+            f"[file={relative_path} kind={kind} offset={offset} end={slice_end} "
+            f"totalChars={len(text)}]"
+        )
+        if slice_end < len(text):
+            header += f"\n[More content available — call again with offset={slice_end}.]"
+        return header + "\n" + sliced
+
+    if kind == "text":
+        # NOTE(review): slice_end counts characters of the returned slice but
+        # is compared with the client's totalSize (labelled totalBytes below);
+        # for non-ASCII files the two units may differ — confirm.
+        slice_end = offset + len(text)
+        header = (
+            f"[file={relative_path} kind=text offset={offset} end={slice_end} "
+            f"totalBytes={total_size}]"
+        )
+        if slice_end < total_size:
+            header += f"\n[More content available — call again with offset={slice_end}.]"
+        return header + "\n" + text
+
+    # image or unknown
+    return text
 
 
-FOLDER_TOOLS = [read_project_folder_file]
+@tool
+async def search_project_folder_file(
+    project_id: str,
+    relative_path: str,
+    query: str,
+    context_lines: int = 3,
+) -> str:
+    """Search a project folder file for a query string (case-insensitive substring).
+
+    Args:
+        project_id: project ID.
+        relative_path: path relative to the linked folder root.
+        query: text to search for.
+        context_lines: number of lines of context around each match (default 3; negative values are treated as 0).
+
+    Returns matching line ranges with surrounding context and 1-based line numbers.
+    Capped at 20 matches; if more exist the header shows the total.
+
+    Works on text, code, markdown, PDF (extracted), and DOCX (extracted).
+    Images and binary files are not searchable.
+    """
+    if _is_unsafe_path(relative_path):
+        return "Access denied"
+    if not query:
+        return "Empty query."
+    # A negative context width would make every snippet range empty.
+    context_lines = max(0, context_lines)
+
+    # For text we still need full file; pass length=very large.
+    result = await _fetch_file(project_id, relative_path, offset=0, length=10_000_000)
+    text, kind, _ = _decode(result)
+
+    if not text and kind in ("missing", "error"):
+        return f"File not found or unreadable: {relative_path}"
+    if kind == "image":
+        return "Cannot search inside images."
+
+    lines = text.splitlines()
+    q = query.lower()
+    matches = [i for i, line in enumerate(lines) if q in line.lower()]
+    if not matches:
+        return f"No matches for '{query}' in {relative_path}."
+
+    shown = matches[:_MAX_SEARCH_MATCHES]
+    snippets: list[str] = []
+    for i in shown:
+        start = max(0, i - context_lines)
+        end = min(len(lines), i + context_lines + 1)
+        block = "\n".join(f"{n + 1:5d}: {lines[n]}" for n in range(start, end))
+        snippets.append(block)
+
+    header = f"[file={relative_path} matches={len(matches)} showing={len(shown)} query='{query}']"
+    body = "\n---\n".join(snippets)
+    return header + "\n" + body
+
+
+FOLDER_TOOLS = [read_project_folder_file, search_project_folder_file]
diff --git a/tests/test_folder_agent_tool.py b/tests/test_folder_agent_tool.py
index 2160d0f..c6b92ef 100644
--- a/tests/test_folder_agent_tool.py
+++ b/tests/test_folder_agent_tool.py
@@ -4,7 +4,10 @@ from unittest.mock import AsyncMock, patch
 
 import pytest
 
-from app.agents.folder_agent import read_project_folder_file
+from app.agents.folder_agent import (
+    read_project_folder_file,
+    search_project_folder_file,
+)
 
 pytestmark = pytest.mark.asyncio
 
@@ -12,10 +15,11 @@ pytestmark = pytest.mark.asyncio
 async def test_happy_path():
     with patch(
         "app.agents.folder_agent.execute_on_client",
-        new=AsyncMock(return_value={"content": "file body"}),
+        new=AsyncMock(return_value={"content": "file body", "kind": "text", "totalSize": 9}),
     ):
         out = await read_project_folder_file.ainvoke({"project_id": "p1", "relative_path": "docs/x.md"})
-    assert out == "file body"
+    assert "file body" in out
+    assert "kind=text" in out
 
 
 async def test_traversal_rejected():
@@ -31,7 +35,133 @@ async def test_absolute_path_rejected():
 async def test_missing_file():
     with patch(
         "app.agents.folder_agent.execute_on_client",
-        new=AsyncMock(return_value={"content": ""}),
+        new=AsyncMock(return_value={"content": "", "kind": "missing", "totalSize": 0}),
     ):
         out = await read_project_folder_file.ainvoke({"project_id": "p1", "relative_path": "ghost.md"})
     assert "not found" in out.lower()
+
+
+async def test_pagination_signals_more_available():
+    # Electron returned the first slice, totalSize larger than slice length.
+    with patch(
+        "app.agents.folder_agent.execute_on_client",
+        new=AsyncMock(return_value={"content": "first chunk", "kind": "text", "totalSize": 1000}),
+    ):
+        out = await read_project_folder_file.ainvoke({
+            "project_id": "p1",
+            "relative_path": "big.txt",
+            "offset": 0,
+            "length": 11,
+        })
+    assert "first chunk" in out
+    assert "More content available" in out
+    assert "offset=11" in out
+
+
+async def test_pdf_extracted_then_sliced(monkeypatch):
+    from app.agents import folder_agent
+    monkeypatch.setattr(folder_agent, "_extract_pdf_text", lambda b: "ABC " * 100)
+    with patch(
+        "app.agents.folder_agent.execute_on_client",
+        new=AsyncMock(return_value={"content": "JVBERi0xLg==", "kind": "pdf", "totalSize": 12}),
+    ):
+        out = await read_project_folder_file.ainvoke({
+            "project_id": "p1",
+            "relative_path": "doc.pdf",
+            "offset": 0,
+            "length": 8,
+        })
+    assert "kind=pdf" in out
+    assert "ABC ABC " in out
+    assert "More content available" in out
+
+
+async def test_image_returns_placeholder():
+    with patch(
+        "app.agents.folder_agent.execute_on_client",
+        new=AsyncMock(return_value={"content": "iVBORw0K", "kind": "image", "totalSize": 1024}),
+    ):
+        out = await read_project_folder_file.ainvoke({"project_id": "p1", "relative_path": "logo.png"})
+    assert "image" in out.lower()
+
+
+async def test_search_finds_match_with_context():
+    body = "alpha\nbeta\nthe needle is here\ngamma\ndelta"
+    with patch(
+        "app.agents.folder_agent.execute_on_client",
+        new=AsyncMock(return_value={"content": body, "kind": "text", "totalSize": len(body)}),
+    ):
+        out = await search_project_folder_file.ainvoke({
+            "project_id": "p1",
+            "relative_path": "log.txt",
+            "query": "needle",
+            "context_lines": 1,
+        })
+    assert "needle" in out
+    assert "matches=1" in out
+    # Context lines included
+    assert "beta" in out
+    assert "gamma" in out
+
+
+async def test_search_no_match():
+    with patch(
+        "app.agents.folder_agent.execute_on_client",
+        new=AsyncMock(return_value={"content": "nothing here", "kind": "text", "totalSize": 12}),
+    ):
+        out = await search_project_folder_file.ainvoke({
+            "project_id": "p1",
+            "relative_path": "x.txt",
+            "query": "zzz",
+        })
+    assert "No matches" in out
+
+
+async def test_search_rejects_traversal():
+    out = await search_project_folder_file.ainvoke({
+        "project_id": "p1",
+        "relative_path": "../etc/passwd",
+        "query": "root",
+    })
+    assert out == "Access denied"
+
+
+async def test_search_image_rejected():
+    with patch(
+        "app.agents.folder_agent.execute_on_client",
+        new=AsyncMock(return_value={"content": "b64data", "kind": "image", "totalSize": 100}),
+    ):
+        out = await search_project_folder_file.ainvoke({
+            "project_id": "p1",
+            "relative_path": "logo.png",
+            "query": "anything",
+        })
+    assert "Cannot search" in out
+
+
+async def test_read_clamps_offset_and_length():
+    mock = AsyncMock(return_value={"content": "x", "kind": "text", "totalSize": 1})
+    with patch("app.agents.folder_agent.execute_on_client", new=mock):
+        await read_project_folder_file.ainvoke({
+            "project_id": "p1",
+            "relative_path": "big.txt",
+            "offset": -5,
+            "length": 10_000_000,
+        })
+    sent = mock.await_args.kwargs["data"]
+    assert sent["offset"] == 0
+    assert sent["length"] == 50_000
+
+
+async def test_search_negative_context_lines_still_shows_match():
+    with patch(
+        "app.agents.folder_agent.execute_on_client",
+        new=AsyncMock(return_value={"content": "a\nneedle\nb", "kind": "text", "totalSize": 10}),
+    ):
+        out = await search_project_folder_file.ainvoke({
+            "project_id": "p1",
+            "relative_path": "x.txt",
+            "query": "needle",
+            "context_lines": -2,
+        })
+    assert "2: needle" in out