Improve Step 1 project matching and Step 2 update-first enforcement

- Rewrite _STEP1_SYSTEM_PROMPT: lower matching threshold (no longer requires "clear" match), strongly prefer existing projects over creating new ones, use structured id=|name=|status= format with aiSummary for richer context
- Add code-level UUID validation: reject hallucinated ids not in the fetched projects list, fall back to "new" instead of creating a bad link
- Rewrite _PROCESSING_SYSTEM_PROMPT: enforce explicit scan-before-create process (read existing → search → update if found → create only if not) with hard rule against calling create_* without checking existing records

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-20 23:45:29 +01:00
parent 58bc6efd4b
commit e7cdce8287
1 changed files with 77 additions and 36 deletions
--- a/app/core/agent_runner.py
+++ b/app/core/agent_runner.py
@@ -102,19 +102,29 @@ _DOMAIN_DESCRIPTIONS: dict[str, str] = {
 _STEP1_SYSTEM_PROMPT = """\
 You are a file classifier for a freelance project management tool.

-Given a file's content and a list of existing projects, your job is to:
-1. Identify which project this file belongs to (or "standalone" if none match).
-2. Identify which data domains are relevant to extract from this file,
-   limited to the allowed domains listed below.
+Your job is to match a file to an existing project and identify which data domains to extract.

-Domain definitions (only consider domains in the allowed list):
-{domain_definitions}
+## Project matching rules (STRICT — follow in order)
+
+1. Search the file content for any mention of a project name, client name, acronym, or topic
+   that overlaps with the existing projects listed below.
+2. The match does NOT need to be exact — partial name, abbreviation, or topic similarity is enough.
+3. STRONGLY PREFER matching an existing project. Only return "new" as an absolute last resort
+   when the file has zero meaningful connection to any listed project.
+4. When in doubt, pick the closest match from the list.
+
+## Response format

 Respond ONLY with a JSON object — no markdown, no explanation:

-{{"project_id": "<uuid> or standalone", "domains": ["tasks", "notes"]}}
+{{"project_id": "<exact id from the list below, or new>", "new_project_name": "<concise 2-5 word name, only when project_id is new>", "domains": ["tasks", "notes"]}}
+
+## Domain definitions (only consider domains in the allowed list)
+
+{domain_definitions}
+
+## Existing projects

-Existing projects:
 {projects_list}
 """

@@ -123,20 +133,26 @@ Existing projects:
 _PROCESSING_SYSTEM_PROMPT = """\
 You are a data extraction assistant for a freelance project management tool.

-Your task is to read the file content provided and create or update records
-using the available tools.
+Your task: extract structured data from the file content and persist it using the available tools.

-IMPORTANT — update-first rules:
-  The existing records below are the source of truth.
-  If an existing record semantically matches the content (by title, topic,
-  or context), update it instead of creating a duplicate.
-  Only create a new record when no existing match is found.
-  Set isAiSuggested=1 on all new records.
+## Mandatory process — follow this order for EVERY item you extract
+
+1. READ the existing records listed below for the relevant domain.
+2. SEARCH for a match by title, topic, or semantic similarity.
+3. If a match exists → call the update_* tool with the existing record's id.
+4. If no match exists → call the create_* tool and set isAiSuggested=1.
+
+NEVER call create_* without first checking the existing records.
+NEVER duplicate a record that already exists under a different wording.
+
+## Existing records (source of truth)

 {existing_context}

-Project context: {project_context}
-Target domains: {data_types}
+## Context
+
+Project: {project_context}
+Domains to extract: {data_types}

 {custom_prompt_section}
 """
@@ -470,21 +486,27 @@ async def _classify_file(
    file_content: str,
    projects: list[dict],
    config_data_types: list[str],
-) -> tuple[str, list[str]]:
+) -> tuple[str, list[str], str | None]:
    """Call the LLM to classify a file by project and relevant domains.

-    Returns ``(project_id_or_"standalone", domains)``.
-    Falls back to ``("standalone", config_data_types)`` on any error.
+    Returns ``(project_id_or_"new", domains, new_project_name_or_None)``.
+    - ``project_id`` is an existing project UUID, or ``"new"`` when no match found.
+    - ``new_project_name`` is only set when ``project_id == "new"``.
+    Falls back to ``("new", config_data_types, None)`` on any error.
    """
-    fallback = ("standalone", list(config_data_types))
+    fallback: tuple[str, list[str], str | None] = ("new", list(config_data_types), None)

    if not file_content.strip():
        return fallback

-    projects_list = "\n".join(
-        f"  - {p.get('name', '')} (id: {p['id']}, status: {p.get('status', '')})"
-        for p in projects
-    ) or "  (none — all files are standalone)"
+    valid_project_ids = {p["id"] for p in projects}
+
+    def _fmt_project(p: dict) -> str:
+        summary = (p.get("aiSummary") or p.get("ai_summary") or "").strip()
+        summary_part = f" — {summary[:100]}" if summary else ""
+        return f"  - id={p['id']} | name={p.get('name', '')} | status={p.get('status', '')}{summary_part}"
+
+    projects_list = "\n".join(_fmt_project(p) for p in projects) or "  (none yet)"

    domain_definitions = "\n".join(
        f"  - {d}: {_DOMAIN_DESCRIPTIONS[d]}"
@@ -510,14 +532,21 @@ async def _classify_file(
            if raw.startswith("json"):
                raw = raw[4:]
        parsed = json.loads(raw.strip())
-        project_id: str = str(parsed.get("project_id") or "standalone")
+        raw_project_id: str = str(parsed.get("project_id") or "new")
+        # Reject hallucinated UUIDs — only accept ids that exist in the fetched list.
+        project_id = raw_project_id if raw_project_id in valid_project_ids else "new"
+        new_project_name: str | None = (
+            str(parsed["new_project_name"]).strip() or None
+            if project_id == "new" and parsed.get("new_project_name")
+            else None
+        )
        domains: list[str] = [
            d for d in parsed.get("domains", [])
            if d in config_data_types
        ]
        if not domains:
            domains = list(config_data_types)
-        return project_id, domains
+        return project_id, domains, new_project_name
    except Exception as exc:
        logger.warning(
            "agent_runner: step1 classification failed for %r: %s", file_path, exc
@@ -605,9 +634,6 @@ async def run_local_agent(
        # ── Code: fetch all projects once ────────────────────────────
        projects = await _fetch_projects()

-        # ── Per-file processing ──────────────────────────────────────
-        processing_tools = _build_processing_tools(config.data_types)
-
        for file_path in file_paths:
            try:
                # Read file content via code.
@@ -622,30 +648,43 @@ async def run_local_agent(
                items_processed += 1

                # Step 1 — classify file.
-                project_id, domains = await _classify_file(
+                project_id, domains, new_project_name = await _classify_file(
                    file_path=file_path,
                    file_content=file_content,
                    projects=projects,
                    config_data_types=config.data_types,
                )
                logger.info(
-                    "agent_runner: run=%s file=%r → project=%s domains=%s",
+                    "agent_runner: run=%s file=%r → project=%s new_name=%r domains=%s",
                    run_id,
                    file_path,
                    project_id,
+                    new_project_name,
                    domains,
                )

                # Step 2 — fetch existing entities for this project + domains.
+                # When project_id is "new", entities are fetched without a project
+                # filter; the LLM will create the project and link records to it.
+                effective_project_id = project_id if project_id != "new" else "standalone"
+
                existing_blocks: list[str] = []
                for domain in domains:
-                    rows = await _fetch_domain_entities(domain, project_id)
+                    rows = await _fetch_domain_entities(domain, effective_project_id)
                    existing_blocks.append(_format_entities_for_context(domain, rows))

                existing_context = "\n\n".join(existing_blocks)

-                if project_id == "standalone":
-                    project_context = "This file is not associated with any existing project."
+                if project_id == "new":
+                    name_hint = f' Use "{new_project_name}" as the project name.' if new_project_name else ""
+                    project_context = (
+                        f"No existing project matches this file. "
+                        f"Create a new project first using the create_project tool, "
+                        f"then link all extracted records to its id.{name_hint}"
+                    )
+                    # Ensure the LLM has the project tools available.
+                    if "projects" not in domains:
+                        domains = ["projects"] + domains
                else:
                    project_context = (
                        f"This file belongs to project ID: {project_id}. "
@@ -659,6 +698,8 @@ async def run_local_agent(
                    custom_prompt_section=custom_section,
                )

+                processing_tools = _build_processing_tools(domains)
+
                result_text = await _run_agent_with_tools(
                    system_prompt=system_prompt,
                    user_message=(