Improve Step 1 project matching and Step 2 update-first enforcement
- Rewrite _STEP1_SYSTEM_PROMPT: lower the matching threshold (a "clear" match is no longer required), strongly prefer existing projects over creating new ones, and use a structured id=|name=|status= format with aiSummary for richer context.
- Add code-level UUID validation: reject hallucinated ids that are not in the fetched projects list, and fall back to "new" instead of creating a bad link.
- Rewrite _PROCESSING_SYSTEM_PROMPT: enforce an explicit scan-before-create process (read existing → search → update if found → create only if not), with a hard rule against calling create_* without first checking existing records.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -102,19 +102,29 @@ _DOMAIN_DESCRIPTIONS: dict[str, str] = {
|
||||
_STEP1_SYSTEM_PROMPT = """\
|
||||
You are a file classifier for a freelance project management tool.
|
||||
|
||||
Given a file's content and a list of existing projects, your job is to:
|
||||
1. Identify which project this file belongs to (or "standalone" if none match).
|
||||
2. Identify which data domains are relevant to extract from this file,
|
||||
limited to the allowed domains listed below.
|
||||
Your job is to match a file to an existing project and identify which data domains to extract.
|
||||
|
||||
Domain definitions (only consider domains in the allowed list):
|
||||
{domain_definitions}
|
||||
## Project matching rules (STRICT — follow in order)
|
||||
|
||||
1. Search the file content for any mention of a project name, client name, acronym, or topic
|
||||
that overlaps with the existing projects listed below.
|
||||
2. The match does NOT need to be exact — partial name, abbreviation, or topic similarity is enough.
|
||||
3. STRONGLY PREFER matching an existing project. Only return "new" as an absolute last resort
|
||||
when the file has zero meaningful connection to any listed project.
|
||||
4. When in doubt, pick the closest match from the list.
|
||||
|
||||
## Response format
|
||||
|
||||
Respond ONLY with a JSON object — no markdown, no explanation:
|
||||
|
||||
{{"project_id": "<uuid> or standalone", "domains": ["tasks", "notes"]}}
|
||||
{{"project_id": "<exact id from the list below, or new>", "new_project_name": "<concise 2-5 word name, only when project_id is new>", "domains": ["tasks", "notes"]}}
|
||||
|
||||
## Domain definitions (only consider domains in the allowed list)
|
||||
|
||||
{domain_definitions}
|
||||
|
||||
## Existing projects
|
||||
|
||||
Existing projects:
|
||||
{projects_list}
|
||||
"""
|
||||
|
||||
@@ -123,20 +133,26 @@ Existing projects:
|
||||
_PROCESSING_SYSTEM_PROMPT = """\
|
||||
You are a data extraction assistant for a freelance project management tool.
|
||||
|
||||
Your task is to read the file content provided and create or update records
|
||||
using the available tools.
|
||||
Your task: extract structured data from the file content and persist it using the available tools.
|
||||
|
||||
IMPORTANT — update-first rules:
|
||||
The existing records below are the source of truth.
|
||||
If an existing record semantically matches the content (by title, topic,
|
||||
or context), update it instead of creating a duplicate.
|
||||
Only create a new record when no existing match is found.
|
||||
Set isAiSuggested=1 on all new records.
|
||||
## Mandatory process — follow this order for EVERY item you extract
|
||||
|
||||
1. READ the existing records listed below for the relevant domain.
|
||||
2. SEARCH for a match by title, topic, or semantic similarity.
|
||||
3. If a match exists → call the update_* tool with the existing record's id.
|
||||
4. If no match exists → call the create_* tool and set isAiSuggested=1.
|
||||
|
||||
NEVER call create_* without first checking the existing records.
|
||||
NEVER duplicate a record that already exists under a different wording.
|
||||
|
||||
## Existing records (source of truth)
|
||||
|
||||
{existing_context}
|
||||
|
||||
Project context: {project_context}
|
||||
Target domains: {data_types}
|
||||
## Context
|
||||
|
||||
Project: {project_context}
|
||||
Domains to extract: {data_types}
|
||||
|
||||
{custom_prompt_section}
|
||||
"""
|
||||
@@ -470,21 +486,27 @@ async def _classify_file(
|
||||
file_content: str,
|
||||
projects: list[dict],
|
||||
config_data_types: list[str],
|
||||
) -> tuple[str, list[str]]:
|
||||
) -> tuple[str, list[str], str | None]:
|
||||
"""Call the LLM to classify a file by project and relevant domains.
|
||||
|
||||
Returns ``(project_id_or_"standalone", domains)``.
|
||||
Falls back to ``("standalone", config_data_types)`` on any error.
|
||||
Returns ``(project_id_or_"new", domains, new_project_name_or_None)``.
|
||||
- ``project_id`` is an existing project UUID, or ``"new"`` when no match found.
|
||||
- ``new_project_name`` is only set when ``project_id == "new"``.
|
||||
Falls back to ``("new", config_data_types, None)`` on any error.
|
||||
"""
|
||||
fallback = ("standalone", list(config_data_types))
|
||||
fallback: tuple[str, list[str], str | None] = ("new", list(config_data_types), None)
|
||||
|
||||
if not file_content.strip():
|
||||
return fallback
|
||||
|
||||
projects_list = "\n".join(
|
||||
f" - {p.get('name', '')} (id: {p['id']}, status: {p.get('status', '')})"
|
||||
for p in projects
|
||||
) or " (none — all files are standalone)"
|
||||
valid_project_ids = {p["id"] for p in projects}
|
||||
|
||||
def _fmt_project(p: dict) -> str:
|
||||
summary = (p.get("aiSummary") or p.get("ai_summary") or "").strip()
|
||||
summary_part = f" — {summary[:100]}" if summary else ""
|
||||
return f" - id={p['id']} | name={p.get('name', '')} | status={p.get('status', '')}{summary_part}"
|
||||
|
||||
projects_list = "\n".join(_fmt_project(p) for p in projects) or " (none yet)"
|
||||
|
||||
domain_definitions = "\n".join(
|
||||
f" - {d}: {_DOMAIN_DESCRIPTIONS[d]}"
|
||||
@@ -510,14 +532,21 @@ async def _classify_file(
|
||||
if raw.startswith("json"):
|
||||
raw = raw[4:]
|
||||
parsed = json.loads(raw.strip())
|
||||
project_id: str = str(parsed.get("project_id") or "standalone")
|
||||
raw_project_id: str = str(parsed.get("project_id") or "new")
|
||||
# Reject hallucinated UUIDs — only accept ids that exist in the fetched list.
|
||||
project_id = raw_project_id if raw_project_id in valid_project_ids else "new"
|
||||
new_project_name: str | None = (
|
||||
str(parsed["new_project_name"]).strip() or None
|
||||
if project_id == "new" and parsed.get("new_project_name")
|
||||
else None
|
||||
)
|
||||
domains: list[str] = [
|
||||
d for d in parsed.get("domains", [])
|
||||
if d in config_data_types
|
||||
]
|
||||
if not domains:
|
||||
domains = list(config_data_types)
|
||||
return project_id, domains
|
||||
return project_id, domains, new_project_name
|
||||
except Exception as exc:
|
||||
logger.warning(
|
||||
"agent_runner: step1 classification failed for %r: %s", file_path, exc
|
||||
@@ -605,9 +634,6 @@ async def run_local_agent(
|
||||
# ── Code: fetch all projects once ────────────────────────────
|
||||
projects = await _fetch_projects()
|
||||
|
||||
# ── Per-file processing ──────────────────────────────────────
|
||||
processing_tools = _build_processing_tools(config.data_types)
|
||||
|
||||
for file_path in file_paths:
|
||||
try:
|
||||
# Read file content via code.
|
||||
@@ -622,30 +648,43 @@ async def run_local_agent(
|
||||
items_processed += 1
|
||||
|
||||
# Step 1 — classify file.
|
||||
project_id, domains = await _classify_file(
|
||||
project_id, domains, new_project_name = await _classify_file(
|
||||
file_path=file_path,
|
||||
file_content=file_content,
|
||||
projects=projects,
|
||||
config_data_types=config.data_types,
|
||||
)
|
||||
logger.info(
|
||||
"agent_runner: run=%s file=%r → project=%s domains=%s",
|
||||
"agent_runner: run=%s file=%r → project=%s new_name=%r domains=%s",
|
||||
run_id,
|
||||
file_path,
|
||||
project_id,
|
||||
new_project_name,
|
||||
domains,
|
||||
)
|
||||
|
||||
# Step 2 — fetch existing entities for this project + domains.
|
||||
# When project_id is "new", entities are fetched without a project
|
||||
# filter; the LLM will create the project and link records to it.
|
||||
effective_project_id = project_id if project_id != "new" else "standalone"
|
||||
|
||||
existing_blocks: list[str] = []
|
||||
for domain in domains:
|
||||
rows = await _fetch_domain_entities(domain, project_id)
|
||||
rows = await _fetch_domain_entities(domain, effective_project_id)
|
||||
existing_blocks.append(_format_entities_for_context(domain, rows))
|
||||
|
||||
existing_context = "\n\n".join(existing_blocks)
|
||||
|
||||
if project_id == "standalone":
|
||||
project_context = "This file is not associated with any existing project."
|
||||
if project_id == "new":
|
||||
name_hint = f' Use "{new_project_name}" as the project name.' if new_project_name else ""
|
||||
project_context = (
|
||||
f"No existing project matches this file. "
|
||||
f"Create a new project first using the create_project tool, "
|
||||
f"then link all extracted records to its id.{name_hint}"
|
||||
)
|
||||
# Ensure the LLM has the project tools available.
|
||||
if "projects" not in domains:
|
||||
domains = ["projects"] + domains
|
||||
else:
|
||||
project_context = (
|
||||
f"This file belongs to project ID: {project_id}. "
|
||||
@@ -659,6 +698,8 @@ async def run_local_agent(
|
||||
custom_prompt_section=custom_section,
|
||||
)
|
||||
|
||||
processing_tools = _build_processing_tools(domains)
|
||||
|
||||
result_text = await _run_agent_with_tools(
|
||||
system_prompt=system_prompt,
|
||||
user_message=(
|
||||
|
||||
Reference in New Issue
Block a user