fix: Langfuse SDK v4 migration, tracing improvements, and LLM config

- Langfuse SDK v4: fix prompt-to-trace linking (as_type=generation)
- tracing: compile_prompt with Langfuse managed prompt fallback
- journey: remove journey CLI subcommand (keep only interactive)
- LLM: add service-specific llm modules for batch-agent and chat
- gitignore: exclude eval private test data
- config: add LANGFUSE settings to shared config
2026-03-24 16:25:51 +01:00
parent d3f7099d93
commit fe0dd038ee
10 changed files with 239 additions and 42 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -35,3 +35,6 @@ Thumbs.db
 # Claude Code
 .claude/
 logs/
 # Eval private test data
 services/batch-agent/eval/fixtures/private_data/
--- a/app/config/settings.py
+++ b/app/config/settings.py
@@ -27,6 +27,7 @@ class Settings(BaseSettings):
    ANTHROPIC_API_KEY: str = ""
    GOOGLE_API_KEY: str = ""
    CEREBRAS_API_KEY: str = ""
    GITHUB_TOKEN: str = ""
    LLM_MODEL: str = "gpt-4o"
    LLM_EMBED_MODEL: str = "text-embedding-3-small"
--- a/app/core/llm.py
+++ b/app/core/llm.py
@@ -50,6 +50,8 @@ def _api_key_for_model(model: str) -> str | None:
        return settings.GOOGLE_API_KEY or None
    if model.startswith("cerebras/"):
        return settings.CEREBRAS_API_KEY or None
    if model.startswith("github/"):
        return settings.GITHUB_TOKEN or None
    if model.startswith("github_copilot/"):
        # GitHub Copilot uses OAuth device-flow tokens managed by LiteLLM.
        # No API key is required; returning None lets LiteLLM handle auth.
@@ -83,6 +85,9 @@ def get_llm(
    if settings.GITHUB_COPILOT_TOKEN_DIR:
        os.environ.setdefault("GITHUB_COPILOT_TOKEN_DIR", settings.GITHUB_COPILOT_TOKEN_DIR)
    if settings.GITHUB_TOKEN:
        os.environ.setdefault("GITHUB_TOKEN", settings.GITHUB_TOKEN)
    # Use ChatLiteLLM for provider-prefixed models (github_copilot/, anthropic/, etc.)
    # so LiteLLM handles routing and auth. ChatOpenAI for plain OpenAI model names.
    if "/" in model:
--- a/services/batch-agent/app/journey.py
+++ b/services/batch-agent/app/journey.py
@@ -80,17 +80,9 @@ def get_journey_session(session_id: str, user_id: str) -> JourneySession | None:
 _SYSTEM_PROMPT_TEMPLATE = """\
 You are a friendly assistant helping a freelancer configure a data-extraction agent.
 Your job is to understand exactly what data the user wants to extract from their
-local directory and produce a detailed prompt_template that a separate AI will use
+local directory and produce a concise prompt_template that a separate AI will use
 as its instruction set.
 The extraction agent already has this base behaviour built in:
  - Reads each file using file-system tools.
  - Creates records (tasks, notes, timelines, projects) via CRUD tools.
  - Sets isAiSuggested=1 on every new record.
  - Only extracts data explicitly present in the files — it never invents information.
 The user's custom prompt is appended AFTER this base behaviour, so focus on
 what to look for and how to map it — not on the general extraction mechanics.
 You have access to file-system tools to explore the user's directory:
 - list_directory: to see folder structure
 - read_file_content: to peek at file contents
@@ -99,38 +91,43 @@ You have access to file-system tools to explore the user's directory:
 The user's configured directory is: {directory}
 Target data types: {data_types}
-IMPORTANT — project assignment is handled automatically by the main agent runner
+IMPORTANT — project assignment is handled automatically.  You MUST NOT ask the user
-before the custom prompt is ever used.  You MUST NOT ask the user about projects,
+about projects, projectId, or how to link records to projects.  Never include
-projectId, or how to link records to projects.  Never include projectId logic or
+projectId logic or project creation instructions in the generated prompt_template.
 project creation instructions in the generated prompt_template.
 Start by exploring the directory to understand its structure.  Then ask concise,
-focused questions one at a time.  Cover these topics (not necessarily in this order):
+focused questions one at a time.  Cover only the topics relevant to the target
-  1. The type and format of the source content (confirmed by your exploration).
+data types listed above:
  2. How fields should be mapped (e.g. filename → task title).
  3. Priority or status rules (e.g. "urgent" keyword → high priority).
  4. Any special handling, date extraction, or exclusions.
-Once you reach 90% confidence, output the final prompt_template between these exact
+  1. Content type and format — confirmed by your exploration.
-markers on their own lines:
+  2. For TASKS (if in scope): field mapping for title, status, priority, content,
       dueDate (where is the date found? what's the fallback when absent?),
       and assignee (is there a person name to assign?).
  3. For NOTES when TASKS are also in scope: note vs task distinction —
       what makes something a note rather than a task?
  4. For TIMELINES (if in scope): the date source — what marks a milestone or event?
  5. Exclusions and special handling applicable to the target data types.
 Keep asking focused questions until you are at least 90% confident.  Then stop and
 output the final prompt_template immediately, wrapped between these exact markers
 on their own lines:
 {template_start}
 <the complete extraction prompt here>
 {template_end}
-The prompt_template must be a self-contained instruction for an AI that reads files
+The prompt_template must be concise (bullet points, ~15–25 lines maximum).
-and must perform CRUD operations using tools to create records.  It should specify:
+Specify only:
-  - What entity types to create (tasks, notes, timelines) — never projects.
+  - Scope: what files/content qualify and what entity types to create.
-  - How to map file content to record fields (camelCase: title, status, priority,
+  - Field mapping rules per entity type (camelCase fields: title, status, priority,
-    dueDate, content, etc.) — never include projectId.
+    dueDate, content, assignee, etc.).
-  - That isAiSuggested must be set to 1 on every new record.
+  - dueDate rule (if tasks in scope): source and fallback behaviour.
-  - Concrete examples of mappings based on what you discovered in the directory.
+  - Note vs task rule (if both in scope): the criterion that separates them.
  - Timeline date rule (if timelines in scope): what constitutes a timeline event.
  - Exclusion/filtering rules.
  - 2–3 concrete mapping examples based on what you discovered.
-{existing_section}\
+{existing_section}Begin by exploring the directory, then ask your first question.\
 Keep asking clarifying questions until you are at least 90% confident you have
 enough information to generate an accurate prompt_template.  Once you reach that
 confidence level, stop asking and produce the final template immediately.
 Begin by exploring the directory, then ask your first question.\
 """
@@ -152,8 +149,6 @@ def _build_system_prompt(
        variables={
            "directory": directory,
            "data_types": ", ".join(data_types),
            "template_start": _TEMPLATE_START,
            "template_end": _TEMPLATE_END,
            "existing_section": existing_section,
        },
    )
--- a/services/batch-agent/app/llm.py
+++ b/services/batch-agent/app/llm.py
@@ -0,0 +1,76 @@
 """LLM factory — centralised model instantiation via LiteLLM.
 Identical to services/chat/app/llm.py. Uses shared.config.settings.
 """
 from __future__ import annotations
 import os
 import warnings
 from openai import AsyncOpenAI
 import litellm
 from langchain_openai import ChatOpenAI
 from langchain_litellm import ChatLiteLLM
 from shared.config import settings
 litellm.drop_params = True
 warnings.filterwarnings(
    "ignore",
    message=r"PydanticSerializationUnexpectedValue\(Expected `ResponseAPIUsage`",
    category=UserWarning,
 )
 def _api_key_for_model(model: str) -> str | None:
    """Pick the configured credential that matches *model*'s provider prefix.

    Prefixes are tested in a fixed order; the first match wins. An empty
    setting is reported as ``None``. ``github_copilot/`` models never get a
    key, and any model without a recognised prefix uses the OpenAI key.
    """
    # (prefixes, settings attribute) pairs, checked top to bottom. The
    # attribute is looked up lazily so only the matching setting is read.
    prefix_to_setting = (
        (("anthropic/",), "ANTHROPIC_API_KEY"),
        (("gemini/", "google/"), "GOOGLE_API_KEY"),
        (("cerebras/",), "CEREBRAS_API_KEY"),
        (("github/",), "GITHUB_TOKEN"),
    )
    for prefixes, setting_name in prefix_to_setting:
        if model.startswith(prefixes):
            return getattr(settings, setting_name) or None
    if model.startswith("github_copilot/"):
        return None
    return settings.OPENAI_API_KEY or None
 def get_llm(
    *,
    model: str | None = None,
    temperature: float = 0,
    callbacks: list | None = None,
 ) -> ChatOpenAI | ChatLiteLLM:
    """Build a chat model client for *model* (default: ``settings.LLM_MODEL``).

    Model names containing ``/`` produce a ``ChatLiteLLM``; all other names
    produce a ``ChatOpenAI`` with the key from ``_api_key_for_model``.
    Before building the client, non-empty GitHub settings are copied into
    the process environment unless those variables are already set.
    """
    chosen_model = model if model else settings.LLM_MODEL

    # setdefault keeps any value that is already present in the environment.
    for env_name in ("GITHUB_COPILOT_TOKEN_DIR", "GITHUB_TOKEN"):
        configured = getattr(settings, env_name)
        if configured:
            os.environ.setdefault(env_name, configured)

    if "/" not in chosen_model:
        return ChatOpenAI(
            model=chosen_model,
            temperature=temperature,
            api_key=_api_key_for_model(chosen_model),
            callbacks=callbacks,
        )
    return ChatLiteLLM(model=chosen_model, temperature=temperature, callbacks=callbacks)
 async def embed(text: str) -> list[float]:
    """Return the embedding vector for *text* using ``settings.LLM_EMBED_MODEL``.

    Model names containing ``/`` (which includes ``github_copilot/...``) are
    sent through ``litellm.aembedding``; plain model names are sent to the
    OpenAI embeddings endpoint via ``AsyncOpenAI``.
    """
    model = settings.LLM_EMBED_MODEL
    # "github_copilot/" already contains "/", so one check covers both cases.
    if "/" in model:
        response = await litellm.aembedding(model=model, input=[text])
        return response.data[0]["embedding"]
    # Normalise an empty configured key to None, as _api_key_for_model does,
    # so an unset key is not forwarded to the client as an empty string.
    client = AsyncOpenAI(api_key=settings.OPENAI_API_KEY or None)
    response = await client.embeddings.create(model=model, input=text)
    return response.data[0].embedding
--- a/services/batch-agent/app/redis_consumer.py
+++ b/services/batch-agent/app/redis_consumer.py
@@ -138,6 +138,8 @@ async def _dispatch(user_id: str, message_data: dict[str, Any]) -> None:
        await _handle_journey_message(user_id, message_data)
    elif msg_type == "agent_trigger":
        await _handle_agent_trigger(user_id, message_data)
    elif msg_type == "device_online":
        logger.info("batch-agent: device_online user=%s device=%s", user_id, message_data.get("device_id", "?"))
    else:
        logger.warning("batch-agent: unknown message type %r from user=%s", msg_type, user_id)
--- a/services/batch-agent/app/tracing.py
+++ b/services/batch-agent/app/tracing.py
@@ -232,6 +232,38 @@ def compile_prompt(
        return fallback.format(**variables)
 def get_prompt_object(
    name: str,
    *,
    version: int | None = None,
    label: str | None = None,
    cache_ttl_seconds: int = 300,
 ) -> Any | None:
    """Return the Langfuse prompt object for *name*, not a compiled string.

    ``version`` and ``label`` are forwarded to the client only when they are
    not ``None``. The result is ``None`` when ``_get_client()`` yields no
    client or when the lookup raises; a failed lookup is logged as a warning.
    The returned object is intended for ``start_observation(prompt=...)`` so
    the prompt can be linked to a trace in the Langfuse UI.
    """
    client = _get_client()
    if client is None:
        return None

    # Leave unset selectors out of the request entirely.
    selectors = {"version": version, "label": label}
    request: dict[str, Any] = {
        "name": name,
        "cache_ttl_seconds": cache_ttl_seconds,
        **{key: value for key, value in selectors.items() if value is not None},
    }
    try:
        return client.get_prompt(**request)
    except Exception as exc:
        # Best effort: the caller gets None instead of an exception.
        logger.warning("tracing: get_prompt_object(%s) failed: %s", name, exc)
        return None
 def link_prompt_to_trace(
    span: Any,
    prompt_name: str,
@@ -239,19 +271,19 @@ def link_prompt_to_trace(
    version: int | None = None,
    label: str | None = None,
 ) -> None:
-    """Attach prompt metadata to a span/trace."""
+    """Link a Langfuse managed prompt to a span/observation.
    Uses the SDK v4 ``prompt=`` parameter so that the prompt version
    appears linked in the Langfuse UI with metrics tracking.
    """
    lf = _get_client()
    if lf is None or isinstance(span, _NullSpan):
        return
    try:
-        kwargs: dict[str, Any] = {"name": prompt_name}
+        prompt = get_prompt_object(prompt_name, version=version, label=label)
-        if version is not None:
+        if prompt is not None:
-            kwargs["version"] = version
+            span.update(prompt=prompt)
        if label is not None:
            kwargs["label"] = label
        prompt = lf.get_prompt(**kwargs)
        span.update(metadata={"prompt": {"name": prompt_name, "version": prompt.version}})
    except Exception as exc:
        logger.warning("tracing: link_prompt_to_trace(%s) failed: %s", prompt_name, exc)
--- a/services/chat/app/llm.py
+++ b/services/chat/app/llm.py
@@ -0,0 +1,77 @@
 """LLM factory — centralised model instantiation via LiteLLM.
 Adapted from app/core/llm.py for the Chat Service.
 Uses shared.config.settings instead of app.config.settings.
 """
 from __future__ import annotations
 import os
 import warnings
 from openai import AsyncOpenAI
 import litellm
 from langchain_openai import ChatOpenAI
 from langchain_litellm import ChatLiteLLM
 from shared.config import settings
 litellm.drop_params = True
 warnings.filterwarnings(
    "ignore",
    message=r"PydanticSerializationUnexpectedValue\(Expected `ResponseAPIUsage`",
    category=UserWarning,
 )
 def _api_key_for_model(model: str) -> str | None:
    """Pick the configured credential that matches *model*'s provider prefix.

    Prefixes are tested in a fixed order; the first match wins. An empty
    setting is reported as ``None``. ``github_copilot/`` models never get a
    key, and any model without a recognised prefix uses the OpenAI key.
    """
    # (prefixes, settings attribute) pairs, checked top to bottom. The
    # attribute is looked up lazily so only the matching setting is read.
    prefix_to_setting = (
        (("anthropic/",), "ANTHROPIC_API_KEY"),
        (("gemini/", "google/"), "GOOGLE_API_KEY"),
        (("cerebras/",), "CEREBRAS_API_KEY"),
        (("github/",), "GITHUB_TOKEN"),
    )
    for prefixes, setting_name in prefix_to_setting:
        if model.startswith(prefixes):
            return getattr(settings, setting_name) or None
    if model.startswith("github_copilot/"):
        return None
    return settings.OPENAI_API_KEY or None
 def get_llm(
    *,
    model: str | None = None,
    temperature: float = 0,
    callbacks: list | None = None,
 ) -> ChatOpenAI | ChatLiteLLM:
    """Build a chat model client for *model* (default: ``settings.LLM_MODEL``).

    Model names containing ``/`` produce a ``ChatLiteLLM``; all other names
    produce a ``ChatOpenAI`` with the key from ``_api_key_for_model``.
    Before building the client, non-empty GitHub settings are copied into
    the process environment unless those variables are already set.
    """
    chosen_model = model if model else settings.LLM_MODEL

    # setdefault keeps any value that is already present in the environment.
    for env_name in ("GITHUB_COPILOT_TOKEN_DIR", "GITHUB_TOKEN"):
        configured = getattr(settings, env_name)
        if configured:
            os.environ.setdefault(env_name, configured)

    if "/" not in chosen_model:
        return ChatOpenAI(
            model=chosen_model,
            temperature=temperature,
            api_key=_api_key_for_model(chosen_model),
            callbacks=callbacks,
        )
    return ChatLiteLLM(model=chosen_model, temperature=temperature, callbacks=callbacks)
 async def embed(text: str) -> list[float]:
    """Return the embedding vector for *text* using ``settings.LLM_EMBED_MODEL``.

    Model names containing ``/`` (which includes ``github_copilot/...``) are
    sent through ``litellm.aembedding``; plain model names are sent to the
    OpenAI embeddings endpoint via ``AsyncOpenAI``.
    """
    model = settings.LLM_EMBED_MODEL
    # "github_copilot/" already contains "/", so one check covers both cases.
    if "/" in model:
        response = await litellm.aembedding(model=model, input=[text])
        return response.data[0]["embedding"]
    # Normalise an empty configured key to None, as _api_key_for_model does,
    # so an unset key is not forwarded to the client as an empty string.
    client = AsyncOpenAI(api_key=settings.OPENAI_API_KEY or None)
    response = await client.embeddings.create(model=model, input=text)
    return response.data[0].embedding
--- a/shared/config.py
+++ b/shared/config.py
@@ -62,6 +62,7 @@ class Settings(BaseSettings):
    ANTHROPIC_API_KEY: str = ""
    GOOGLE_API_KEY: str = ""
    CEREBRAS_API_KEY: str = ""
    GITHUB_TOKEN: str = ""
    LLM_MODEL: str = "gpt-4o"
    LLM_EMBED_MODEL: str = "text-embedding-3-small"
--- a/shared/llm.py
+++ b/shared/llm.py
@@ -33,6 +33,8 @@ def _api_key_for_model(model: str) -> str | None:
        return settings.GOOGLE_API_KEY or None
    if model.startswith("cerebras/"):
        return settings.CEREBRAS_API_KEY or None
    if model.startswith("github/"):
        return settings.GITHUB_TOKEN or None
    if model.startswith("github_copilot/"):
        return None
    return settings.OPENAI_API_KEY or None
@@ -49,6 +51,9 @@ def get_llm(
    if settings.GITHUB_COPILOT_TOKEN_DIR:
        os.environ.setdefault("GITHUB_COPILOT_TOKEN_DIR", settings.GITHUB_COPILOT_TOKEN_DIR)
    if settings.GITHUB_TOKEN:
        os.environ.setdefault("GITHUB_TOKEN", settings.GITHUB_TOKEN)
    if "/" in model:
        return ChatLiteLLM(model=model, temperature=temperature, callbacks=callbacks)