Add Langfuse observability: traces, prompt management, prompt-to-generation linking

- New app/core/langfuse_client.py: lazy singleton client, get_prompt_or_fallback()
  helper (returns raw template + prompt obj for linking), extract_usage() for token
  counts. No-ops when LANGFUSE_* env vars are not set.
- deep_agent.py: home-agent and floating-agent runs wrapped in spans; each ainvoke
  wrapped in a generation with model/input/output/usage; prompts fetched from
  Langfuse (adiuva-home-agent, adiuva-floating-agent, adiuva-floating-classifier)
  with hardcoded fallback.
- agent_runner.py: step1-classifier and step2-processor LLM calls traced; batch
  agent _run_agent_with_tools spans + generations; cloud-processor included.
  Prompts: adiuva-step1-classifier, adiuva-step2-processor, adiuva-cloud-processor.
- agent_setup.py: journey-setup span + generation per ainvoke; prompt_obj stored
  on JourneySession and reused across turns. Prompt: journey_system.
- settings.py: LANGFUSE_SECRET_KEY, LANGFUSE_PUBLIC_KEY, LANGFUSE_HOST added.
- .env.example: Langfuse section with EU/US/self-hosted host comments.
- requirements.txt: langfuse>=2.0.0.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Roberto Musso
2026-04-07 00:19:20 +02:00
parent 552b8eb305
commit 1ce1d492b0
7 changed files with 455 additions and 78 deletions

View File

@@ -16,7 +16,9 @@ from app.agents.note_agent import NOTE_TOOLS
from app.agents.project_agent import PROJECT_TOOLS
from app.agents.task_agent import TASK_TOOLS
from app.agents.timeline_agent import TIMELINE_TOOLS
from app.core.langfuse_client import extract_usage, get_langfuse, get_prompt_or_fallback
from app.core.llm import get_llm
from app.config.settings import settings
from app.core.memory_middleware import MemoryMiddleware
from app.core.ws_context import clear_tool_result_collector, execute_on_client, set_tool_result_collector
from app.db import async_session
@@ -536,17 +538,31 @@ async def _infer_floating_domain(message: str, context: dict[str, Any]) -> dict[
try:
llm = get_llm()
response = await llm.ainvoke(
[
SystemMessage(content=_FLOATING_DOMAIN_CLASSIFIER_SYSTEM),
HumanMessage(
content=(
f"Message:\n{message}\n\n"
f"Context:\n{json.dumps(classifier_context, ensure_ascii=True)}"
)
),
]
classifier_messages = [
SystemMessage(content=_FLOATING_DOMAIN_CLASSIFIER_SYSTEM),
HumanMessage(
content=(
f"Message:\n{message}\n\n"
f"Context:\n{json.dumps(classifier_context, ensure_ascii=True)}"
)
),
]
lf = get_langfuse()
_, classifier_prompt_obj = get_prompt_or_fallback(
"floating_domain_classifier", _FLOATING_DOMAIN_CLASSIFIER_SYSTEM
)
if lf:
with lf.start_as_current_observation(
as_type="generation",
name="floating-classifier",
model=settings.LLM_MODEL,
prompt=classifier_prompt_obj,
input=classifier_messages,
) as gen:
response = await llm.ainvoke(classifier_messages)
gen.update(output=_as_text(response.content), usage=extract_usage(response))
else:
response = await llm.ainvoke(classifier_messages)
parsed = _parse_json_object(_as_text(response.content))
if parsed is not None:
domain = _normalize_domain_payload(parsed, project_id)
@@ -571,8 +587,11 @@ async def _run_single_agent(
message: str,
context: dict[str, Any],
max_steps: int = 6,
langfuse_prompt: Any = None,
agent_name: str = "agent",
) -> str:
trace_id = _trace_id_from_context(context)
lf = get_langfuse()
llm = get_llm()
tools = _all_tools_for_user(user_id, trace_id)
model_context = _context_for_model(context)
@@ -591,9 +610,37 @@ async def _run_single_agent(
tool_calls_count = 0
collected: list[dict[str, Any]] = []
set_tool_result_collector(collected)
_span_ctx = (
lf.start_as_current_observation(
as_type="span",
name=agent_name,
user_id=user_id,
session_id=trace_id,
input=message,
)
if lf else None
)
_span = _span_ctx.__enter__() if _span_ctx else None
try:
for _ in range(max_steps):
_gen_ctx = (
lf.start_as_current_observation(
as_type="generation",
name=f"{agent_name}-llm",
model=settings.LLM_MODEL,
prompt=langfuse_prompt,
input=messages,
)
if lf else None
)
_gen = _gen_ctx.__enter__() if _gen_ctx else None
response: AIMessage = await llm_with_tools.ainvoke(messages)
if _gen_ctx:
_gen.update(output=_as_text(response.content), usage=extract_usage(response))
_gen_ctx.__exit__(None, None, None)
messages.append(response)
if not response.tool_calls:
@@ -605,6 +652,8 @@ async def _run_single_agent(
tool_calls_count,
len(final_text),
)
if _span:
_span.update(output=final_text)
return final_text
tool_map = {tool_def.name: tool_def for tool_def in tools}
@@ -644,9 +693,15 @@ async def _run_single_agent(
tool_calls_count,
len(final_text),
)
if _span:
_span.update(output=final_text)
return final_text
finally:
clear_tool_result_collector()
if _span_ctx:
_span_ctx.__exit__(None, None, None)
if lf:
lf.flush()
async def _run_single_agent_stream(
@@ -656,8 +711,11 @@ async def _run_single_agent_stream(
message: str,
context: dict[str, Any],
max_steps: int = 6,
langfuse_prompt: Any = None,
agent_name: str = "agent",
) -> AsyncGenerator[tuple[str, Any], None]:
trace_id = _trace_id_from_context(context)
lf = get_langfuse()
llm = get_llm()
tools = _all_tools_for_user(user_id, trace_id)
model_context = _context_for_model(context)
@@ -677,9 +735,38 @@ async def _run_single_agent_stream(
streamed_chars = 0
collected: list[dict[str, Any]] = []
set_tool_result_collector(collected)
_span_ctx = (
lf.start_as_current_observation(
as_type="span",
name=f"{agent_name}-stream",
user_id=user_id,
session_id=trace_id,
input=message,
)
if lf else None
)
_span = _span_ctx.__enter__() if _span_ctx else None
streamed_text: list[str] = []
try:
for _ in range(max_steps):
_gen_ctx = (
lf.start_as_current_observation(
as_type="generation",
name=f"{agent_name}-llm",
model=settings.LLM_MODEL,
prompt=langfuse_prompt,
input=messages,
)
if lf else None
)
_gen = _gen_ctx.__enter__() if _gen_ctx else None
response: AIMessage = await llm_with_tools.ainvoke(messages)
if _gen_ctx:
_gen.update(output=_as_text(response.content), usage=extract_usage(response))
_gen_ctx.__exit__(None, None, None)
messages.append(response)
if not response.tool_calls:
@@ -688,6 +775,7 @@ async def _run_single_agent_stream(
token = _as_text(getattr(chunk, "content", ""))
if token:
streamed_chars += len(token)
streamed_text.append(token)
emitted_any = True
yield "token", token
@@ -696,6 +784,7 @@ async def _run_single_agent_stream(
fallback_text = _as_text(response.content)
if fallback_text:
streamed_chars += len(fallback_text)
streamed_text.append(fallback_text)
yield "token", fallback_text
logger.info(
"deep_agent: run_single_agent_stream_end trace=%s user=%s tool_calls=%d response_chars=%d",
@@ -704,6 +793,8 @@ async def _run_single_agent_stream(
tool_calls_count,
streamed_chars,
)
if _span:
_span.update(output="".join(streamed_text))
return
tool_map = {tool_def.name: tool_def for tool_def in tools}
@@ -738,6 +829,7 @@ async def _run_single_agent_stream(
token = _as_text(getattr(chunk, "content", ""))
if token:
streamed_chars += len(token)
streamed_text.append(token)
yield "token", token
logger.info(
"deep_agent: run_single_agent_stream_end trace=%s user=%s tool_calls=%d response_chars=%d fallback=1",
@@ -746,17 +838,28 @@ async def _run_single_agent_stream(
tool_calls_count,
streamed_chars,
)
if _span:
_span.update(output="".join(streamed_text))
finally:
clear_tool_result_collector()
if _span_ctx:
_span_ctx.__exit__(None, None, None)
if lf:
lf.flush()
async def run_home(user_id: str, message: str, context: dict[str, Any]) -> str:
prepared_context = await _prepare_context(message, context)
system_prompt, langfuse_prompt = get_prompt_or_fallback(
"home_system", _HOME_SINGLE_AGENT_SYSTEM
)
response = await _run_single_agent(
user_id=user_id,
system_prompt=_HOME_SINGLE_AGENT_SYSTEM,
system_prompt=system_prompt,
message=message,
context=prepared_context,
langfuse_prompt=langfuse_prompt,
agent_name="home-agent",
)
return _normalize_tagged_list_lines(response, message)
@@ -764,11 +867,16 @@ async def run_home(user_id: str, message: str, context: dict[str, Any]) -> str:
async def run_floating(user_id: str, message: str, context: dict[str, Any]) -> tuple[str, dict[str, str | None]]:
prepared_context = await _prepare_context(message, context)
domain = await _infer_floating_domain(message, prepared_context)
system_prompt, langfuse_prompt = get_prompt_or_fallback(
"floating_system", _FLOATING_SINGLE_AGENT_SYSTEM
)
response = await _run_single_agent(
user_id=user_id,
system_prompt=_FLOATING_SINGLE_AGENT_SYSTEM,
system_prompt=system_prompt,
message=message,
context=prepared_context,
langfuse_prompt=langfuse_prompt,
agent_name="floating-agent",
)
sanitized = _strip_floating_markup(response)
if not sanitized and response:
@@ -782,12 +890,17 @@ async def run_home_stream(
context: dict[str, Any],
) -> AsyncGenerator[tuple[str, Any], None]:
prepared_context = await _prepare_context(message, context)
system_prompt, langfuse_prompt = get_prompt_or_fallback(
"home_system", _HOME_SINGLE_AGENT_SYSTEM
)
text_chunks: list[str] = []
async for event in _run_single_agent_stream(
user_id=user_id,
system_prompt=_HOME_SINGLE_AGENT_SYSTEM,
system_prompt=system_prompt,
message=message,
context=prepared_context,
langfuse_prompt=langfuse_prompt,
agent_name="home-agent",
):
event_type, data = event
if event_type != "token":
@@ -809,14 +922,19 @@ async def run_floating_stream(
domain = await _infer_floating_domain(message, prepared_context)
yield "floating_domain", domain
system_prompt, langfuse_prompt = get_prompt_or_fallback(
"floating_system", _FLOATING_SINGLE_AGENT_SYSTEM
)
sanitizer = _FloatingStreamSanitizer()
emitted_sanitized = False
raw_chunks: list[str] = []
async for event in _run_single_agent_stream(
user_id=user_id,
system_prompt=_FLOATING_SINGLE_AGENT_SYSTEM,
system_prompt=system_prompt,
message=message,
context=prepared_context,
langfuse_prompt=langfuse_prompt,
agent_name="floating-agent",
):
event_type, data = event
if event_type != "token":