refactor: remove storage, backup, plugin/marketplace features

- Delete app/storage/ (blob_store, vector_store, encryption)
- Delete app/marketplace/ (plugin_registry, plugin_review, revenue_share)
- Delete routes: backup.py, plugins.py, storage.py, vectors.py
- Relocate embed endpoint to POST /chat/embed
- Rewrite migration 001 (remove storage/plugin tables)
- Delete migration 002 (seed_plugins)
- Remove S3/Pinecone/Qdrant env vars from settings
- Remove storage/backup quotas from tier_manager
- Remove MinIO and Qdrant from docker-compose
- Delete tests: test_backup, test_plugins, test_storage
- Update README.md and clean .env.example
Refactor system prompt variables for clarity and consistency across agent setup and runner modules
2026-04-08 00:47:37 +02:00 · 2026-04-07 00:23:41 +02:00 · 2026-04-07 00:19:20 +02:00
114 changed files with 3302 additions and 9204 deletions
--- a/.env.example
+++ b/.env.example
@@ -4,19 +4,9 @@ ENV=dev
 # ── Database ──────────────────────────────────────────────────────────────────
 DATABASE_URL=postgresql+asyncpg://postgres:postgres@localhost:5432/adiuva
-# ── Redis ─────────────────────────────────────────────────────────────────────
+# ── Auth ──────────────────────────────────────────────────────────────────────
-REDIS_URL=redis://localhost:6379/0
+JWT_SECRET=replace-with-a-long-random-secret
-
+JWT_ALGORITHM=HS256
 # ── Auth (JWT RS256) ──────────────────────────────────────────────────────────
 # Generate keypair:
 #   openssl genpkey -algorithm RSA -out private.pem -pkeyopt rsa_keygen_bits:2048
 #   openssl rsa -in private.pem -pubout -out public.pem
 # Paste PEM content with literal \n for newlines.
 #
 # Private key — ONLY used by the Auth Service (JWT signing).
 JWT_PRIVATE_KEY=
 # Public key — used by all services / Traefik ForwardAuth (JWT verification).
 JWT_PUBLIC_KEY=
 JWT_ACCESS_TOKEN_EXPIRE_MINUTES=30
 JWT_REFRESH_TOKEN_EXPIRE_DAYS=30
@@ -27,41 +17,20 @@ OPENAI_API_KEY=
 ANTHROPIC_API_KEY=
 GOOGLE_API_KEY=
 LLM_MODEL=gpt-4o
 LLM_ROUTER_MODEL=gpt-4o-mini
 # ── Stripe (leave empty to stub billing) ──────────────────────────────────────
 STRIPE_SECRET_KEY=
 STRIPE_WEBHOOK_SECRET=
 # ── AWS / S3 ──────────────────────────────────────────────────────────────────
 S3_BUCKET=adiuva
 S3_REGION=us-east-1
 S3_ENDPOINT_URL=
 AWS_ACCESS_KEY_ID=
 AWS_SECRET_ACCESS_KEY=
 # For MinIO (homelab): S3_ENDPOINT_URL=http://minio:9000
-# ── Vector Store ──────────────────────────────────────────────────────────────
+# ── Langfuse (leave empty to disable observability) ───────────────────────────
-# Pinecone is used when PINECONE_API_KEY is set; otherwise falls back to Qdrant.
+LANGFUSE_SECRET_KEY=
-PINECONE_API_KEY=
+LANGFUSE_PUBLIC_KEY=
-PINECONE_INDEX=adiuva
+# LANGFUSE_HOST=https://cloud.langfuse.com        # EU (default)
-QDRANT_URL=
+# LANGFUSE_HOST=https://us.cloud.langfuse.com     # US
-QDRANT_API_KEY=
+# LANGFUSE_HOST=http://localhost:3000             # Self-hosted
 # For local Qdrant (homelab): QDRANT_URL=http://qdrant:6333
 # ── CORS ──────────────────────────────────────────────────────────────────────
 # Comma-separated list parsed by Settings (override default if needed)
 # CORS_ORIGINS=["app://.","http://localhost:3000"]
 # ── Langfuse (observability) ─────────────────────────────────────────────────
 LANGFUSE_SECRET_KEY=sk-lf-...
 LANGFUSE_PUBLIC_KEY=pk-lf-...
 LANGFUSE_HOST=https://cloud.langfuse.com  # or self-hosted URL
 # ── Cloudflare (Traefik ACME DNS-01 challenge) ───────────────────────────────
 CF_DNS_API_TOKEN=
 ACME_EMAIL=
 # ── PostgreSQL (used by docker-compose) ──────────────────────────────────────
 POSTGRES_USER=postgres
 POSTGRES_PASSWORD=postgres
 POSTGRES_DB=adiuva
--- a/.gitignore
+++ b/.gitignore
@@ -13,9 +13,6 @@ env/
 # Environment variables
 .env
 # Cryptographic keys
 *.pem
 # IDE
 .vscode/
 .idea/
@@ -24,6 +21,7 @@ env/
 .pytest_cache/
 htmlcov/
 .coverage
 tests/fixtures/private*/
 # Docker
 *.log
@@ -35,6 +33,3 @@ Thumbs.db
 # Claude Code
 .claude/
 logs/
 # Eval private test data
 services/batch-agent/eval/fixtures/private_data/
--- a/services/chat/Dockerfile
+++ b/services/chat/Dockerfile
@@ -3,34 +3,37 @@ FROM python:3.12-slim AS builder
 WORKDIR /build
-COPY services/chat/requirements.txt ./requirements.txt
+COPY requirements.txt .
 RUN pip install --upgrade pip && \
    pip install --no-cache-dir --prefix=/install -r requirements.txt
 # ── runtime ──────────────────────────────────────────────────────────────────
 FROM python:3.12-slim AS runtime
 # Non-root user
 RUN addgroup --system appgroup && adduser --system --ingroup appgroup appuser
 WORKDIR /app
 # Copy installed packages from builder
 COPY --from=builder /install /usr/local
-# Shared module
+# Copy application source
-COPY shared/ shared/
+COPY app/ app/
-# Service source
+# Copy Alembic migration files
-COPY services/chat/app/ app/
+COPY alembic/ alembic/
 COPY alembic.ini .
 # Ensure appuser owns the working directory
 RUN chown -R appuser:appgroup /app
 USER appuser
 EXPOSE 8000
 # Chat requests spend most of their time waiting on LLM calls — use multiple workers
 CMD ["gunicorn", "app.main:app", \
     "-k", "uvicorn.workers.UvicornWorker", \
     "--bind", "0.0.0.0:8000", \
-     "--workers", "2", \
+     "--workers", "4", \
     "--timeout", "120"]
--- a/README.md
+++ b/README.md
@@ -1,8 +1,8 @@
 # Adiuva Cloud API
-**AI-powered project management backend with E2E encrypted cloud storage, LLM orchestration, and a plugin marketplace.**
+**AI-powered project management backend with LLM orchestration and subscription billing.**
-Built with FastAPI · Python 3.12 · PostgreSQL · LangChain · Stripe · AWS S3
+Built with FastAPI · Python 3.12 · PostgreSQL · LangChain · Stripe
 ---
@@ -20,9 +20,7 @@ Built with FastAPI · Python 3.12 · PostgreSQL · LangChain · Stripe · AWS S3
 - [AI Agent System](#ai-agent-system)
 - [Orchestration & Execution Plans](#orchestration--execution-plans)
 - [Middleware](#middleware)
 - [Storage Layer](#storage-layer)
 - [Billing & Tiers](#billing--tiers)
 - [Plugin Marketplace](#plugin-marketplace)
 - [Testing](#testing)
 - [Project Structure](#project-structure)
 - [License](#license)
@@ -31,15 +29,13 @@ Built with FastAPI · Python 3.12 · PostgreSQL · LangChain · Stripe · AWS S3
 ## Overview
-Adiuva Cloud API is the FastAPI backend that powers the **Adiuva Electron desktop app**. It provides LLM-powered chat orchestration, end-to-end encrypted cloud storage, a vector search engine, an encrypted backup system, a plugin marketplace with revenue sharing, and Stripe-based subscription billing across four tiers.
+Adiuva Cloud API is the FastAPI backend that powers the **Adiuva Electron desktop app**. It provides LLM-powered chat orchestration, text embedding generation, and Stripe-based subscription billing across four tiers.
 ### Design Principles
-1. **Never persist user data in plaintext** — the database stores only auth, billing, storage metadata, and marketplace data. All user content is E2E encrypted by the client before reaching the server.
+1. **Never expose prompts** — system prompts stay server-side; responses are sanitized to strip any leaked prompt fragments.
-2. **Never expose prompts** — system prompts stay server-side; responses are sanitized to strip any leaked prompt fragments.
+2. **Stateless request handling** — all context comes from the client and JWT; no server-side session state.
-3. **Never decrypt user blobs** — the backend performs only checksum verification; no decryption keys ever reach the server.
+3. **Tier gates enforced server-side** — the server always reads the current tier from the database, never trusting client-reported values.
 4. **Stateless request handling** — all context comes from the client and JWT; no server-side session state.
 5. **Tier gates enforced server-side** — the server always reads the current tier from the database, never trusting client-reported values.
 ---
@@ -54,27 +50,26 @@ Adiuva Cloud API is the FastAPI backend that powers the **Adiuva Electron deskto
                      │  ┌──────────────────┐  ┌────────────────────────────┐  │
                      │  │  Auth Routes     │  │  Chat Routes               │  │
                      │  │  Billing Routes  │  │    ↓                       │  │
-                      │  │  Storage Routes  │  │  Orchestrator (GPT-4o-mini)│  │
+                      │  │  Agent Routes    │  │  Orchestrator (GPT-4o-mini)│  │
-                      │  │  Backup Routes   │  │    ↓ classify intent       │  │
+                      │  │  Device WS       │  │    ↓ classify intent       │  │
-                      │  │  Plugin Routes   │  │  Agent Registry            │  │
+                      │  └──────────────────┘  │  Agent Registry            │  │
-                      │  │  Vector Routes   │  │    ↓                       │  │
+                      │                        │    ↓                       │  │
-                      │  │  Plans Routes    │  │  TaskAgent  | ProjectAgent │  │
+                      │                        │  TaskAgent  | ProjectAgent │  │
-                      │  └──────────────────┘  │  NoteAgent  | CheckptAgent │  │
+                      │                        │  NoteAgent  | CheckptAgent │  │
                      │                        │  (GPT-4o + LangChain)      │  │
                      │                        └────────────────────────────┘  │
                      └────────────────────────────────────────────────────────┘
-                               │              │              │
+                               │
-                      ┌────────▼───┐  ┌───────▼───────┐  ┌──▼─────────────┐
+                      ┌────────▼───┐
-                      │ PostgreSQL │  │  AWS S3       │  │ Pinecone /     │
+                      │ PostgreSQL │
-                      │ (Auth,     │  │  (E2E blobs,  │  │ Qdrant         │
+                      │ (Auth,     │
-                      │  Billing,  │  │   backups)    │  │ (Vectors)      │
+                      │  Billing,  │
-                      │  Metadata) │  └───────────────┘  └────────────────┘
+                      │  Agents)   │
                      └────────────┘
                               │
                      ┌────────▼───┐
                      │  Stripe    │
-                      │  (Billing, │
+                      │  (Billing) │
                      │   Connect) │
                      └────────────┘
 ```
@@ -85,18 +80,14 @@ Adiuva Cloud API is the FastAPI backend that powers the **Adiuva Electron deskto
 1. **LLM-powered orchestration** — GPT-4o-mini classifies user intent and routes to the appropriate domain agent.
 2. **4 specialized AI agents** — Tasks (8 tools), Projects (6 tools), Timelines (4 tools), Notes (5 tools), all powered by GPT-4o via LangChain.
 3. **Execution plans & playbooks** — Server-side prompt template registry; clients receive only opaque template IDs, never raw prompts.
-4. **E2E encrypted cloud storage** — The backend never decrypts user data; SHA-256 checksum verification uses constant-time comparison to prevent timing attacks.
+4. **Text embeddings** — Generates text-embedding-3-small vectors for local client-side note search.
-5. **Cloud vector store** — Pinecone or Qdrant with user-isolated namespaces and encrypted blob payloads.
+5. **Stripe billing** — Four-tier subscription model (Free / Pro / Power / Team) with checkout sessions and full webhook lifecycle handling.
-6. **Encrypted backup system** — Tiered storage limits with `If-Modified-Since` support for efficient syncing.
+6. **JWT authentication** — Access + refresh tokens with bcrypt password hashing, SHA-256 token hashing, and automatic rotation.
-7. **Plugin marketplace** — Catalog, admin review/approval workflow, security checklist, and 70/30 revenue sharing via Stripe Connect.
+7. **Prompt IP protection** — Sanitizer middleware strips system prompts, reasoning markers, tool schemas, and agent routing metadata from all chat responses.
-8. **Stripe billing** — Four-tier subscription model (Free / Pro / Power / Team) with checkout sessions and full webhook lifecycle handling.
+8. **Tier-based rate limiting** — Sliding-window per-user limiter scaling from 20 to 200 requests/min by subscription tier.
-9. **JWT authentication** — Access + refresh tokens with bcrypt password hashing, SHA-256 token hashing, and automatic rotation.
+9. **WebSocket streaming** — Real-time chat with 30-second heartbeat keep-alive and chunked text delivery.
-10. **Prompt IP protection** — Sanitizer middleware strips system prompts, reasoning markers, tool schemas, and agent routing metadata from all chat responses.
+10. **Alembic migrations** — Versioned schema management.
-11. **Tier-based rate limiting** — Sliding-window per-user limiter scaling from 20 to 200 requests/min by subscription tier.
+11. **Comprehensive test suite** — In-memory SQLite, per-tier test fixtures, and full API coverage without external dependencies.
 12. **Zero-trust data model** — User content is never stored in plaintext; the database holds only authentication, billing, and metadata records.
 13. **WebSocket streaming** — Real-time chat with 30-second heartbeat keep-alive and chunked text delivery.
 14. **Alembic migrations** — Versioned schema management with seed data for the plugin marketplace.
 15. **Comprehensive test suite** — In-memory SQLite + moto S3 mocks, per-tier test fixtures, and full API coverage without external dependencies.
 ---
@@ -114,7 +105,6 @@ Adiuva Cloud API is the FastAPI backend that powers the **Adiuva Electron deskto
 | `pydantic-settings` | ≥ 2.7.0 | Environment-based configuration |
 | `python-jose[cryptography]` | ≥ 3.3.0 | JWT encoding and decoding |
 | `stripe` | ≥ 11.0.0 | Billing and payment integration |
 | `boto3` | ≥ 1.35.0 | AWS S3 client |
 | `slowapi` | ≥ 0.1.9 | Rate limiting utilities |
 | `sqlalchemy` | ≥ 2.0.0 | Async ORM and query builder |
 | `asyncpg` | ≥ 0.30.0 | PostgreSQL async driver |
@@ -124,12 +114,9 @@ Adiuva Cloud API is the FastAPI backend that powers the **Adiuva Electron deskto
 | `httpx` | ≥ 0.28.0 | Async HTTP client (used in tests) |
 | `websockets` | ≥ 14.0 | WebSocket protocol support |
 | `psycopg2-binary` | ≥ 2.9.0 | Synchronous PostgreSQL driver (Alembic) |
 | `pinecone` | ≥ 5.0.0 | Pinecone vector store client |
 | `qdrant-client` | ≥ 1.7.0 | Qdrant vector store client |
 | `pytest` | ≥ 8.0.0 | Test framework |
 | `pytest-asyncio` | ≥ 0.24.0 | Async test support |
 | `aiosqlite` | ≥ 0.20.0 | In-memory SQLite for tests |
 | `moto[s3]` | ≥ 5.0.0 | AWS S3 mock for tests |
 | `ruff` | ≥ 0.8.0 | Linter and formatter |
 ---
@@ -142,7 +129,6 @@ Adiuva Cloud API is the FastAPI backend that powers the **Adiuva Electron deskto
 - PostgreSQL 16+
 - An OpenAI API key (for LLM features)
 - Stripe API keys (optional — billing stubs gracefully when unconfigured)
 - AWS credentials (optional — needed for S3 storage in production)
 ### Installation
@@ -194,11 +180,6 @@ This starts two services:
 - **app** — FastAPI server on port `8000`
 - **db** — PostgreSQL 16 (Alpine) on port `5432` with a persistent volume and health checks
 The compose file also includes optional services for fully local deployments:
 - **minio** — S3-compatible object storage on ports `9000` (API) and `9001` (console)
 - **qdrant** — Vector search engine on ports `6333` (HTTP) and `6334` (gRPC)
 ### Dockerfile Details
 The Dockerfile uses a multi-stage build:
@@ -216,7 +197,7 @@ gunicorn app.main:app -k uvicorn.workers.UvicornWorker -w 4 --timeout 120 -b 0.0
 ## Homelab / Self-Hosted Deployment
-You can run the entire stack locally on a homelab with **no cloud dependencies except the LLM provider**. The compose file includes MinIO (S3 replacement) and Qdrant (vector store) out of the box.
+You can run the entire stack locally on a homelab with **no cloud dependencies except the LLM provider**.
 ### 1. Start all services
@@ -224,35 +205,14 @@ You can run the entire stack locally on a homelab with **no cloud dependencies e
 docker compose up -d
 ```
-This starts PostgreSQL, MinIO, and Qdrant alongside the app.
+This starts PostgreSQL alongside the app.
-### 2. Create the MinIO bucket
+### 2. Configure your `.env`
 Open the MinIO console at [http://localhost:9001](http://localhost:9001) (login: `minioadmin` / `minioadmin`) and create a bucket named `adiuva`, or use the CLI:
 ```bash
 docker compose exec minio mc alias set local http://localhost:9000 minioadmin minioadmin
 docker compose exec minio mc mb local/adiuva
 ```
 ### 3. Configure your `.env`
 ```bash
 # Database (uses the compose PostgreSQL)
 DATABASE_URL=postgresql+asyncpg://postgres:postgres@db:5432/adiuva
 # S3 → MinIO
 S3_BUCKET=adiuva
 S3_REGION=us-east-1
 S3_ENDPOINT_URL=http://minio:9000
 AWS_ACCESS_KEY_ID=minioadmin
 AWS_SECRET_ACCESS_KEY=minioadmin
 # Vector store → local Qdrant (leave PINECONE_API_KEY empty)
 QDRANT_URL=http://qdrant:6333
 QDRANT_API_KEY=
 PINECONE_API_KEY=
 # Billing — leave empty to stub (no Stripe needed)
 STRIPE_SECRET_KEY=
 STRIPE_WEBHOOK_SECRET=
@@ -267,7 +227,7 @@ JWT_SECRET=your-secret-here
 ENV=dev
 ```
-### 4. Run migrations
+### 3. Run migrations
 ```bash
 docker compose exec app alembic upgrade head
@@ -278,9 +238,7 @@ docker compose exec app alembic upgrade head
 | Service | Runs on | Port | Notes |
 |---|---|---|---|
 | FastAPI app | Docker | 8000 | API server |
-| PostgreSQL | Docker | 5432 | Auth, billing, metadata |
+| PostgreSQL | Docker | 5432 | Auth, billing, agents |
 | MinIO | Docker | 9000 / 9001 | S3-compatible blob & backup storage |
 | Qdrant | Docker | 6333 / 6334 | Vector search (replaces Pinecone) |
 | Stripe | — | — | Stubbed when keys are empty |
 | OpenAI / LLM | Cloud | — | Only external dependency |
@@ -300,17 +258,7 @@ All variables are loaded from a `.env` file via Pydantic Settings. Source: `app/
 | `JWT_ACCESS_TOKEN_EXPIRE_MINUTES` | `int` | `30` | Access token time-to-live |
 | `JWT_REFRESH_TOKEN_EXPIRE_DAYS` | `int` | `30` | Refresh token time-to-live |
 | `STRIPE_SECRET_KEY` | `str` | `""` | Stripe API key (empty = stub mode) |
-| `STRIPE_WEBHOOK_SECRET` | `str` | `""` | Stripe webhook signature secret |
+| `STRIPE_WEBHOOK_SECRET` | `str` | `""` | Stripe webhook signature secret |
+| `OPENAI_API_KEY` | `str` | `""` | OpenAI key for LLM agent calls |
 | `S3_BUCKET` | `str` | `""` | S3 bucket for encrypted blobs and backups |
 | `S3_REGION` | `str` | `us-east-1` | AWS region |
 | `S3_ENDPOINT_URL` | `str` | `""` | Custom S3 endpoint (e.g. `http://minio:9000` for MinIO). Leave empty for AWS. |
 | `AWS_ACCESS_KEY_ID` | `str` | `""` | AWS credentials |
 | `AWS_SECRET_ACCESS_KEY` | `str` | `""` | AWS credentials |
 | `PINECONE_API_KEY` | `str` | `""` | Pinecone API key (if set, Pinecone is used for vectors) |
 | `PINECONE_INDEX` | `str` | `adiuva` | Pinecone index name |
 | `QDRANT_URL` | `str` | `""` | Qdrant URL (used when Pinecone is not configured) |
 | `QDRANT_API_KEY` | `str` | `""` | Qdrant API key |
 | `OPENAI_API_KEY` | `str` | `""` | OpenAI key for LLM agent calls |
 | `LLM_MODEL` | `str` | `gpt-4o` | LiteLLM model identifier for agents (e.g. `anthropic/claude-3.5-sonnet`, `gemini/gemini-pro`, `ollama/llama3`) |
 | `LLM_ROUTER_MODEL` | `str` | `gpt-4o-mini` | Lighter model used for intent classification / routing |
 | `CORS_ORIGINS` | `list[str]` | `["app://.", "http://localhost:3000", "http://localhost:5173"]` | Allowed CORS origins |
@@ -342,6 +290,7 @@ All routes are prefixed with `/api/v1`. **27 endpoints** total (25 REST + 1 WebS
 | Method | Path | Auth | Description |
 |---|---|---|---|
 | `POST` | `/api/v1/chat` | JWT | Route message through the orchestrator; returns `ChatResponse` or `ExecutionPlan` depending on execution mode |
 | `POST` | `/api/v1/chat/embed` | JWT | Generate a 1536-dim text embedding vector (`text-embedding-3-small`). Used by Electron for local note search. |
 | `WS` | `/api/v1/chat/stream` | JWT (query param `?token=`) | Streaming chat — first frame is a `ChatRequest`, server yields text chunks, final frame is `{"done": true, "response": "...", "actions": [...]}`. 30-second heartbeat ping. |
 ### Plans
@@ -351,42 +300,6 @@ All routes are prefixed with `/api/v1`. **27 endpoints** total (25 REST + 1 WebS
 | `GET` | `/api/v1/plans/playbook` | JWT | List all cached execution plan playbooks |
 | `GET` | `/api/v1/plans/playbook/{plan_id}` | JWT | Retrieve a specific playbook by ID |
 ### Storage (Cloud Records)
 | Method | Path | Auth | Description |
 |---|---|---|---|
 | `POST` | `/api/v1/storage/records` | JWT | Upload an E2E encrypted record (verifies checksum, enforces storage quota) |
 | `GET` | `/api/v1/storage/records` | JWT | List record metadata with pagination (`?table`, `?page`, `?limit`); no blob bytes returned |
 | `GET` | `/api/v1/storage/records/{id}` | JWT | Download encrypted blob with `X-Checksum` response header |
 | `PUT` | `/api/v1/storage/records/{id}` | JWT | Replace an existing blob (verifies checksum, enforces quota) |
 | `DELETE` | `/api/v1/storage/records/{id}` | JWT | Delete a record and its S3 blob |
 ### Vectors (Cloud Vector Store)
 | Method | Path | Auth | Description |
 |---|---|---|---|
 | `POST` | `/api/v1/storage/vectors/upsert` | JWT | Verify checksums and upsert encrypted vectors |
 | `POST` | `/api/v1/storage/vectors/search` | JWT | Search user-scoped vector namespace |
 | `DELETE` | `/api/v1/storage/vectors` | JWT | Delete vectors by ID list |
 ### Backup
 | Method | Path | Auth | Description |
 |---|---|---|---|
 | `PUT` | `/api/v1/backup` | JWT | Upload encrypted backup blob with custom headers (`X-Backup-Version`, `X-Backup-Timestamp`, `X-Backup-Checksum`). Tier quota enforced. |
 | `GET` | `/api/v1/backup` | JWT | Download latest backup blob. Supports `If-Modified-Since`. |
 | `GET` | `/api/v1/backup/history` | JWT | List backup metadata (no blob content) |
 | `DELETE` | `/api/v1/backup/{backup_id}` | JWT | Delete a specific backup |
 ### Plugins (Marketplace)
 | Method | Path | Auth | Description |
 |---|---|---|---|
 | `GET` | `/api/v1/plugins` | JWT (Power+) | Browse the marketplace (`?category`, `?q`, `?page`, `?sort=rating\|installs\|newest`) |
 | `GET` | `/api/v1/plugins/{id}` | JWT (Power+) | Plugin detail with install count and ratings |
 | `POST` | `/api/v1/plugins/{id}/install` | JWT (Power+) | Install plugin; triggers Stripe Connect revenue split for paid plugins |
 | `DELETE` | `/api/v1/plugins/{id}/install` | JWT | Uninstall plugin |
 ### Billing
 | Method | Path | Auth | Description |
@@ -400,7 +313,7 @@ All routes are prefixed with `/api/v1`. **27 endpoints** total (25 REST + 1 WebS
 ## Data Model
-9 tables managed by Alembic migrations. Source: `app/models.py`
+3 tables managed by Alembic migrations. Source: `app/models.py`
 ### Tables
@@ -409,27 +322,18 @@ All routes are prefixed with `/api/v1`. **27 endpoints** total (25 REST + 1 WebS
 | `users` | `id` (UUID) | `email` (unique), `password_hash`, `tier`, `stripe_customer_id`, timestamps | User accounts |
 | `refresh_tokens` | `id` (UUID) | `user_id` (FK), `token_hash` (SHA-256, unique), `expires_at` | Hashed refresh tokens for rotation |
 | `subscriptions` | `id` (UUID) | `user_id` (FK, unique), `stripe_subscription_id`, `tier`, `status`, `current_period_end` | Stripe subscription records |
 | `storage_records` | `id` (UUID) | `user_id` (FK), `table_name`, `s3_key`, `checksum`, `size_bytes`, timestamps | S3 blob metadata (no plaintext content) |
 | `backup_metadata` | `id` (UUID) | `user_id` (FK), `s3_key`, `version`, `timestamp`, `checksum`, `size_bytes` | Backup manifests |
 | `plugins` | `id` (String) | `name`, `description`, `version`, `author_id` (FK), `category`, `price_cents`, `permissions` (JSON), `status`, `s3_package_key`, `install_count`, `avg_rating` | Marketplace plugin catalog |
 | `plugin_installations` | `id` (UUID) | `plugin_id` (FK), `user_id` (FK), unique constraint on (`plugin_id`, `user_id`) | Per-user install tracking |
 | `plugin_reviews` | `id` (UUID) | `plugin_id` (FK), `reviewer_id` (FK), `decision`, `notes`, `reviewed_at` | Admin review decisions |
 | `revenue_events` | `id` (UUID) | `plugin_id` (FK), `user_id` (FK), `amount_cents`, `developer_share_cents`, `stripe_transfer_id` | 70/30 revenue split ledger |
 ### Enum Types
 | Enum | Values |
 |---|---|
 | `billing_tier` | `free`, `pro`, `power`, `team` |
 | `plugin_status` | `pending_review`, `approved`, `rejected` |
 | `review_decision` | `approved`, `rejected` |
 ### Migrations
 | Version | Description |
 |---|---|
-| `001_initial_schema` | Creates all 9 tables with indexes and foreign key constraints |
+| `001_initial_schema` | Creates core auth and billing tables with indexes and foreign key constraints |
 | `002_seed_plugins` | Seeds 3 approved plugins: GitHub Sync (free), Slack Notifier (€4.99), Time Tracker (€9.99) |
 ---
@@ -439,7 +343,7 @@ The agent system uses a registry pattern with LangChain tool-calling agents powe
 ### Architecture
- **`BaseAgent`** — Abstract base with `user_id`, `shared_memory`, and `vector_store_context`.
+- **`BaseAgent`** — Abstract base with `user_id` and `shared_memory`.
 - **`ChatAgent(BaseAgent)`** — Abstract `handle(query, context)` and `get_tools()` methods, plus a shared `_tool_loop(llm, messages, tools, max_iter=5)` for iterative tool calling.
 - **`AgentRegistry`** — Singleton registry with `@register` decorator, `get(name)`, `list_agents()`, and `call_agent(name, query, context)`.
@@ -554,39 +458,6 @@ Source: `app/api/middleware/sanitizer.py`
 - Scans JSON response bodies and replaces leaked prompt IP fragments with `[REDACTED]`.
 - Detects: system prompt openers, agent routing metadata, LangChain tool schemas, internal reasoning markers (`<thinking>`, `[INST]`), and known prompt fingerprints.
 - Logs sanitization events as `WARNING`.
 - Binary responses (storage, backup) are never touched.
 ---
 ## Storage Layer
 ### Blob Store
 Source: `app/storage/blob_store.py`
 - S3-backed storage for E2E encrypted blobs.
 - Object keys follow the pattern: `{user_id}/{table}/{record_id}`
 - Server-side SSE-S3 encryption at rest (additional layer on top of client-side E2E encryption).
 - Methods: `upload()`, `download()`, `delete()` (idempotent), `list_keys()`
 - The backend **never inspects or decrypts blob content**.
 ### Vector Store
 Source: `app/storage/vector_store.py`
 - Runtime-configurable: **Pinecone** (when `PINECONE_API_KEY` is set) or **Qdrant** (fallback).
 - User isolation: Pinecone uses `namespace=user_id`; Qdrant filters by `user_id` payload field.
 - 32-dimensional SHA-256-derived float vectors (deterministic, not semantically meaningful on encrypted data — a documented trade-off for privacy).
 - Encrypted blobs are stored as base64 in metadata/payload for verbatim retrieval.
 - Methods: `upsert()`, `search()`, `delete()`
 ### Encryption Utilities
 Source: `app/storage/encryption.py`
 - `verify_checksum(blob, checksum)` — SHA-256 hash comparison using `hmac.compare_digest` (constant-time to prevent timing attacks).
 - `reject_if_tampered(blob, checksum)` — Raises HTTP 400 on checksum mismatch.
 - **No decryption key ever reaches the backend.**
 ---
@@ -600,11 +471,8 @@ Source: `app/billing/stripe_service.py`, `app/billing/tier_manager.py`
 |---|---|---|---|---|
 | AI Agents | 3 | Unlimited | Unlimited | Unlimited |
 | Batch Active | 2 | 10 | Unlimited | Unlimited |
 | Cloud Storage | 0 GB | 5 GB | 25 GB | Unlimited |
 | Backup Storage | 0 GB | 5 GB | 25 GB | Unlimited |
 | LLM Providers | 1 | Unlimited | Unlimited | Unlimited |
 | Batch Builder | — | — | ✓ | ✓ |
 | Plugin Marketplace | — | — | ✓ | ✓ |
 | SSO | — | — | — | ✓ |
 | Rate Limit | 20 req/min | 60 req/min | 120 req/min | 200 req/min |
@@ -620,47 +488,6 @@ Source: `app/billing/stripe_service.py`, `app/billing/tier_manager.py`
 - `get_tier(user_id)` — Returns the user's current billing tier.
 - `check_feature(tier, feature)` — Boolean feature gate check.
 - `require_feature(tier, feature)` — Raises HTTP 403 if the feature is not available.
 - `enforce_quota(user_id, tier)` / `enforce_backup_quota(user_id, tier)` — Raises HTTP 402 if storage limits are exceeded.
 ---
 ## Plugin Marketplace
 Source: `app/marketplace/`
 ### Plugin Registry
 - PostgreSQL-backed catalog of submitted and approved plugins.
 - `list_plugins(db, category, query, page, sort)` — Paginated listing (page size: 20) with optional filtering by category, text search, and sorting by `rating`, `installs`, or `newest`.
 - `get_plugin(db, plugin_id)` — Full manifest with install count and ratings.
 - `submit_plugin(db, manifest, s3_key)` — Submits a plugin with `pending_review` status.
 - `approve_plugin()` / `reject_plugin(reason)` — Admin workflow for plugin approval.
 - `record_install()` / `record_uninstall()` — Tracks per-user installations and updates install counts.
 ### Review Queue
 - Automated security checklist before human review:
  - Plugin ID must match `^[a-z0-9-]+$`
  - Permissions must be from the allowed set only
  - No binary blobs in the manifest
 - **Allowed permissions:** `read:tasks`, `write:tasks`, `read:projects`, `write:projects`, `read:notes`, `write:notes`, `read:timelines`, `write:timelines`, `read:calendar`, `write:calendar`
 - `get_pending(db)` — Lists plugins awaiting review.
 - `submit_review(db, plugin_id, reviewer_id, decision, notes)` — Records the review decision.
 ### Revenue Sharing
 - **70% developer / 30% platform** split on all paid plugin sales.
 - `record_install(db, plugin_id, user_id, amount_cents)` — Records the revenue event and triggers a Stripe Connect transfer for the developer share.
 - `get_earnings(db, developer_id, period)` — Aggregated earnings report for plugin developers.
 - Gracefully stubs transfers when Stripe is not configured.
 ### Seed Plugins
 | Plugin | Category | Price |
 |---|---|---|
 | GitHub Sync | Productivity | Free |
 | Slack Notifier | Communication | €4.99 |
 | Time Tracker | Productivity | €9.99 |
 ---
@@ -682,10 +509,8 @@ pytest -v
 ### Test Infrastructure
 - **Database:** Async SQLite in-memory via `aiosqlite` + `StaticPool` — fast, no PostgreSQL needed.
 - **S3 mock:** `moto[s3]` with a fixture that patches `BlobStore` settings.
 - **Auth helpers:** `make_jwt(tier)` and `auth_header(tier)` generate per-tier test tokens.
 - **Seed data:** Auto-creates one `User` + `Subscription` per tier (free/pro/power/team) before each test.
 - **Plugin seeds:** Fixture adds 3 approved plugins for marketplace tests.
 - **FK enforcement:** SQLite `PRAGMA foreign_keys=ON`.
 - **No external dependencies** — all tests run fully offline.
@@ -694,13 +519,6 @@ pytest -v
 | File | Coverage |
 |---|---|
 | `test_auth.py` | Register, login, token access, refresh, expiration |
 | `test_orchestrator.py` | Intent classification, single agent routing, pipeline, plan mode |
 | `test_agents.py` | Each agent with mocked LLM: registration, tools, handle method |
 | `test_storage.py` | Create, list, download, update, delete records; checksum rejection; quota enforcement |
 | `test_backup.py` | Upload, download, history, delete; tier-based storage limits |
 | `test_plugins.py` | List, install, uninstall, revenue events, tier gate enforcement |
 | `test_agent_registry.py` | Registry singleton, registration, lookup, listing |
 | `test_execution_plan.py` | Plan builder, template registry, plan cache |
 | `test_middleware.py` | Rate limiting by tier, sanitizer prompt leak detection |
 ---
@@ -710,7 +528,6 @@ pytest -v
 ```
 adiuva-api/
 ├── alembic.ini                  # Alembic configuration
 ├── BACKEND_PLAN.md              # Architecture & design decisions
 ├── docker-compose.yml           # Docker Compose (app + PostgreSQL)
 ├── Dockerfile                   # Multi-stage production build
 ├── requirements.txt             # Python dependencies
@@ -719,13 +536,12 @@ adiuva-api/
 │   ├── env.py                   # Alembic environment config
 │   ├── script.py.mako           # Migration template
 │   └── versions/
-│       ├── 001_initial_schema.py    # Tables, indexes, FKs
+│       └── 001_initial_schema.py    # Tables, indexes, FKs
 │       └── 002_seed_plugins.py      # Seed marketplace plugins
 │
 ├── app/                         # Application source
 │   ├── main.py                  # FastAPI app factory, middleware, routes
 │   ├── db.py                    # Async SQLAlchemy engine & session
-│   ├── models.py                # SQLAlchemy ORM models (9 tables)
+│   ├── models.py                # SQLAlchemy ORM models
 │   ├── schemas.py               # Pydantic request/response schemas
 │   │
 │   ├── config/
@@ -739,48 +555,30 @@ adiuva-api/
 │   │
 │   ├── core/                    # Orchestration engine
 │   │   ├── agent_registry.py    # BaseAgent, ChatAgent, AgentRegistry
-│   │   ├── llm.py               # LiteLLM factory (get_llm)
+│   │   ├── llm.py               # LiteLLM factory (get_llm, get_router_llm)
-│   │   ├── orchestrator.py      # Intent classification & routing
+│   │   └── deep_agent.py        # Deep agent orchestration
 │   │   └── execution_plan.py    # Plan builder, templates, cache
 │   │
 │   ├── api/                     # HTTP layer
 │   │   ├── deps.py              # Shared FastAPI dependencies
 │   │   ├── middleware/
 │   │   │   ├── auth.py          # JWT validation, live tier lookup
 │   │   │   ├── rate_limit.py    # Sliding-window tier rate limiter
 │   │   │   └── sanitizer.py     # Prompt IP leak protection
 │   │   └── routes/
 │   │       ├── auth.py          # Register, login, refresh, me
-│   │       ├── chat.py          # Chat + WebSocket streaming
+│   │       ├── chat.py          # Chat + embed endpoint
-│   │       ├── plans.py         # Execution plan playbooks
+│   │       ├── billing.py       # Stripe checkout, webhooks, subscription
-│   │       ├── storage.py       # E2E encrypted record CRUD
+│   │       ├── agents.py        # Agent catalog, config, runs
-│   │       ├── vectors.py       # Vector upsert, search, delete
+│   │       └── device_ws.py     # Persistent device WebSocket
 │   │       ├── backup.py        # Encrypted backup management
 │   │       ├── plugins.py       # Marketplace browse & install
 │   │       └── billing.py       # Stripe checkout & webhooks
 │   │
-│   ├── storage/                 # Storage backends
+│   └── billing/
-│   │   ├── blob_store.py        # S3 blob storage
+│       ├── stripe_service.py    # Stripe API wrapper
-│   │   ├── vector_store.py      # Pinecone / Qdrant vector store
+│       └── tier_manager.py      # Feature matrix, rate limits
 │   │   └── encryption.py        # Checksum verification utilities
 │   │
 │   ├── billing/                 # Subscription management
 │   │   ├── stripe_service.py    # Stripe API integration
 │   │   └── tier_manager.py      # Feature matrix & quota enforcement
 │   │
 │   └── marketplace/             # Plugin ecosystem
 │       ├── plugin_registry.py   # Catalog CRUD & search
 │       ├── plugin_review.py     # Security checklist & review queue
 │       └── revenue_share.py     # 70/30 split & Stripe Connect
 │
 └── tests/                       # Test suite
-    ├── conftest.py              # Fixtures: DB, S3, auth, seeds
+    ├── conftest.py              # Fixtures: DB, auth, seeds
    ├── test_auth.py
    ├── test_orchestrator.py
    ├── test_agents.py
    ├── test_storage.py
    ├── test_backup.py
    ├── test_plugins.py
    ├── test_agent_registry.py
    ├── test_execution_plan.py
    └── test_middleware.py
--- a/alembic/versions/001_initial_schema.py
+++ b/alembic/versions/001_initial_schema.py
@@ -1,5 +1,4 @@
-"""Initial schema: users, refresh_tokens, subscriptions, storage_records,
+"""Initial schema: users, refresh_tokens, subscriptions.
 backup_metadata, plugins, plugin_installations, plugin_reviews, revenue_events.
 Revision ID: 001
 Revises:
@@ -28,18 +27,6 @@ def upgrade() -> None:
        EXCEPTION WHEN duplicate_object THEN NULL;
        END $$;
    """)
    op.execute("""
        DO $$ BEGIN
            CREATE TYPE plugin_status AS ENUM ('pending_review', 'approved', 'rejected');
        EXCEPTION WHEN duplicate_object THEN NULL;
        END $$;
    """)
    op.execute("""
        DO $$ BEGIN
            CREATE TYPE review_decision AS ENUM ('approved', 'rejected');
        EXCEPTION WHEN duplicate_object THEN NULL;
        END $$;
    """)
    # ── users ─────────────────────────────────────────────────────────────
    op.create_table(
@@ -88,122 +75,10 @@ def upgrade() -> None:
    op.create_index("ix_subscriptions_user_id", "subscriptions", ["user_id"])
    op.create_index("ix_subscriptions_stripe_id", "subscriptions", ["stripe_subscription_id"])
    # ── storage_records ───────────────────────────────────────────────────
    op.create_table(
        "storage_records",
        sa.Column("id", postgresql.UUID(as_uuid=False), nullable=False),
        sa.Column("user_id", postgresql.UUID(as_uuid=False), nullable=False),
        sa.Column("table_name", sa.String(100), nullable=False),
        sa.Column("s3_key", sa.String(500), nullable=False),
        sa.Column("checksum", sa.String(64), nullable=False),
        sa.Column("size_bytes", sa.Integer, nullable=False),
        sa.Column("created_at", sa.DateTime(timezone=True), nullable=False, server_default=sa.text("now()")),
        sa.Column("updated_at", sa.DateTime(timezone=True), nullable=False, server_default=sa.text("now()")),
        sa.PrimaryKeyConstraint("id"),
        sa.ForeignKeyConstraint(["user_id"], ["users.id"], ondelete="CASCADE"),
    )
    op.create_index("ix_storage_records_user_id", "storage_records", ["user_id"])
    # ── backup_metadata ───────────────────────────────────────────────────
    op.create_table(
        "backup_metadata",
        sa.Column("id", postgresql.UUID(as_uuid=False), nullable=False),
        sa.Column("user_id", postgresql.UUID(as_uuid=False), nullable=False),
        sa.Column("s3_key", sa.String(500), nullable=False),
        sa.Column("version", sa.Integer, nullable=False),
        sa.Column("timestamp", sa.BigInteger, nullable=False),
        sa.Column("checksum", sa.String(64), nullable=False),
        sa.Column("size_bytes", sa.Integer, nullable=False),
        sa.Column("created_at", sa.DateTime(timezone=True), nullable=False, server_default=sa.text("now()")),
        sa.PrimaryKeyConstraint("id"),
        sa.ForeignKeyConstraint(["user_id"], ["users.id"], ondelete="CASCADE"),
    )
    op.create_index("ix_backup_metadata_user_id", "backup_metadata", ["user_id"])
    # ── plugins ───────────────────────────────────────────────────────────
    op.create_table(
        "plugins",
        sa.Column("id", sa.String(255), nullable=False),
        sa.Column("name", sa.String(255), nullable=False),
        sa.Column("description", sa.Text, nullable=False, server_default=""),
        sa.Column("version", sa.String(50), nullable=False, server_default="1.0.0"),
        sa.Column("author_id", postgresql.UUID(as_uuid=False), nullable=True),
        sa.Column("author_name", sa.String(255), nullable=False, server_default=""),
        sa.Column("category", sa.String(100), nullable=False, server_default=""),
        sa.Column("price_cents", sa.Integer, nullable=False, server_default="0"),
        sa.Column("permissions", sa.Text, nullable=False, server_default="[]"),
        sa.Column("status", postgresql.ENUM("pending_review", "approved", "rejected", name="plugin_status", create_type=False), nullable=False, server_default="pending_review"),
        sa.Column("s3_package_key", sa.String(500), nullable=True),
        sa.Column("install_count", sa.Integer, nullable=False, server_default="0"),
        sa.Column("avg_rating", sa.Float, nullable=False, server_default="0.0"),
        sa.Column("rejection_reason", sa.Text, nullable=True),
        sa.Column("submitted_at", sa.DateTime(timezone=True), nullable=False, server_default=sa.text("now()")),
        sa.Column("created_at", sa.DateTime(timezone=True), nullable=False, server_default=sa.text("now()")),
        sa.PrimaryKeyConstraint("id"),
        sa.ForeignKeyConstraint(["author_id"], ["users.id"], ondelete="SET NULL"),
    )
    # ── plugin_installations ──────────────────────────────────────────────
    op.create_table(
        "plugin_installations",
        sa.Column("id", postgresql.UUID(as_uuid=False), nullable=False),
        sa.Column("plugin_id", sa.String(255), nullable=False),
        sa.Column("user_id", postgresql.UUID(as_uuid=False), nullable=False),
        sa.Column("installed_at", sa.DateTime(timezone=True), nullable=False, server_default=sa.text("now()")),
        sa.PrimaryKeyConstraint("id"),
        sa.ForeignKeyConstraint(["plugin_id"], ["plugins.id"], ondelete="CASCADE"),
        sa.ForeignKeyConstraint(["user_id"], ["users.id"], ondelete="CASCADE"),
        sa.UniqueConstraint("plugin_id", "user_id", name="uq_plugin_user"),
    )
    op.create_index("ix_plugin_installations_plugin_id", "plugin_installations", ["plugin_id"])
    op.create_index("ix_plugin_installations_user_id", "plugin_installations", ["user_id"])
    # ── plugin_reviews ────────────────────────────────────────────────────
    op.create_table(
        "plugin_reviews",
        sa.Column("id", postgresql.UUID(as_uuid=False), nullable=False),
        sa.Column("plugin_id", sa.String(255), nullable=False),
        sa.Column("reviewer_id", postgresql.UUID(as_uuid=False), nullable=True),
        sa.Column("decision", postgresql.ENUM("approved", "rejected", name="review_decision", create_type=False), nullable=False),
        sa.Column("notes", sa.Text, nullable=True),
        sa.Column("reviewed_at", sa.DateTime(timezone=True), nullable=False, server_default=sa.text("now()")),
        sa.Column("created_at", sa.DateTime(timezone=True), nullable=False, server_default=sa.text("now()")),
        sa.PrimaryKeyConstraint("id"),
        sa.ForeignKeyConstraint(["plugin_id"], ["plugins.id"], ondelete="CASCADE"),
        sa.ForeignKeyConstraint(["reviewer_id"], ["users.id"], ondelete="SET NULL"),
    )
    op.create_index("ix_plugin_reviews_plugin_id", "plugin_reviews", ["plugin_id"])
    # ── revenue_events ────────────────────────────────────────────────────
    op.create_table(
        "revenue_events",
        sa.Column("id", postgresql.UUID(as_uuid=False), nullable=False),
        sa.Column("plugin_id", sa.String(255), nullable=False),
        sa.Column("user_id", postgresql.UUID(as_uuid=False), nullable=False),
        sa.Column("amount_cents", sa.Integer, nullable=False, server_default="0"),
        sa.Column("developer_share_cents", sa.Integer, nullable=False, server_default="0"),
        sa.Column("stripe_transfer_id", sa.String(255), nullable=True),
        sa.Column("paid_at", sa.DateTime(timezone=True), nullable=True),
        sa.Column("created_at", sa.DateTime(timezone=True), nullable=False, server_default=sa.text("now()")),
        sa.PrimaryKeyConstraint("id"),
        sa.ForeignKeyConstraint(["plugin_id"], ["plugins.id"], ondelete="CASCADE"),
        sa.ForeignKeyConstraint(["user_id"], ["users.id"], ondelete="CASCADE"),
    )
    op.create_index("ix_revenue_events_plugin_id", "revenue_events", ["plugin_id"])
    op.create_index("ix_revenue_events_user_id", "revenue_events", ["user_id"])
 def downgrade() -> None:
    """Revert the initial schema.

    Tables are dropped children-first (rows that reference ``plugins`` and
    ``users`` go before the tables they point at), then the PostgreSQL enum
    types are removed with ``IF EXISTS`` so a partial schema does not fail.
    """
    tables_in_drop_order = (
        "revenue_events",
        "plugin_reviews",
        "plugin_installations",
        "plugins",
        "backup_metadata",
        "storage_records",
        "subscriptions",
        "refresh_tokens",
        "users",
    )
    for table_name in tables_in_drop_order:
        op.drop_table(table_name)

    drop_type_statements = (
        "DROP TYPE IF EXISTS review_decision",
        "DROP TYPE IF EXISTS plugin_status",
        "DROP TYPE IF EXISTS billing_tier",
    )
    for statement in drop_type_statements:
        op.execute(statement)
--- a/alembic/versions/002_seed_plugins.py
+++ b/alembic/versions/002_seed_plugins.py
@@ -1,92 +0,0 @@
 """Seed approved plugins: GitHub Sync, Slack Notifier, Time Tracker.
 Revision ID: 002
 Revises: 001
 Create Date: 2026-03-03
 """
 from __future__ import annotations
 import json
 from datetime import datetime, timezone
 from typing import Sequence, Union
 import sqlalchemy as sa
 from alembic import op
 # Alembic revision identifiers: this migration is "002" and follows "001".
 revision: str = "002"
 down_revision: Union[str, None] = "001"
 # No branch label and no cross-branch dependency for this revision.
 branch_labels: Union[str, Sequence[str], None] = None
 depends_on: Union[str, Sequence[str], None] = None
 # Rows inserted into the ``plugins`` table by ``upgrade()``.
 # Each dict supplies one value per column declared in ``upgrade()``:
 #   - ``price_cents`` is an integer amount (0 for the free plugin),
 #   - ``permissions`` is a JSON-encoded list of permission strings,
 #   - every row is seeded with status "approved" and zeroed counters.
 _SEED_PLUGINS = [
    {
        "id": "plugin-github-sync",
        "name": "GitHub Sync",
        "description": "Sync tasks with GitHub Issues and pull requests.",
        "version": "1.0.0",
        "author_name": "Adiuva",
        "category": "productivity",
        "price_cents": 0,
        "permissions": json.dumps(["read:tasks", "write:tasks"]),
        "status": "approved",
        "s3_package_key": "plugins/plugin-github-sync/1.0.0/package.zip",
        "install_count": 0,
        "avg_rating": 0.0,
    },
    {
        "id": "plugin-slack-notify",
        "name": "Slack Notifier",
        "description": "Post task and timeline updates to Slack channels.",
        "version": "1.2.0",
        "author_name": "Adiuva",
        "category": "communication",
        "price_cents": 499,
        "permissions": json.dumps(["read:tasks", "read:timelines"]),
        "status": "approved",
        "s3_package_key": "plugins/plugin-slack-notify/1.2.0/package.zip",
        "install_count": 0,
        "avg_rating": 0.0,
    },
    {
        "id": "plugin-time-tracker",
        "name": "Time Tracker",
        "description": "Track time spent on tasks with automatic reporting.",
        "version": "0.9.1",
        "author_name": "Third Party",
        "category": "productivity",
        "price_cents": 999,
        "permissions": json.dumps(["read:tasks", "write:tasks"]),
        "status": "approved",
        "s3_package_key": "plugins/plugin-time-tracker/0.9.1/package.zip",
        "install_count": 0,
        "avg_rating": 0.0,
    },
 ]
 def upgrade() -> None:
    """Bulk-insert the seed rows from ``_SEED_PLUGINS`` into ``plugins``.

    Only the columns populated by the seed data are declared on the
    lightweight table construct used for the insert.
    """
    status_type = sa.Enum(
        "pending_review", "approved", "rejected", name="plugin_status"
    )
    # (column name, column type) pairs, in the order they are declared.
    column_specs = (
        ("id", sa.String),
        ("name", sa.String),
        ("description", sa.Text),
        ("version", sa.String),
        ("author_name", sa.String),
        ("category", sa.String),
        ("price_cents", sa.Integer),
        ("permissions", sa.Text),
        ("status", status_type),
        ("s3_package_key", sa.String),
        ("install_count", sa.Integer),
        ("avg_rating", sa.Float),
    )
    seed_columns = [sa.column(col_name, col_type) for col_name, col_type in column_specs]
    plugins_table = sa.table("plugins", *seed_columns)
    op.bulk_insert(plugins_table, _SEED_PLUGINS)
 def downgrade() -> None:
    """Remove the three seeded plugin rows, matched by their fixed ids."""
    delete_seed_rows = (
        "DELETE FROM plugins WHERE id IN ("
        "'plugin-github-sync', 'plugin-slack-notify', 'plugin-time-tracker'"
        ")"
    )
    op.execute(delete_seed_rows)
--- a/services/auth/app/init.py
+++ b/services/auth/app/init.py
--- a/app/agents/init.py
+++ b/app/agents/init.py
@@ -0,0 +1,5 @@
 """Expose tool modules used by deep orchestrator-worker graphs."""
 from app.agents import filesystem_agent, timeline_agent, note_agent, project_agent, task_agent
 __all__ = ["filesystem_agent", "timeline_agent", "note_agent", "project_agent", "task_agent"]
--- a/services/batch-agent/app/agents/filesystem_agent.py
+++ b/services/batch-agent/app/agents/filesystem_agent.py
@@ -1,6 +1,8 @@
 """Filesystem agent — tools for reading local directories and files on Electron.
-Adapted for Batch Agent Service: import from app.ws_context.
+These tools delegate to the Electron client via ``execute_on_client()`` using
 the same WS tool-call round-trip pattern as CRUD tools.  The Electron app
 handles actual disk I/O and responds with ``tool_result`` frames.
 """
 from __future__ import annotations
@@ -9,7 +11,7 @@ from typing import Any
 from langchain_core.tools import tool
-from shared.ws_context import execute_on_client
+from app.core.ws_context import execute_on_client
@tool
--- a/shared/agents/note_agent.py
+++ b/shared/agents/note_agent.py
@@ -1,7 +1,4 @@
-"""Note agent — Markdown note management (list, get, create, update, delete).
+"""Note agent — Markdown note management (list, get, create, update, delete)."""
 Shared tool definitions used by both Chat and Batch Agent services.
 """
 from __future__ import annotations
@@ -10,8 +7,8 @@ from typing import Any
 from langchain_core.tools import tool
-from shared.llm import embed
+from app.core.llm import embed
-from shared.ws_context import execute_on_client
+from app.core.ws_context import execute_on_client
 _UUID_RE = re.compile(
    r"^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$"
--- a/shared/agents/project_agent.py
+++ b/shared/agents/project_agent.py
@@ -1,7 +1,4 @@
-"""Project agent — full lifecycle management (list, get, create, update, archive, delete).
+"""Project agent — full lifecycle management (list, get, create, update, archive, delete)."""
 Shared tool definitions used by both Chat and Batch Agent services.
 """
 from __future__ import annotations
@@ -9,7 +6,7 @@ from typing import Any
 from langchain_core.tools import tool
-from shared.ws_context import execute_on_client
+from app.core.ws_context import execute_on_client
 PROJECT_SYSTEM_PROMPT = (
    "You are a project management assistant. You help users create, find,\n"
--- a/shared/agents/task_agent.py
+++ b/shared/agents/task_agent.py
@@ -1,7 +1,4 @@
-"""Task agent — full CRUD for tasks and task comments.
+"""Task agent — full CRUD for tasks and task comments."""
 Shared tool definitions used by both Chat and Batch Agent services.
 """
 from __future__ import annotations
@@ -11,7 +8,7 @@ from typing import Any
 from langchain_core.tools import tool
-from shared.ws_context import execute_on_client
+from app.core.ws_context import execute_on_client
 _UUID_RE = re.compile(
    r"^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$"
@@ -32,6 +29,7 @@ TASK_SYSTEM_PROMPT = (
    "  - project_id is optional; link to a project when the user mentions one\n"
    "  - is_ai_suggested: 1 only when proactively proposing a task the user\n"
    "    did not explicitly request; 0 otherwise\n"
    "  - is_ai_suggested: 1 only when proactively proposing a task the user did not explicitly request; 0 otherwise\n"
    "  - Use list_tasks_due_today for 'what's due today' queries\n"
    "  - For update_task, use -1 for integer fields you do not want to change\n"
    "  - Always confirm the action in plain, user-friendly language."
@@ -212,6 +210,7 @@ async def add_task_comment(task_id: str, author: str, content: str) -> str:
    )
    row = result.get("row", {})
    row_author = row.get("author", author)
    # Electron payloads can vary (taskId vs task_id). Fall back to input task_id.
    row_task_id = row.get("taskId") or row.get("task_id") or task_id
    row_comment_id = row.get("id", "unknown")
    return f"Comment added by {row_author} on task {row_task_id} (comment id: {row_comment_id})."
@@ -224,7 +223,7 @@ async def delete_task_comment(comment_id: str) -> str:
    return f"Comment {comment_id} deleted."
-# ── Exports ───────────────────────────────────────────────────────────
+# ── Agent ─────────────────────────────────────────────────────────────
 TASK_TOOLS: list[Any] = [
--- a/shared/agents/timeline_agent.py
+++ b/shared/agents/timeline_agent.py
@@ -1,7 +1,4 @@
-"""Timeline agent — project milestone management (list, create, update, delete).
+"""Timeline agent — project milestone management (list, create, update, delete)."""
 Shared tool definitions used by both Chat and Batch Agent services.
 """
 from __future__ import annotations
@@ -10,7 +7,7 @@ from typing import Any
 from langchain_core.tools import tool
-from shared.ws_context import execute_on_client
+from app.core.ws_context import execute_on_client
 _UUID_RE = re.compile(
    r"^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$"
@@ -28,6 +25,7 @@ TIMELINE_SYSTEM_PROMPT = (
    "  - For listing, project_id must be a UUID; never pass plain names as project_id\n"
    "  - date is a Unix timestamp in milliseconds; convert human-readable dates\n"
    "  - is_ai_suggested: 1 when proactively proposing a timeline, 0 otherwise\n"
    "  - is_ai_suggested: 1 when proactively proposing a timeline, 0 otherwise\n"
    "  - For update_timeline, use -1 for integer fields you do not want to change\n"
    "  - Listing without a project_id returns all timelines across projects\n"
    "  - Always echo the title and formatted date in your confirmation."
--- a/services/batch-agent/app/init.py
+++ b/services/batch-agent/app/init.py
--- a/app/api/deps.py
+++ b/app/api/deps.py
@@ -0,0 +1,14 @@
 """Shared FastAPI dependencies.
 ``get_current_user`` and ``oauth2_scheme`` live in ``app.api.middleware.auth``
 (the canonical location per Step 9).  This module re-exports them so that all
 existing route imports (``from app.api.deps import get_current_user``) continue
 to work without modification.
 ``get_current_user`` fetches the live tier from the ``subscriptions`` table in
 PostgreSQL instead of reading it from the JWT payload.
 """
 from app.api.middleware.auth import get_current_user, oauth2_scheme  # noqa: F401
 __all__ = ["get_current_user", "oauth2_scheme"]
--- a/app/api/middleware/init.py
+++ b/app/api/middleware/init.py
@@ -0,0 +1,19 @@
 """API middleware package.
 Exports the three middleware components introduced in Step 9:
  - Auth:        ``get_current_user`` FastAPI dependency + ``oauth2_scheme``
  - Rate limit:  ``TierRateLimitMiddleware`` + ``limiter`` (slowapi Limiter)
  - Sanitizer:   ``SanitizerMiddleware``
 """
 from app.api.middleware.auth import get_current_user, oauth2_scheme
 from app.api.middleware.rate_limit import TierRateLimitMiddleware, limiter
 from app.api.middleware.sanitizer import SanitizerMiddleware
 __all__ = [
    "get_current_user",
    "oauth2_scheme",
    "TierRateLimitMiddleware",
    "limiter",
    "SanitizerMiddleware",
 ]
--- a/app/api/middleware/auth.py
+++ b/app/api/middleware/auth.py
@@ -1,7 +1,14 @@
-"""Auth dependencies — JWT validation for the Auth Service.
+"""Auth middleware — JWT validation dependency.
-This is the canonical get_current_user used by protected endpoints
+``get_current_user`` is the FastAPI dependency used by all protected routes.
-within the Auth Service itself (/me, /me PUT).
+It decodes the Bearer JWT (identity + expiry), then fetches the current tier
 from the ``subscriptions`` table so that tier changes take effect immediately
 without requiring token re-issue.
 Exempt routes (no JWT required):
  - POST /api/v1/auth/register
  - POST /api/v1/auth/login
  - POST /api/v1/billing/webhook
 """
 from __future__ import annotations
@@ -12,12 +19,9 @@ from jose import JWTError, jwt
 from sqlalchemy import select
 from sqlalchemy.ext.asyncio import AsyncSession
-from shared.config import settings
+from app.config.settings import settings
-from shared.db import get_session
+from app.db import get_session
-from shared.models import Subscription, User
+from app.schemas import UserProfile
 from shared.schemas import UserProfile
 from app.config import auth_settings
 oauth2_scheme = OAuth2PasswordBearer(tokenUrl="/api/v1/auth/login")
@@ -28,8 +32,11 @@ async def get_current_user(
 ) -> UserProfile:
    """Validate a Bearer JWT and return the authenticated user.
-    The JWT is used for identity and expiry.  Tier is fetched live from the
+    The JWT is used for identity and expiry only.  The tier is fetched live
-    subscriptions table so upgrades/downgrades take effect immediately.
+    from the ``subscriptions`` table so that upgrades/downgrades take effect
    immediately.  Falls back to ``'free'`` when no subscription row exists.
    Raises HTTP 401 on any invalid or expired token.
    """
    credentials_exc = HTTPException(
        status_code=status.HTTP_401_UNAUTHORIZED,
@@ -38,7 +45,7 @@ async def get_current_user(
    )
    try:
        payload = jwt.decode(
-            token, auth_settings.JWT_PUBLIC_KEY, algorithms=["RS256"]
+            token, settings.JWT_SECRET, algorithms=[settings.JWT_ALGORITHM]
        )
        user_id: str | None = payload.get("sub")
        email: str | None = payload.get("email")
@@ -47,14 +54,18 @@ async def get_current_user(
    except JWTError:
        raise credentials_exc
-    # Live tier lookup
+    # Live tier lookup — subscription row is the authoritative source.
    # In dev, fall back to 'power' (unlimited) so quota limits don't
    # block local development when no Stripe subscription exists.
    from app.models import Subscription, User  # noqa: PLC0415
    result = await db.execute(
        select(Subscription.tier).where(Subscription.user_id == user_id)
    )
    default_tier = "power" if settings.ENV == "dev" else "free"
    tier: str = result.scalar_one_or_none() or default_tier
-    # Fetch name/surname
+    # Fetch name/surname from user row.
    user_result = await db.execute(
        select(User.name, User.surname).where(User.id == user_id)
    )
--- a/app/api/middleware/rate_limit.py
+++ b/app/api/middleware/rate_limit.py
@@ -0,0 +1,129 @@
 """Tier-aware rate limiting middleware.
 Uses a per-user sliding-window counter (in-process, no Redis required).
 The ``slowapi`` Limiter is also exported for optional route-level decoration.
 Limits (requests per minute):
  - free:  20
  - pro:   60
  - power: 120
  - team:  200
 Exempt paths bypass the limiter entirely:
  - POST /api/v1/auth/register
  - POST /api/v1/auth/login
  - POST /api/v1/billing/webhook
  - GET  /api/v1/health
 """
 from __future__ import annotations
 import json
 import time
 from collections import defaultdict
 from fastapi import Request, Response
 from jose import JWTError, jwt
 from slowapi import Limiter
 from slowapi.util import get_remote_address
 from starlette.middleware.base import BaseHTTPMiddleware
 from starlette.types import ASGIApp
 from app.config.settings import settings
 # Maximum number of requests allowed per user inside one 60-second window,
 # keyed by subscription tier name.  Tiers not listed here are treated as
 # "free" by the middleware below.
 _TIER_LIMITS: dict[str, int] = {
    "free": 20,
    "pro": 60,
    "power": 120,
    "team": 200,
 }
 # Request paths that bypass the limiter entirely.  Matching is an exact
 # comparison against ``request.url.path`` (no prefix or method matching).
 _EXEMPT_PATHS: frozenset[str] = frozenset(
    {
        "/api/v1/auth/register",
        "/api/v1/auth/login",
        "/api/v1/billing/webhook",
        "/api/v1/health",
    }
 )
 def _get_user_id_from_jwt(request: Request) -> str:
    """Key function for the slowapi Limiter: returns JWT sub or remote IP.

    The ``Authorization`` header is split into ``<scheme> <credentials>`` and
    the scheme is compared case-insensitively, so ``bearer``/``BEARER`` are
    keyed by user exactly like ``Bearer``.  A case-sensitive prefix strip
    would leave such tokens undecodable and silently key those requests by
    IP address instead.

    Args:
        request: Incoming request whose headers are inspected.

    Returns:
        The ``sub`` claim of a valid Bearer JWT, otherwise the remote address
        (missing header, non-Bearer scheme, invalid/expired token, or a token
        without a ``sub`` claim).
    """
    header_value = request.headers.get("Authorization", "")
    scheme, _, credentials = header_value.partition(" ")
    token = credentials.strip() if scheme.lower() == "bearer" else ""
    if not token:
        return get_remote_address(request)
    try:
        payload = jwt.decode(
            token, settings.JWT_SECRET, algorithms=[settings.JWT_ALGORITHM]
        )
    except JWTError:
        # Invalid signature, malformed or expired token: fall back to the IP.
        return get_remote_address(request)
    return payload.get("sub") or get_remote_address(request)
 # Exported Limiter instance — available for optional route-level decoration.
 limiter = Limiter(key_func=_get_user_id_from_jwt)
 class TierRateLimitMiddleware(BaseHTTPMiddleware):
    """Sliding-window rate limiter applied globally across all non-exempt routes.

    Each authenticated user gets their own 60-second window.  Its size comes
    from ``_TIER_LIMITS`` using the ``tier`` claim of the JWT; a missing or
    unknown tier gets the ``free`` limit.  Requests without a decodable
    Bearer token pass through untouched (the auth dependency will reject them
    with 401 before the route handler runs).

    The ``Authorization`` scheme is matched case-insensitively.  Otherwise a
    client could send ``bearer <jwt>``, which the auth dependency is expected
    to accept, and skip rate limiting altogether.

    Counters live in this middleware instance only (in-process state).
    """

    def __init__(self, app: ASGIApp) -> None:
        super().__init__(app)
        # user_id → list of request timestamps (float, time.monotonic() seconds)
        self._window: dict[str, list[float]] = defaultdict(list)

    @staticmethod
    def _bearer_token(request: Request) -> str:
        """Return the Bearer credentials of *request*, or ``""`` if absent."""
        header_value = request.headers.get("Authorization", "")
        scheme, _, credentials = header_value.partition(" ")
        if scheme.lower() != "bearer":
            return ""
        return credentials.strip()

    async def dispatch(self, request: Request, call_next) -> Response:  # type: ignore[override]
        """Apply the per-user limit, or answer 429 with a ``Retry-After`` header.

        Args:
            request: Incoming request.
            call_next: Callable that forwards the request down the stack.

        Returns:
            The downstream response, or a JSON 429 response when the caller
            has already made ``limit`` requests in the last 60 seconds.
        """
        if request.url.path in _EXEMPT_PATHS:
            return await call_next(request)

        # Extract JWT claims — if no valid token, pass through for auth dep to handle.
        token = self._bearer_token(request)
        if not token:
            return await call_next(request)
        try:
            payload = jwt.decode(
                token, settings.JWT_SECRET, algorithms=[settings.JWT_ALGORITHM]
            )
        except JWTError:
            return await call_next(request)
        user_id: str = payload.get("sub") or get_remote_address(request)
        tier: str = payload.get("tier", "free")

        limit = _TIER_LIMITS.get(tier, _TIER_LIMITS["free"])
        now = time.monotonic()
        window_start = now - 60.0
        # Slide the window: discard timestamps older than 60 seconds.
        timestamps = [t for t in self._window[user_id] if t > window_start]
        if len(timestamps) >= limit:
            # Seconds until the oldest request in the window expires (min 1).
            retry_after = max(1, int(60 - (now - min(timestamps))))
            return Response(
                content=json.dumps(
                    {
                        "detail": (
                            f"Rate limit exceeded ({limit} req/min for {tier} tier). "
                            f"Retry in {retry_after}s."
                        )
                    }
                ),
                status_code=429,
                headers={
                    "Retry-After": str(retry_after),
                    "Content-Type": "application/json",
                },
            )
        # Only accepted requests are recorded against the window.
        timestamps.append(now)
        self._window[user_id] = timestamps
        return await call_next(request)
--- a/app/api/middleware/sanitizer.py
+++ b/app/api/middleware/sanitizer.py
@@ -0,0 +1,138 @@
 """Response sanitizer middleware.
 Scans JSON responses from the /api/v1/chat endpoint and strips any fragments
 that could reveal server-side prompt IP:
  - System prompt openers ("You are a/an/the …")
  - Agent routing metadata ("Available agents:", "intent classifier", …)
  - LangChain tool schema fragments (``"type": "function"``)
  - Internal reasoning markers (<thinking>, <reasoning>, [INST], …)
  - Exact-match known prompt fingerprints
 The middleware only activates for paths under /api/v1/chat.
 Any sanitisation event is logged as a WARNING with the request path and the
 names of the fields that were modified.
 """
 from __future__ import annotations
 import json
 import logging
 import re
 from fastapi import Request, Response
 from starlette.middleware.base import BaseHTTPMiddleware
 from starlette.types import ASGIApp
 logger = logging.getLogger(__name__)
 # ---------------------------------------------------------------------------
 # Detection patterns — order matters: fingerprints checked first (exact),
 # then compiled regexes.
 # ---------------------------------------------------------------------------
 # Exact, case-sensitive phrases.  ``_sanitize_text`` tests them with a plain
 # substring check and, on any hit, replaces the WHOLE string with "[REDACTED]".
 _FINGERPRINTS: tuple[str, ...] = (
    "You are an intent classifier",
    "Respond with just the agent name",
    "Summarize these agent results",
    "Available agents:",
    "route to:",
 )
 # Compiled regexes applied in order by ``_sanitize_text``; only the matched span
 # of each pattern is replaced with "[REDACTED]".
 _PATTERNS: tuple[re.Pattern[str], ...] = (
    # "You are a/an/the" plus up to 200 following characters (DOTALL: newlines too).
    re.compile(r"You are (a|an|the)\b.{0,200}", re.IGNORECASE | re.DOTALL),
    re.compile(r"Available agents\s*:", re.IGNORECASE),
    re.compile(r"\bintent classifier\b", re.IGNORECASE),
    re.compile(r'"type"\s*:\s*"function"'),           # LangChain tool schema
    # Opening tags only; closing tags such as </thinking> are not matched.
    re.compile(r"<(thinking|reasoning|system|prompt)>", re.IGNORECASE),
    re.compile(r"\[INST\]|\[/INST\]"),                # Llama instruct markers
    re.compile(r"route\s+to\s*:", re.IGNORECASE),
    # "prompt_template:" followed by a quote and at least 10 more characters on the line.
    re.compile(r"prompt_template\s*:\s*['\"].{10,}", re.IGNORECASE),
 )
 def _sanitize_text(text: str) -> tuple[str, bool]:
    """Redact prompt fragments from *text*.
    Returns ``(cleaned_text, was_changed)``.  A fingerprint hit redacts the
    entire string; otherwise each regex match is replaced with ``[REDACTED]``.
    """
    # Exact-phrase fingerprints win outright: the whole value is discarded.
    if any(phrase in text for phrase in _FINGERPRINTS):
        return "[REDACTED]", True
    redactions = 0
    for regex in _PATTERNS:
        text, hits = regex.subn("[REDACTED]", text)
        redactions += hits
    return text, redactions > 0
 class SanitizerMiddleware(BaseHTTPMiddleware):
    """Strip prompt IP from /api/v1/chat JSON responses.
    Only the top-level string fields of a JSON *object* body are scanned.
    Responses that are not JSON objects, or that contain nothing to redact,
    are replayed with their original bytes.
    """
    def __init__(self, app: ASGIApp) -> None:
        super().__init__(app)
    async def dispatch(self, request: Request, call_next) -> Response:  # type: ignore[override]
        """Run the downstream handler and redact prompt fragments from its reply.
        Args:
            request: Incoming request; only paths under ``/api/v1/chat`` are processed.
            call_next: Callable that invokes the rest of the middleware/route chain.
        Returns:
            The downstream response untouched (other paths, event streams), a
            byte-identical replay of it (nothing redacted), or a re-serialised
            JSON response whose redacted fields contain ``[REDACTED]``.
        """
        response: Response = await call_next(request)
        # Only process chat endpoint responses.
        if not request.url.path.startswith("/api/v1/chat"):
            return response
        # A server-sent event stream is never a single JSON document; buffering it
        # would only delay delivery until the stream ends, so hand it back as-is.
        content_type = response.headers.get("content-type", "")
        if content_type.lower().startswith("text/event-stream"):
            return response
        # Buffer the body, joining once rather than re-copying on every chunk.
        chunks: list[bytes] = [
            chunk if isinstance(chunk, bytes) else chunk.encode()
            async for chunk in response.body_iterator
        ]
        body_bytes = b"".join(chunks)
        # Non-JSON bodies (shouldn't happen on /chat, but be safe) are left alone.
        try:
            body = json.loads(body_bytes.decode("utf-8"))
        except (json.JSONDecodeError, UnicodeDecodeError):
            body = None
        # Walk top-level string fields and sanitise.
        sanitised_fields: list[str] = []
        if isinstance(body, dict):
            for key, value in body.items():
                if isinstance(value, str):
                    cleaned, changed = _sanitize_text(value)
                    if changed:
                        body[key] = cleaned
                        sanitised_fields.append(key)
        if not sanitised_fields:
            # Nothing was redacted: replay the exact bytes the handler produced
            # (the body iterator has been consumed, so a new Response is required).
            return Response(
                content=body_bytes,
                status_code=response.status_code,
                headers=dict(response.headers),
                media_type=response.media_type,
            )
        logger.warning(
            "Sanitizer redacted prompt fragments",
            extra={
                "path": request.url.path,
                "fields": sanitised_fields,
            },
        )
        new_body = json.dumps(body).encode("utf-8")
        headers = dict(response.headers)
        # The payload size changed, so the upstream content-length is stale.
        headers["content-length"] = str(len(new_body))
        return Response(
            content=new_body,
            status_code=response.status_code,
            headers=headers,
            media_type="application/json",
        )
--- a/services/billing/app/init.py
+++ b/services/billing/app/init.py
--- a/services/batch-agent/app/journey.py
+++ b/services/batch-agent/app/journey.py
@@ -1,16 +1,22 @@
-"""Chatbot Journey — guided conversation to build an agent prompt_template.
+"""Chatbot Journey — WS-based guided conversation to build an agent prompt_template.
-Adapted for Batch Agent Service: imports from app.agents.filesystem_agent
+The journey is driven entirely through WebSocket frames (no REST endpoints).
-and app.llm instead of monolith paths.  Session state is in-memory (could
+The device WS handler dispatches ``journey_start`` and ``journey_message``
-be moved to Redis for horizontal scaling in the future).
+frames to the functions exported here.
 Journey flow:
-  1. Redis consumer dispatches ``journey_start`` with basic agent config.
+  1. FE sends ``journey_start`` frame with basic agent config (directory,
-  2. Server creates an in-memory session, runs the setup LLM with
+     data_types, schedule).
-     file-system tools to explore the directory, returns first question.
+  2. Server creates an in-memory session, sets up a WS executor so the
-  3. ``journey_message`` frames drive the conversation.
+     setup LLM can use file-system tools, does a first directory scrape,
-  4. After 3-5 turns the LLM emits PROMPT_TEMPLATE_START / _END block.
+     and sends back a ``journey_reply`` with the first question.
-  5. Server parses the block and returns ``journey_reply`` with ``done=True``.
+  3. FE sends ``journey_message`` frames for each user reply.
  4. Server appends the user message, calls the LLM (which may read files
     via tools), and sends back a ``journey_reply``.
  5. After 3-5 turns the LLM wraps up by emitting a ``prompt_template``
     block delimited by ``PROMPT_TEMPLATE_START`` / ``PROMPT_TEMPLATE_END``.
  6. Server parses the block, sends ``journey_reply`` with ``done=True``
     and the template.  FE stores it locally.
 """
 from __future__ import annotations
@@ -25,8 +31,9 @@ from typing import Any
 from langchain_core.messages import AIMessage, HumanMessage, SystemMessage, ToolMessage
 from app.agents.filesystem_agent import FILESYSTEM_TOOLS
-from shared.llm import get_llm
+from app.config.settings import settings
-import app.tracing as tracing
+from app.core.langfuse_client import extract_usage, get_langfuse, get_prompt_or_fallback
 from app.core.llm import get_llm
 logger = logging.getLogger(__name__)
@@ -38,8 +45,11 @@ _SESSION_TTL_SECONDS: int = 1800  # 30 minutes
 _TEMPLATE_START = "PROMPT_TEMPLATE_START"
 _TEMPLATE_END = "PROMPT_TEMPLATE_END"
 # Minimum turns before we consider nudging the LLM to wrap up.
 _MIN_TURNS_BEFORE_NUDGE: int = 3
 # Hard cap to avoid infinite loops (safety net, not the primary stopping criterion).
 _MAX_TURNS: int = 15
 # Max tool-calling steps per LLM invocation.
 _MAX_TOOL_STEPS: int = 6
 # ── In-memory session store ───────────────────────────────────────────────
@@ -54,6 +64,7 @@ class JourneySession:
    data_types: list[str]
    history: list[dict[str, Any]] = field(default_factory=list)
    system_prompt: str = ""
    langfuse_prompt: Any = None
    created_at: float = field(default_factory=time.monotonic)
    def is_expired(self) -> bool:
@@ -77,12 +88,20 @@ def get_journey_session(session_id: str, user_id: str) -> JourneySession | None:
 # ── System prompt builder ─────────────────────────────────────────────────
-_SYSTEM_PROMPT_TEMPLATE = """\
+_JOURNEY_SYSTEM_PROMPT = """\
 You are a friendly assistant helping a freelancer configure a data-extraction agent.
 Your job is to understand exactly what data the user wants to extract from their
-local directory and produce a concise prompt_template that a separate AI will use
+local directory and produce a detailed prompt_template that a separate AI will use
 as its instruction set.
 The extraction agent already has this base behaviour built in:
  - Reads each file using file-system tools.
  - Creates records (tasks, notes, timelines, projects) via CRUD tools.
  - Sets isAiSuggested=1 on every new record.
  - Only extracts data explicitly present in the files — it never invents information.
 The user's custom prompt is appended AFTER this base behaviour, so focus on
 what to look for and how to map it — not on the general extraction mechanics.
 You have access to file-system tools to explore the user's directory:
 - list_directory: to see folder structure
 - read_file_content: to peek at file contents
@@ -91,43 +110,38 @@ You have access to file-system tools to explore the user's directory:
 The user's configured directory is: {directory}
 Target data types: {data_types}
-IMPORTANT — project assignment is handled automatically.  You MUST NOT ask the user
+IMPORTANT — project assignment is handled automatically by the main agent runner
-about projects, projectId, or how to link records to projects.  Never include
+before the custom prompt is ever used.  You MUST NOT ask the user about projects,
-projectId logic or project creation instructions in the generated prompt_template.
+projectId, or how to link records to projects.  Never include projectId logic or
 project creation instructions in the generated prompt_template.
 Start by exploring the directory to understand its structure.  Then ask concise,
-focused questions one at a time.  Cover only the topics relevant to the target
+focused questions one at a time.  Cover these topics (not necessarily in this order):
-data types listed above:
+  1. The type and format of the source content (confirmed by your exploration).
  2. How fields should be mapped (e.g. filename → task title).
  3. Priority or status rules (e.g. "urgent" keyword → high priority).
  4. Any special handling, date extraction, or exclusions.
-  1. Content type and format — confirmed by your exploration.
+Once you reach 90% confidence, output the final prompt_template between these exact
-  2. For TASKS (if in scope): field mapping for title, status, priority, content,
+markers on their own lines:
       dueDate (where is the date found? what's the fallback when absent?),
       and assignee (is there a person name to assign?).
  3. For NOTES when TASKS are also in scope: note vs task distinction —
       what makes something a note rather than a task?
  4. For TIMELINES (if in scope): the date source — what marks a milestone or event?
  5. Exclusions and special handling applicable to the target data types.
 Keep asking focused questions until you are at least 90% confident.  Then stop and
 output the final prompt_template immediately, wrapped between these exact markers
 on their own lines:
 {template_start}
 <the complete extraction prompt here>
 {template_end}
-The prompt_template must be concise (bullet points, ~15–25 lines maximum).
+The prompt_template must be a self-contained instruction for an AI that reads files
-Specify only:
+and must perform CRUD operations using tools to create records.  It should specify:
-  - Scope: what files/content qualify and what entity types to create.
+  - What entity types to create (tasks, notes, timelines) — never projects.
-  - Field mapping rules per entity type (camelCase fields: title, status, priority,
+  - How to map file content to record fields (camelCase: title, status, priority,
-    dueDate, content, assignee, etc.).
+    dueDate, content, etc.) — never include projectId.
-  - dueDate rule (if tasks in scope): source and fallback behaviour.
+  - That isAiSuggested must be set to 1 on every new record.
-  - Note vs task rule (if both in scope): the criterion that separates them.
+  - Concrete examples of mappings based on what you discovered in the directory.
  - Timeline date rule (if timelines in scope): what constitutes a timeline event.
  - Exclusion/filtering rules.
  - 2–3 concrete mapping examples based on what you discovered.
-{existing_section}Begin by exploring the directory, then ask your first question.\
+{existing_section}\
 Keep asking clarifying questions until you are at least 90% confident you have
 enough information to generate an accurate prompt_template.  Once you reach that
 confidence level, stop asking and produce the final template immediately.
 Begin by exploring the directory, then ask your first question.\
 """
@@ -135,23 +149,25 @@ def _build_system_prompt(
    directory: str,
    data_types: list[str],
    existing_template: str | None = None,
-) -> str:
+) -> tuple[str, Any]:
    """Return ``(compiled_system_prompt, langfuse_prompt_obj_or_None)``."""
    existing_section = (
        f"\nThe user already has the following prompt_template — refine it based on their answers:\n"
        f"---\n{existing_template}\n---\n"
        if existing_template
        else ""
    )
-    # Use Langfuse compile_prompt ({{variable}} syntax) with Python .format() fallback
+    template, prompt_obj = get_prompt_or_fallback(
-    return tracing.compile_prompt(
+        "journey_system", _JOURNEY_SYSTEM_PROMPT
        "journey_system",
        fallback=_SYSTEM_PROMPT_TEMPLATE,
        variables={
            "directory": directory,
            "data_types": ", ".join(data_types),
            "existing_section": existing_section,
        },
    )
    compiled = template.format(
        directory=directory,
        data_types=", ".join(data_types),
        template_start=_TEMPLATE_START,
        template_end=_TEMPLATE_END,
        existing_section=existing_section,
    )
    return compiled, prompt_obj
 # ── Template extraction ───────────────────────────────────────────────────
@@ -191,13 +207,17 @@ async def _call_llm_with_tools(
    system_prompt: str,
    history: list[dict[str, Any]],
    tools: list[Any],
-    langfuse_handler: Any | None = None,
+    *,
    user_id: str = "",
    session_id: str = "",
    langfuse_prompt: Any = None,
 ) -> str:
    """Build LangChain messages from history and invoke the LLM with tools.
    Handles tool-calling loops: if the LLM calls tools, execute them and
    continue until a final text response is produced.
    """
    lf = get_langfuse()
    messages: list[Any] = [SystemMessage(content=system_prompt)]
    for turn in history:
        if turn["role"] == "user":
@@ -205,23 +225,52 @@ async def _call_llm_with_tools(
        else:
            messages.append(AIMessage(content=turn["content"]))
-    callbacks = [langfuse_handler] if langfuse_handler else None
+    llm = get_llm(model=None, temperature=0.4)
    llm = get_llm(model=None, temperature=0.4, callbacks=callbacks)
    llm_with_tools = llm.bind_tools(tools)
    tool_map = {tool_def.name: tool_def for tool_def in tools}
    _span_ctx = (
        lf.start_as_current_observation(
            as_type="span",
            name="journey-setup",
            user_id=user_id or None,
            session_id=session_id or None,
            input=history[-1]["content"] if history else "",
        )
        if lf else None
    )
    _span = _span_ctx.__enter__() if _span_ctx else None
    try:
        for _ in range(_MAX_TOOL_STEPS):
            _gen_ctx = (
                lf.start_as_current_observation(
                    as_type="generation",
                    name="journey-setup-llm",
                    model=settings.LLM_MODEL,
                    prompt=langfuse_prompt,
                    input=messages,
                )
                if lf else None
            )
            _gen = _gen_ctx.__enter__() if _gen_ctx else None
            response: AIMessage = await llm_with_tools.ainvoke(messages)
            if _gen_ctx:
                _gen.update(output=_as_text(response.content), usage=extract_usage(response))
                _gen_ctx.__exit__(None, None, None)
            messages.append(response)
            if not response.tool_calls:
                if _span:
                    _span.update(output=_as_text(response.content))
                return _as_text(response.content)
            for call in response.tool_calls:
                call_name = str(call.get("name", ""))
                call_args = call.get("args", {})
                logger.info(
-                "journey: tool_call name=%s args=%s",
+                    "agent_setup: journey tool_call name=%s args=%s",
                    call_name,
                    json.dumps(call_args, ensure_ascii=True)[:500],
                )
@@ -233,27 +282,33 @@ async def _call_llm_with_tools(
                    tool_output = await tool_fn.ainvoke(call_args)
                logger.info(
-                "journey: tool_result name=%s output=%s",
+                    "agent_setup: journey tool_result name=%s output=%s",
                    call_name,
                    str(tool_output)[:800],
                )
                messages.append(ToolMessage(content=str(tool_output), tool_call_id=call["id"]))
-    # Fallback: exceeded max tool steps.
+        # Fallback: exceeded max steps.
        final = await llm.ainvoke(messages)
-    return _as_text(final.content)
+        final_text = _as_text(final.content)
        if _span:
            _span.update(output=final_text)
        return final_text
    finally:
        if _span_ctx:
            _span_ctx.__exit__(None, None, None)
        if lf:
            lf.flush()
-# ── Journey handlers (called from redis_consumer) ────────────────────────
+# ── Journey handlers (called from device_ws.py) ──────────────────────────
 async def handle_journey_start(
    user_id: str,
    frame: dict[str, Any],
    *,
    langfuse_handler: Any | None = None,
 ) -> dict[str, Any]:
-    """Handle a ``journey_start`` request.
+    """Handle a ``journey_start`` WS frame.
    Creates a session, runs the setup LLM with directory exploration,
    and returns the ``journey_reply`` payload.
@@ -263,8 +318,10 @@ async def handle_journey_start(
    data_types = frame.get("data_types", [])
    existing_template = frame.get("existing_template")
    # Use the session_id provided by the FE so the reply matches the
    # listener key; fall back to a generated one if absent.
    session_id = frame.get("session_id") or str(uuid.uuid4())
-    system_prompt = _build_system_prompt(directory, data_types, existing_template)
+    system_prompt, langfuse_prompt = _build_system_prompt(directory, data_types, existing_template)
    session = JourneySession(
        session_id=session_id,
@@ -273,8 +330,13 @@ async def handle_journey_start(
        directory=directory,
        data_types=data_types,
        system_prompt=system_prompt,
        langfuse_prompt=langfuse_prompt,
    )
    # The LLM will explore the directory using FILESYSTEM_TOOLS via the
    # ws_context executor (already set by the WS handler before calling us).
    # Seed with an initial user message — some providers (e.g. GitHub Copilot)
    # require at least one user/input message to be present.
    seed_history: list[dict[str, Any]] = [
        {"role": "user", "content": "Hi, I'm ready to set up my agent. Please explore my directory and ask me your first question."},
    ]
@@ -282,7 +344,9 @@ async def handle_journey_start(
        system_prompt=system_prompt,
        history=seed_history,
        tools=list(FILESYSTEM_TOOLS),
-        langfuse_handler=langfuse_handler,
+        user_id=user_id,
        session_id=session_id,
        langfuse_prompt=langfuse_prompt,
    )
    session.history.extend(seed_history)
@@ -290,12 +354,13 @@ async def handle_journey_start(
    _sessions[session_id] = session
    logger.info(
-        "journey: session %s started for user %s (directory=%s)",
+        "agent_setup: journey session %s started for user %s (directory=%s)",
        session_id,
        user_id,
        directory,
    )
    # Check if the LLM produced the template on the first turn (unlikely but possible).
    prompt_template = _extract_template(ai_reply)
    done = prompt_template is not None
@@ -319,10 +384,8 @@ async def handle_journey_start(
 async def handle_journey_message(
    user_id: str,
    frame: dict[str, Any],
    *,
    langfuse_handler: Any | None = None,
 ) -> dict[str, Any]:
-    """Handle a ``journey_message`` request.
+    """Handle a ``journey_message`` WS frame.
    Appends the user message, calls the LLM, and returns the
    ``journey_reply`` payload.
@@ -340,20 +403,27 @@ async def handle_journey_message(
            "prompt_template": None,
        }
    # Append user turn.
    session.history.append({"role": "user", "content": message})
    # Call the LLM with tools.
    ai_reply = await _call_llm_with_tools(
        system_prompt=session.system_prompt,
        history=session.history,
        tools=list(FILESYSTEM_TOOLS),
-        langfuse_handler=langfuse_handler,
+        user_id=session.user_id,
        session_id=session_id,
        langfuse_prompt=session.langfuse_prompt,
    )
    session.history.append({"role": "assistant", "content": ai_reply})
    # Check if the LLM produced the final template.
    prompt_template = _extract_template(ai_reply)
    done = prompt_template is not None
    # If the LLM didn't produce a template, nudge it once it has asked enough
    # questions (>= _MIN_TURNS_BEFORE_NUDGE) or hits the hard safety cap.
    if not done:
        turns = sum(1 for t in session.history if t["role"] == "user")
        if turns >= _MAX_TURNS:
@@ -367,7 +437,9 @@ async def handle_journey_message(
                system_prompt=session.system_prompt,
                history=session.history,
                tools=list(FILESYSTEM_TOOLS),
-                langfuse_handler=langfuse_handler,
+                user_id=session.user_id,
                session_id=session_id,
                langfuse_prompt=session.langfuse_prompt,
            )
            session.history.append({"role": "assistant", "content": nudge_reply})
@@ -384,7 +456,7 @@ async def handle_journey_message(
            else "Here is your agent configuration. You can save it or continue refining."
        )
        _sessions.pop(session_id, None)
-        logger.info("journey: session %s completed for user %s", session_id, user_id)
+        logger.info("agent_setup: journey session %s completed for user %s", session_id, user_id)
    return {
        "type": "journey_reply",
--- a/app/api/routes/agents.py
+++ b/app/api/routes/agents.py
@@ -0,0 +1,222 @@
 """Agent routes.
 Backend responsibilities are intentionally minimal:
    GET  /agents/catalog         — static catalog for UI display
    POST /agents/can-create      — billing eligibility check
    POST /agents/trigger         — trigger a local agent run
 Agent configuration is owned by the Electron app and is not persisted
 in backend agent-config tables.
 """
 from __future__ import annotations
 import asyncio
 import uuid
 from datetime import datetime, timedelta, timezone
 from fastapi import APIRouter, Depends, HTTPException, status
 from sqlalchemy import func, select
 from sqlalchemy.ext.asyncio import AsyncSession
 from app.api.deps import get_current_user
 from app.billing.tier_manager import FEATURES
 from app.core.agent_runner import is_agent_running, run_local_agent
 from app.core.device_manager import device_manager
 from app.db import get_session
 from app.models import AgentRunLog, LocalAgentConfig
 from app.schemas import (
    AgentCatalogItem,
    AgentCreationCheckRequest,
    AgentCreationCheckResponse,
    AgentRunLogResponse,
    AgentTriggerRequest,
    UserProfile,
 )
 router = APIRouter(prefix="/agents", tags=["agents"])
 # ── Datetime helpers ──────────────────────────────────────────────────
 def _dt_ms(dt: datetime) -> int:
    return int(dt.timestamp() * 1000)
 def _dt_ms_opt(dt: datetime | None) -> int | None:
    return int(dt.timestamp() * 1000) if dt else None
 def _to_data_types(values: list[str]) -> list[str]:
    normalize = {
        "task": "tasks",           "tasks": "tasks",
        "note": "notes",           "notes": "notes",
        "timeline": "timelines",   "timelines": "timelines",   "timelineEvents": "timelines",
        "project": "projects",     "projects": "projects",
    }
    seen: set[str] = set()
    result: list[str] = []
    for v in values:
        mapped = normalize.get(v)
        if mapped and mapped not in seen:
            seen.add(mapped)
            result.append(mapped)
    return result
 def _to_run_log_response(log: AgentRunLog) -> AgentRunLogResponse:
    """Convert an ``AgentRunLog`` row into its ``AgentRunLogResponse`` schema.
    Timestamps are emitted as epoch milliseconds and a falsy ``errors`` value
    becomes an empty list.
    """
    started_ms = _dt_ms(log.started_at)
    completed_ms = _dt_ms_opt(log.completed_at)
    error_list = log.errors or []
    return AgentRunLogResponse(
        id=log.id,
        agent_id=log.agent_id,
        agent_type=log.agent_type,  # type: ignore[arg-type]
        status=log.status,  # type: ignore[arg-type]
        items_processed=log.items_processed,
        items_created=log.items_created,
        errors=error_list,
        started_at=started_ms,
        completed_at=completed_ms,
    )
 def _enforce_agent_limit(tier: str, current_count: int) -> int:
    """Return the tier's ``batch_active`` limit, raising HTTP 403 once it is reached.
    Unknown tiers use the ``free`` feature set; a limit of ``-1`` means unlimited.
    """
    tier_features = FEATURES.get(tier, FEATURES["free"])
    limit: int = tier_features["batch_active"]
    if limit == -1 or current_count < limit:
        return limit
    raise HTTPException(
        status_code=status.HTTP_403_FORBIDDEN,
        detail=f"Agent limit ({limit}) reached for your tier. Upgrade to create more.",
    )
 async def _enforce_run_frequency(
    tier: str,
    user_id: str,
    db: AsyncSession,
 ) -> None:
    """Raise HTTP 402 when the user has used up today's batch-run allowance.
    The allowance is the tier's ``batch_runs_per_day`` feature (``-1`` means
    unlimited; unknown tiers use ``free``).  Runs are counted from 00:00 UTC.
    """
    tier_features = FEATURES.get(tier, FEATURES["free"])
    limit: int = tier_features["batch_runs_per_day"]
    if limit != -1:
        midnight_utc = datetime.now(timezone.utc).replace(
            hour=0, minute=0, second=0, microsecond=0
        )
        count_query = select(func.count(AgentRunLog.id)).where(
            AgentRunLog.user_id == user_id,
            AgentRunLog.started_at >= midnight_utc,
        )
        runs_today: int = (await db.execute(count_query)).scalar_one()
        if runs_today >= limit:
            raise HTTPException(
                status_code=status.HTTP_402_PAYMENT_REQUIRED,
                detail=f"Daily batch run limit ({limit}) reached for your tier. Upgrade for more runs.",
            )
 # ── Catalog ───────────────────────────────────────────────────────────
@router.get("/catalog", response_model=list[AgentCatalogItem])
 async def get_agent_catalog(
    current_user: UserProfile = Depends(get_current_user),
 ) -> list[AgentCatalogItem]:
    """Return the static list of available agent types and their descriptions."""
    return [
        AgentCatalogItem(
            type="local_directory",
            name="Local Directory Monitor",
            description="Watches local directories, extracts data from files using AI",
        ),
        AgentCatalogItem(
            type="gmail",
            name="Gmail Connector",
            description="Scans Gmail inbox, extracts tasks/notes from emails",
        ),
        AgentCatalogItem(
            type="teams",
            name="Microsoft Teams Connector",
            description="Monitors Teams messages, extracts action items",
        ),
        AgentCatalogItem(
            type="outlook",
            name="Outlook Connector",
            description="Scans Outlook inbox, extracts tasks/notes",
        ),
    ]
@router.post("/can-create", response_model=AgentCreationCheckResponse)
 async def can_create_agent(
    body: AgentCreationCheckRequest,
    current_user: UserProfile = Depends(get_current_user),
 ) -> AgentCreationCheckResponse:
    """Check if the user can create one more agent based on billing tier.
    Since configuration is client-owned, the Electron app sends its current
    active agent count and the backend applies tier limits.
    """
    limit: int = FEATURES.get(current_user.tier, FEATURES["free"])["batch_active"]
    allowed = limit == -1 or body.active_agents < limit
    return AgentCreationCheckResponse(
        allowed=allowed,
        tier=current_user.tier,
        active_agents=body.active_agents,
        limit=limit,
    )
@router.post("/trigger", response_model=AgentRunLogResponse, status_code=status.HTTP_202_ACCEPTED)
 async def trigger_agent_run(
    body: AgentTriggerRequest,
    current_user: UserProfile = Depends(get_current_user),
    db: AsyncSession = Depends(get_session),
 ) -> AgentRunLogResponse:
    """Trigger a local agent run using client-provided configuration."""
    _enforce_agent_limit(current_user.tier, body.active_agents)
    await _enforce_run_frequency(current_user.tier, current_user.id, db)
    config = LocalAgentConfig(
        id=str(uuid.uuid4()),
        user_id=current_user.id,
        device_id=body.device_id,
        name="Local Directory Monitor",
        directory_paths=[body.directory],
        data_types=_to_data_types(body.what_to_extract),
        prompt_template=body.custom_agent_prompt,
        file_extensions=[],
        schedule_cron=body.batch_interval,
        enabled=True,
    )
    # Use the FE's stable agent_id if provided, fall back to the ephemeral config id.
    stable_agent_id = body.agent_id or config.id
    if is_agent_running(stable_agent_id):
        raise HTTPException(
            status_code=status.HTTP_409_CONFLICT,
            detail="Agent is already running. Only one run per agent is allowed at a time.",
        )
    run_log = AgentRunLog(
        agent_id=stable_agent_id,
        agent_type="local",
        user_id=current_user.id,
        status="running",
    )
    db.add(run_log)
    await db.commit()
    await db.refresh(run_log)
    run_context = {
        "type": "agent_batch",
        "run_id": run_log.id,
        "agent_id": stable_agent_id,
    }
    asyncio.create_task(
        run_local_agent(current_user.id, config, run_log, device_manager, run_context)
    )
    return _to_run_log_response(run_log)
--- a/services/auth/app/routes.py
+++ b/services/auth/app/routes.py
@@ -1,6 +1,8 @@
 """Auth routes: register, login, refresh, me.
-Extracted from app/api/routes/auth.py — uses shared.* imports instead of app.*.
+Users and refresh tokens are persisted in PostgreSQL (users + refresh_tokens
 tables).  Passwords are hashed with bcrypt; refresh tokens are stored as
 SHA-256 hashes so plaintext never reaches the DB.
 """
 from __future__ import annotations
@@ -18,13 +20,11 @@ from pydantic import BaseModel
 from sqlalchemy import select
 from sqlalchemy.ext.asyncio import AsyncSession
-from shared.config import settings
+from app.api.deps import get_current_user
-from shared.db import get_session
+from app.config.settings import settings
-from shared.models import RefreshToken, Subscription, User
+from app.db import get_session
-from shared.schemas import AuthTokens, UserProfile
+from app.models import RefreshToken, User
-
+from app.schemas import AuthTokens, UserProfile
 from app.config import auth_settings
 from app.deps import get_current_user
 router = APIRouter(prefix="/auth", tags=["auth"])
@@ -46,7 +46,7 @@ def _hash_token(plain_token: str) -> str:
 def _make_access_token(user_id: str, email: str, tier: str) -> tuple[str, int]:
-    """Return (RS256-signed JWT, expires_at_ms)."""
+    """Return (signed JWT, expires_at_ms)."""
    now = int(time.time())
    exp = now + settings.JWT_ACCESS_TOKEN_EXPIRE_MINUTES * 60
    payload = {
@@ -56,19 +56,10 @@ def _make_access_token(user_id: str, email: str, tier: str) -> tuple[str, int]:
        "exp": exp,
        "iat": now,
    }
-    token = jwt.encode(payload, auth_settings.JWT_PRIVATE_KEY, algorithm="RS256")
+    token = jwt.encode(payload, settings.JWT_SECRET, algorithm=settings.JWT_ALGORITHM)
    return token, exp * 1000  # ms for client
 async def _get_live_tier(db: AsyncSession, user_id: str) -> str:
    """Fetch authoritative tier from subscriptions table."""
    result = await db.execute(
        select(Subscription.tier).where(Subscription.user_id == user_id)
    )
    default_tier = "power" if settings.ENV == "dev" else "free"
    return result.scalar_one_or_none() or default_tier
 # ── Request bodies ────────────────────────────────────────────────────
@@ -88,11 +79,6 @@ class _RefreshRequest(BaseModel):
    refresh_token: str
 class _UpdateProfileRequest(BaseModel):
    name: str | None = None
    surname: str | None = None
 # ── Routes ────────────────────────────────────────────────────────────
@@ -116,7 +102,7 @@ async def register(
        encryption_key=Fernet.generate_key().decode(),
    )
    db.add(user)
-    await db.flush()
+    await db.flush()  # get user.id without committing
    plain_token = str(uuid.uuid4())
    expires_at = datetime.now(timezone.utc) + timedelta(
@@ -149,9 +135,6 @@ async def login(
    if user is None or not _verify_password(body.password, user.password_hash):
        raise HTTPException(status.HTTP_401_UNAUTHORIZED, "Invalid credentials")
    # Fetch live tier for the JWT claim
    tier = await _get_live_tier(db, user.id)
    plain_token = str(uuid.uuid4())
    expires_at = datetime.now(timezone.utc) + timedelta(
        days=settings.JWT_REFRESH_TOKEN_EXPIRE_DAYS
@@ -164,7 +147,7 @@ async def login(
    db.add(rt)
    await db.commit()
-    access_token, expires_at_ms = _make_access_token(user.id, user.email, tier)
+    access_token, expires_at_ms = _make_access_token(user.id, user.email, user.tier)
    return AuthTokens(
        access_token=access_token,
        refresh_token=plain_token,
@@ -188,6 +171,7 @@ async def refresh(
    if rt is None or rt.expires_at.replace(tzinfo=timezone.utc) < now:
        raise HTTPException(status.HTTP_401_UNAUTHORIZED, "Invalid or expired refresh token")
    # Rotate: delete old token, issue new one.
    await db.delete(rt)
    user_result = await db.execute(select(User).where(User.id == rt.user_id))
@@ -195,9 +179,6 @@ async def refresh(
    if user is None:
        raise HTTPException(status.HTTP_401_UNAUTHORIZED, "User not found")
    # Fetch live tier for the new JWT
    tier = await _get_live_tier(db, user.id)
    plain_token = str(uuid.uuid4())
    new_expires = now + timedelta(days=settings.JWT_REFRESH_TOKEN_EXPIRE_DAYS)
    new_rt = RefreshToken(
@@ -208,7 +189,7 @@ async def refresh(
    db.add(new_rt)
    await db.commit()
-    access_token, expires_at_ms = _make_access_token(user.id, user.email, tier)
+    access_token, expires_at_ms = _make_access_token(user.id, user.email, user.tier)
    return AuthTokens(
        access_token=access_token,
        refresh_token=plain_token,
@@ -216,6 +197,11 @@ async def refresh(
    )
 class _UpdateProfileRequest(BaseModel):
    name: str | None = None
    surname: str | None = None
@router.get("/me", response_model=UserProfile)
 async def me(current_user: UserProfile = Depends(get_current_user)) -> UserProfile:
    """Return the profile for the authenticated user."""
--- a/app/api/routes/billing.py
+++ b/app/api/routes/billing.py
@@ -0,0 +1,85 @@
 """Billing routes: Stripe checkout, webhook, subscription management.
 Business logic lives in ``app.billing.stripe_service.StripeService``.
 The route layer handles HTTP concerns (request parsing, response shaping)
 and delegates everything else to the service singleton.
 """
 from __future__ import annotations
 from typing import Any
 from fastapi import APIRouter, Depends, Header, Request, status
 from pydantic import BaseModel
 from sqlalchemy.ext.asyncio import AsyncSession
 from app.api.deps import get_current_user
 from app.billing.stripe_service import stripe_service
 from app.db import get_session
 from app.schemas import BillingTier, UserProfile
 router = APIRouter(prefix="/billing", tags=["billing"])
 # ── Request bodies ─────────────────────────────────────────────────────
 class _CheckoutRequest(BaseModel):
    # Tier the caller wants to purchase; handed unchanged to
    # ``stripe_service.create_checkout_session`` by the /checkout route.
    tier: BillingTier
 # ── Routes ─────────────────────────────────────────────────────────────
@router.post("/checkout", response_model=dict)
 async def create_checkout(
    body: _CheckoutRequest,
    current_user: UserProfile = Depends(get_current_user),
 ) -> dict[str, str]:
    """Create a Stripe checkout session for a tier upgrade.
    Returns a stub URL when ``STRIPE_SECRET_KEY`` is not configured.
    """
    url = stripe_service.create_checkout_session(current_user.id, body.tier)
    return {"checkout_url": url}
@router.post("/webhook", response_model=dict)
 async def stripe_webhook(
    request: Request,
    stripe_signature: str = Header(default="", alias="Stripe-Signature"),
    db: AsyncSession = Depends(get_session),
 ) -> dict[str, bool]:
    """Handle Stripe webhook events.
    No JWT auth — authenticated via Stripe signature verification instead.
    Returns 200 immediately when Stripe is not configured (local dev).
    """
    payload = await request.body()
    await stripe_service.handle_webhook(payload, stripe_signature, db)
    return {"ok": True}
@router.get("/subscription", response_model=dict)
 async def get_subscription(
    current_user: UserProfile = Depends(get_current_user),
    db: AsyncSession = Depends(get_session),
 ) -> dict[str, Any]:
    """Return the current subscription info for the authenticated user."""
    sub = await stripe_service.get_subscription(current_user.id, db)
    if sub is None:
        return {
            "tier": current_user.tier,
            "status": "free",
            "stripe_subscription_id": None,
            "current_period_end": None,
        }
    return sub
@router.delete("/subscription", response_model=dict, status_code=status.HTTP_200_OK)
 async def cancel_subscription(
    current_user: UserProfile = Depends(get_current_user),
    db: AsyncSession = Depends(get_session),
 ) -> dict[str, bool]:
    """Cancel the active subscription."""
    await stripe_service.cancel_subscription(current_user.id, db)
    return {"ok": True}
--- a/app/api/routes/chat.py
+++ b/app/api/routes/chat.py
@@ -0,0 +1,59 @@
 """Chat routes: POST /chat (REST fallback) and POST /chat/embed (text → vector).
 WebSocket chat is handled by the unified device WS endpoint (/api/v1/ws/device).
 """
 from __future__ import annotations
 from fastapi import APIRouter, Depends
 from fastapi.responses import JSONResponse
 from pydantic import BaseModel
 from app.api.deps import get_current_user
 from app.core.deep_agent import run_home
 from app.core.llm import embed
 from app.schemas import ChatRequest, UserProfile
 router = APIRouter(prefix="/chat", tags=["chat"])
 # ── Embed helpers ─────────────────────────────────────────────────────────
 class _EmbedRequest(BaseModel):
    # Raw text to embed; passed as-is to ``embed`` by POST /chat/embed.
    text: str
 class _EmbedResponse(BaseModel):
    # Embedding returned by ``embed`` for the submitted text.
    vector: list[float]
 # ── Endpoints ─────────────────────────────────────────────────────────────
@router.post("")
 async def chat(
    body: ChatRequest,
    current_user: UserProfile = Depends(get_current_user),
 ) -> JSONResponse:
    """REST fallback for home chat when websocket streaming is unavailable."""
    response = await run_home(
        user_id=current_user.id,
        message=body.message,
        context=body.context.model_dump(),
    )
    return JSONResponse(content={"response": response})
@router.post("/embed", response_model=_EmbedResponse)
 async def embed_text(
    body: _EmbedRequest,
    current_user: UserProfile = Depends(get_current_user),
 ) -> _EmbedResponse:
    """Generate a 1536-dim embedding vector for the given text.
    Uses ``text-embedding-3-small`` via OpenAI.  Auth required (JWT).
    Used by Electron (vectordb.ts) for local note search.
    """
    vector = await embed(body.text)
    return _EmbedResponse(vector=vector)
--- a/app/api/routes/device_ws.py
+++ b/app/api/routes/device_ws.py
@@ -0,0 +1,417 @@
 """Device WebSocket endpoint.
 Persistent connection from Electron devices to the backend.
  WS  /api/v1/ws/device?token=<jwt>
 Auth: JWT passed as ``?token=`` query parameter (Bearer header is not
 available during the WebSocket handshake).
 Protocol:
  1. Client connects → JWT validated → connection accepted.
  2. Client sends ``device_hello`` frame: ``{ type, device_id, agent_ids }``.
  3. Backend registers the connection in ``DeviceConnectionManager``.
  4. Session enters message dispatch loop + heartbeat.
 Incoming frame dispatch:
  - ``tool_result``      → resolves a pending tool-call Future.
  - ``journey_start``    → starts a guided setup journey session.
  - ``journey_message``  → continues a journey conversation.
  - ``pong``             → heartbeat acknowledgement (updates last-seen).
  - unknown types        → logged, ignored.
 Outgoing heartbeat: ``{ "type": "ping" }`` every 30 s.
 On disconnect:
  - Unregisters from DeviceConnectionManager.
  - Marks all in-progress AgentRunLog rows for this user as ``error``
    with message "device disconnected".
 """
 from __future__ import annotations
 import asyncio
 import json
 import logging
 from uuid import uuid4
 from fastapi import APIRouter, WebSocket, WebSocketDisconnect
 from jose import JWTError, jwt
 from sqlalchemy import update
 from app.api.routes.agent_setup import handle_journey_message, handle_journey_start
 from app.config.settings import settings
 from app.core.agent_runner import trigger_pending_runs
 from app.core.deep_agent import run_floating_stream, run_home_stream
 from app.core.device_manager import device_manager
 from app.core.memory_middleware import MemoryMiddleware
 from app.core.output_formatter import StreamFormatter
 from app.core.ws_context import clear_client_executor, set_client_executor
 from app.db import async_session
 from app.models import AgentRunLog
 from app.schemas import WsFrameType
 logger = logging.getLogger(__name__)
 router = APIRouter(prefix="/ws", tags=["device-ws"])
 _HEARTBEAT_INTERVAL = 30  # seconds
 _PONG_TIMEOUT = 10  # seconds — grace window after a ping
@router.websocket("/device")
 async def device_ws(websocket: WebSocket) -> None:
    """Persistent WebSocket endpoint for Electron device connections.
    Authentication is via ``?token=<jwt>`` query parameter.
    """
    # ── 1. Authenticate before accepting ─────────────────────────────
    token = websocket.query_params.get("token", "")
    try:
        payload = jwt.decode(
            token, settings.JWT_SECRET, algorithms=[settings.JWT_ALGORITHM]
        )
        user_id: str | None = payload.get("sub")
        if not user_id:
            raise JWTError("missing sub")
    except JWTError:
        await websocket.close(code=1008)  # Policy Violation
        return
    await websocket.accept()
    # ── 2. Await device_hello frame ───────────────────────────────────
    try:
        raw = await asyncio.wait_for(websocket.receive_text(), timeout=15.0)
    except (asyncio.TimeoutError, WebSocketDisconnect):
        await websocket.close(code=1008)
        return
    try:
        hello = json.loads(raw)
        if hello.get("type") != WsFrameType.device_hello:
            raise ValueError("expected device_hello as first frame")
        device_id: str = hello["device_id"]
        agent_ids: list[str] = hello.get("agent_ids", [])
    except (KeyError, ValueError, json.JSONDecodeError) as exc:
        logger.warning("device_ws: invalid device_hello from user=%s: %s", user_id, exc)
        await websocket.close(code=1008)
        return
    # ── 3. Register connection ────────────────────────────────────────
    device_manager.register(user_id, device_id, websocket)
    logger.info(
        "device_ws: connected user=%s device=%s agents=%s",
        user_id,
        device_id,
        agent_ids,
    )
    # Trigger any overdue agent runs now that the device is connected.
    asyncio.create_task(trigger_pending_runs(user_id, device_id, device_manager))
    # ── 4. Concurrent message loop + heartbeat ────────────────────────
    try:
        await asyncio.gather(
            _message_loop(websocket, user_id),
            _heartbeat_loop(websocket),
        )
    except WebSocketDisconnect:
        pass
    except Exception as exc:
        logger.warning("device_ws: unhandled exception user=%s: %s", user_id, exc)
    finally:
        device_manager.unregister(user_id)
        logger.info("device_ws: disconnected user=%s device=%s", user_id, device_id)
        await _mark_runs_disconnected(user_id)
 # ── Message dispatch loop ─────────────────────────────────────────────
 async def _message_loop(websocket: WebSocket, user_id: str) -> None:
    """Receive frames from Electron and dispatch to the appropriate handler.
    Runs until ``websocket.iter_text()`` is exhausted.  Frames that are not
    valid JSON objects are logged and skipped.  ``tool_result`` frames are
    resolved inline; request/journey frames are handled in background tasks
    so a slow handler never blocks reception of further frames.
    """
    # Strong references to in-flight handler tasks: the event loop only holds
    # weak references, so an unreferenced task may be garbage-collected
    # before it finishes.
    in_flight: set[asyncio.Task] = set()
    def _spawn(handler, frame: dict) -> None:
        task = asyncio.create_task(handler(websocket, user_id, frame))
        in_flight.add(task)
        task.add_done_callback(in_flight.discard)
    async for raw in websocket.iter_text():
        try:
            frame = json.loads(raw)
        except json.JSONDecodeError:
            logger.warning("device_ws: invalid JSON from user=%s", user_id)
            continue
        # Valid JSON that is not an object (e.g. ``[]`` or ``"x"``) has no
        # ``.get``; skip it rather than letting AttributeError end the loop.
        if not isinstance(frame, dict):
            logger.warning("device_ws: non-object frame from user=%s", user_id)
            continue
        frame_type = frame.get("type")
        if frame_type == WsFrameType.tool_result:
            call_id = frame.get("id")
            if call_id:
                device_manager.resolve_pending_call(user_id, call_id, frame)
            else:
                logger.warning(
                    "device_ws: tool_result missing id from user=%s", user_id
                )
        elif frame_type == WsFrameType.home_request:
            _spawn(_handle_home_request, frame)
        elif frame_type == WsFrameType.floating_request:
            _spawn(_handle_floating_request, frame)
        elif frame_type == WsFrameType.journey_start:
            _spawn(_handle_journey_start, frame)
        elif frame_type == WsFrameType.journey_message:
            _spawn(_handle_journey_message, frame)
        elif frame_type == "pong":
            # Heartbeat ack — nothing to do, connection is alive.
            pass
        else:
            logger.debug(
                "device_ws: unknown frame type %r from user=%s", frame_type, user_id
            )
 # ── v3 Chat Handlers ──────────────────────────────────────────────────
 async def _make_ws_executor(websocket: WebSocket, user_id: str):
    """Return a callback that sends tool_call frames and awaits tool_result.
    The returned coroutine function takes a payload dict (which must contain
    an ``"id"`` key), stamps it with the ``tool_call`` frame type in place,
    sends it over ``websocket`` and returns the frame that resolves the
    matching pending call.
    """
    async def _executor(payload: dict) -> dict:
        payload["type"] = WsFrameType.tool_call
        # Serialise first so an unserialisable payload fails before anything
        # is registered.
        wire_frame = json.dumps(payload)
        # Register the pending call BEFORE sending: ``send_text`` yields to
        # the event loop, and a fast ``tool_result`` handled by the message
        # loop in that window would otherwise find nothing to resolve.
        future = device_manager.create_pending_call(user_id, payload["id"])
        # NOTE(review): if the send raises, the pending call stays registered
        # — confirm whether device_manager offers a way to discard it.
        await websocket.send_text(wire_frame)
        return await future
    return _executor
 async def _handle_home_request(
    websocket: WebSocket,
    user_id: str,
    frame: dict,
 ) -> None:
    """Handle a home_request frame — streams StreamFormatter output back on the socket.
    Steps: enrich the context via ``MemoryMiddleware``, stream the frames
    produced by ``run_home_stream`` to the client, then store the exchange
    as an episode.  ``request_id`` and ``session_id`` default to fresh UUIDs
    when the frame omits them.  Streaming errors are logged, not re-raised,
    so the episode is still stored with whatever text was collected.
    """
    request_id = frame.get("request_id") or str(uuid4())
    message: str = frame.get("message", "")
    session_id: str = frame.get("session_id") or str(uuid4())
    # Only the first 200 characters of the message are logged.
    logger.info(
        "device_ws: home_request_start user=%s req=%s session=%s msg=%s",
        user_id,
        request_id,
        session_id,
        message[:200],
    )
    # ── Memory: enrich context before LLM call ────────────────────────
    # The DB session is closed before streaming starts.
    async with async_session() as db:
        memory = MemoryMiddleware(db)
        memory_context = await memory.enrich_context(
            user_id,
            message,
            trace_id=request_id,
            session_id=session_id,
        )
    # ``memory_context`` is unpacked last, so its keys override the two
    # literal keys on a name clash.
    context: dict = {
        "conversation_history": frame.get("conversation_history", []),
        "_debug": {"request_id": request_id, "session_id": session_id, "user_id": user_id},
        **memory_context,
    }
    # Install the executor that forwards tool calls over this socket; it is
    # cleared again in the ``finally`` below.
    executor = await _make_ws_executor(websocket, user_id)
    set_client_executor(executor)
    response_chunks: list[str] = []
    try:
        event_stream = run_home_stream(user_id, message, context)
        formatter = StreamFormatter(request_id=request_id)
        async for ws_frame in formatter.format(event_stream):
            await websocket.send_text(ws_frame.model_dump_json())
            # Collect text chunks to build the full response for episode storage
            if ws_frame.type == "stream_text":  # type: ignore[union-attr]
                response_chunks.append(ws_frame.chunk)  # type: ignore[union-attr]
    except Exception as exc:
        # NOTE(review): the client gets no error frame here — confirm it can
        # detect an aborted stream on its own.
        logger.error(
            "device_ws: home_request failed user=%s req=%s: %s",
            user_id, request_id, exc,
        )
    finally:
        clear_client_executor()
    # ── Memory: store episode after response ──────────────────────────
    # Runs after failures too, with the partial (possibly empty) response.
    async with async_session() as db:
        memory = MemoryMiddleware(db)
        await memory.store_episode(
            user_id, session_id, message, "".join(response_chunks), trace_id=request_id
        )
    logger.info(
        "device_ws: home_request_end user=%s req=%s session=%s response_chars=%d",
        user_id,
        request_id,
        session_id,
        len("".join(response_chunks)),
    )
 async def _handle_floating_request(
    websocket: WebSocket,
    user_id: str,
    frame: dict,
 ) -> None:
    """Handle a floating_request frame — streams StreamFormatter output back on the socket.
    Same flow as the home handler: enrich the context via
    ``MemoryMiddleware``, stream the frames produced by
    ``run_floating_stream``, then store the exchange as an episode.  The
    context carries the frame's ``scope`` dict rather than a conversation
    history.  Streaming errors are logged, not re-raised.
    """
    request_id = frame.get("request_id") or str(uuid4())
    message: str = frame.get("message", "")
    session_id: str = frame.get("session_id") or str(uuid4())
    scope: dict = frame.get("scope", {})
    # Scope (as JSON) and message are each truncated to 200 chars in the log.
    logger.info(
        "device_ws: floating_request_start user=%s req=%s session=%s scope=%s msg=%s",
        user_id,
        request_id,
        session_id,
        json.dumps(scope, ensure_ascii=True)[:200],
        message[:200],
    )
    # ── Memory: enrich context before LLM call ────────────────────────
    # The DB session is closed before streaming starts.
    async with async_session() as db:
        memory = MemoryMiddleware(db)
        memory_context = await memory.enrich_context(
            user_id,
            message,
            trace_id=request_id,
            session_id=session_id,
        )
    # ``memory_context`` is unpacked last, so its keys override the two
    # literal keys on a name clash.
    context: dict = {
        "scope": scope,
        "_debug": {"request_id": request_id, "session_id": session_id, "user_id": user_id},
        **memory_context,
    }
    # Install the executor that forwards tool calls over this socket; it is
    # cleared again in the ``finally`` below.
    executor = await _make_ws_executor(websocket, user_id)
    set_client_executor(executor)
    response_chunks: list[str] = []
    try:
        event_stream = run_floating_stream(user_id, message, context)
        formatter = StreamFormatter(request_id=request_id)
        async for ws_frame in formatter.format(event_stream):
            await websocket.send_text(ws_frame.model_dump_json())
            # Keep the text chunks so the full response can be stored below.
            if ws_frame.type == "stream_text":  # type: ignore[union-attr]
                response_chunks.append(ws_frame.chunk)  # type: ignore[union-attr]
    except Exception as exc:
        # NOTE(review): the client gets no error frame here — confirm it can
        # detect an aborted stream on its own.
        logger.error(
            "device_ws: floating_request failed user=%s req=%s: %s",
            user_id, request_id, exc,
        )
    finally:
        clear_client_executor()
    # ── Memory: store episode after response ──────────────────────────
    # Runs after failures too, with the partial (possibly empty) response.
    async with async_session() as db:
        memory = MemoryMiddleware(db)
        await memory.store_episode(
            user_id, session_id, message, "".join(response_chunks), trace_id=request_id
        )
    logger.info(
        "device_ws: floating_request_end user=%s req=%s session=%s response_chars=%d",
        user_id,
        request_id,
        session_id,
        len("".join(response_chunks)),
    )
 # ── v4 Journey Handlers ─────────────────────────────────────────────
 async def _handle_journey_start(
    websocket: WebSocket,
    user_id: str,
    frame: dict,
 ) -> None:
    """Run ``handle_journey_start`` for a journey_start frame and send its reply.
    On any failure a terminal ``journey_reply`` (``done=True``) carrying the
    error text is sent instead.  The client executor is installed for the
    duration of the call and always cleared afterwards.
    """
    set_client_executor(await _make_ws_executor(websocket, user_id))
    try:
        outgoing = json.dumps(await handle_journey_start(user_id, frame))
        await websocket.send_text(outgoing)
    except Exception as exc:
        logger.error(
            "device_ws: journey_start failed user=%s: %s", user_id, exc
        )
        failure_reply = {
            "type": "journey_reply",
            "session_id": frame.get("session_id", ""),
            "message": f"Failed to start journey: {exc}",
            "done": True,
            "prompt_template": None,
        }
        await websocket.send_text(json.dumps(failure_reply))
    finally:
        clear_client_executor()
 async def _handle_journey_message(
    websocket: WebSocket,
    user_id: str,
    frame: dict,
 ) -> None:
    """Run ``handle_journey_message`` for a journey_message frame and send its reply.
    On any failure a terminal ``journey_reply`` (``done=True``) carrying the
    error text is sent instead.  The client executor is installed for the
    duration of the call and always cleared afterwards.
    """
    set_client_executor(await _make_ws_executor(websocket, user_id))
    try:
        outgoing = json.dumps(await handle_journey_message(user_id, frame))
        await websocket.send_text(outgoing)
    except Exception as exc:
        session_id = frame.get("session_id", "")
        logger.error(
            "device_ws: journey_message failed user=%s session=%s: %s",
            user_id, session_id, exc,
        )
        failure_reply = {
            "type": "journey_reply",
            "session_id": session_id,
            "message": f"Journey error: {exc}",
            "done": True,
            "prompt_template": None,
        }
        await websocket.send_text(json.dumps(failure_reply))
    finally:
        clear_client_executor()
 # ── Heartbeat ─────────────────────────────────────────────────────────
 async def _heartbeat_loop(websocket: WebSocket) -> None:
    """Send a ``ping`` frame every ``_HEARTBEAT_INTERVAL`` seconds, forever.
    The loop never returns on its own; it ends only when it is cancelled or
    when sending on ``websocket`` raises.
    """
    # The frame never changes, so serialise it once up front.
    ping_frame = json.dumps({"type": "ping"})
    while True:
        await asyncio.sleep(_HEARTBEAT_INTERVAL)
        await websocket.send_text(ping_frame)
 # ── Disconnect cleanup ────────────────────────────────────────────────
 async def _mark_runs_disconnected(user_id: str) -> None:
    """Flip every ``running`` AgentRunLog row of ``user_id`` to ``error``.
    Best-effort: any failure is logged and swallowed so disconnect cleanup
    never raises.
    """
    try:
        fail_running_runs = (
            update(AgentRunLog)
            .where(
                AgentRunLog.user_id == user_id,
                AgentRunLog.status == "running",
            )
            .values(status="error", errors=["device disconnected"])
        )
        async with async_session() as db:
            await db.execute(fail_running_runs)
            await db.commit()
    except Exception as exc:
        logger.error(
            "device_ws: failed to mark runs as disconnected for user=%s: %s",
            user_id,
            exc,
        )
--- a/app/billing/init.py
+++ b/app/billing/init.py
@@ -0,0 +1,4 @@
 from app.billing.stripe_service import stripe_service
 from app.billing.tier_manager import tier_manager
 __all__ = ["stripe_service", "tier_manager"]
--- a/services/billing/app/stripe_service.py
+++ b/services/billing/app/stripe_service.py
@@ -1,7 +1,7 @@
 """Stripe service: checkout sessions, webhook handling, subscription management.
-Adapted for the Billing microservice — uses shared.models and shared.db.
+Subscription records are persisted in the PostgreSQL ``subscriptions`` table.
-All Stripe calls are gracefully stubbed when STRIPE_SECRET_KEY is not
+All Stripe calls are gracefully stubbed when ``STRIPE_SECRET_KEY`` is not
 configured, enabling local development without live credentials.
 """
@@ -15,8 +15,7 @@ from fastapi import HTTPException, status
 from sqlalchemy import select
 from sqlalchemy.ext.asyncio import AsyncSession
-from shared.config import settings
+from app.config.settings import settings
 from shared.models import Subscription
 # Stripe price IDs per tier — replace with real IDs in production .env
 TIER_PRICE_IDS: dict[str, str] = {
@@ -47,7 +46,11 @@ class StripeService:
        success_url: str = "https://app.adiuva.app/billing/success?session_id={CHECKOUT_SESSION_ID}",
        cancel_url: str = "https://app.adiuva.app/billing/cancel",
    ) -> str:
-        """Create a Stripe checkout session and return the URL."""
+        """Create a Stripe checkout session and return the URL.
        Returns a stub URL when Stripe is not configured.
        Raises ``HTTP 400`` for the free tier or an unknown tier.
        """
        if tier == "free":
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
@@ -84,6 +87,8 @@ class StripeService:
        """Process a Stripe webhook event.
        Verifies the signature, then dispatches on event type.
        Raises ``HTTP 400`` on signature mismatch.
        No-ops when Stripe is not configured.
        """
        if not self._configured():
            return
@@ -150,7 +155,9 @@ class StripeService:
    async def get_subscription(
        self, user_id: str, db: AsyncSession
    ) -> dict[str, Any] | None:
-        """Return the subscription record for user_id, or None."""
+        """Return the subscription record for ``user_id``, or ``None`` if absent."""
        from app.models import Subscription  # noqa: PLC0415
        result = await db.execute(
            select(Subscription).where(Subscription.user_id == user_id)
        )
@@ -169,7 +176,12 @@ class StripeService:
        }
    async def cancel_subscription(self, user_id: str, db: AsyncSession) -> None:
-        """Cancel the user's Stripe subscription and downgrade to free."""
+        """Cancel the user's Stripe subscription and downgrade them to free.
        Raises ``HTTP 404`` when no active subscription exists.
        """
        from app.models import Subscription  # noqa: PLC0415
        result = await db.execute(
            select(Subscription).where(Subscription.user_id == user_id)
        )
@@ -199,6 +211,8 @@ class StripeService:
        sub_status: str,
        current_period_end: datetime | None,
    ) -> None:
        from app.models import Subscription  # noqa: PLC0415
        result = await db.execute(
            select(Subscription).where(Subscription.user_id == user_id)
        )
@@ -220,6 +234,8 @@ class StripeService:
        status: str | None = None,
        current_period_end: datetime | None = None,
    ) -> None:
        from app.models import Subscription  # noqa: PLC0415
        result = await db.execute(
            select(Subscription).where(
                Subscription.stripe_subscription_id == stripe_subscription_id
@@ -236,5 +252,5 @@ class StripeService:
            sub.current_period_end = current_period_end
-# Module-level singleton
+# Module-level singleton shared across the app.
 stripe_service = StripeService()
--- a/app/billing/tier_manager.py
+++ b/app/billing/tier_manager.py
@@ -0,0 +1,118 @@
 """Tier manager: feature matrix and quota enforcement.
 ``TierManager`` is the single source of truth for what each billing tier
 allows.  ``get_tier`` queries the ``subscriptions`` table for the live tier.
 Quota-enforcement helpers take ``tier`` directly — the caller already has it
 from ``current_user.tier`` (provided by ``get_current_user``).
 """
 from __future__ import annotations
 from typing import Any
 from fastapi import HTTPException, status
 from sqlalchemy import select
 from sqlalchemy.ext.asyncio import AsyncSession
 from app.schemas import BillingTier
 # Feature matrix per tier.  -1 means unlimited; 0 means disabled.
 # Values are either numeric limits or boolean switches.  Every tier defines
 # the same set of keys.  ``TierManager.check_feature`` falls back to the
 # "free" row for a tier name that is not listed here.
 FEATURES: dict[str, dict[str, Any]] = {
    "free": {
        "agents": 3,
        "batch_active": 2,
        "batch_runs_per_day": 5,
        "providers": 1,
        "batch_builder": False,
        "sso": False,
    },
    "pro": {
        "agents": -1,           # unlimited
        "batch_active": 10,
        "batch_runs_per_day": 50,
        "providers": -1,
        "batch_builder": False,
        "sso": False,
    },
    "power": {
        "agents": -1,
        "batch_active": -1,     # unlimited
        "batch_runs_per_day": -1,  # unlimited
        "providers": -1,
        "batch_builder": True,
        "sso": False,
    },
    "team": {
        "agents": -1,
        "batch_active": -1,
        "batch_runs_per_day": -1,  # unlimited
        "providers": -1,
        "batch_builder": True,
        "sso": True,
    },
 }
 # Requests-per-minute limit per tier.
 # ``TierManager.get_rate_limit`` returns the "free" value for unknown tiers.
 RATE_LIMITS: dict[str, int] = {
    "free": 20,
    "pro": 60,
    "power": 120,
    "team": 200,
 }
 class TierManager:
    """Single entry point for tier lookup, feature gating and rate limits."""
    # ── Tier lookup ─────────────────────────────────────────────────────
    async def get_tier(self, user_id: str, db: AsyncSession) -> BillingTier:
        """Look up the billing tier stored for ``user_id``.
        When there is no subscription row, or the stored tier is not a key of
        ``FEATURES``, the result is ``'power'`` if ``settings.ENV`` is
        ``'dev'`` and ``'free'`` otherwise.
        """
        from app.models import Subscription  # noqa: PLC0415
        from app.config.settings import settings  # noqa: PLC0415
        query = select(Subscription.tier).where(Subscription.user_id == user_id)
        stored_tier: str | None = (await db.execute(query)).scalar_one_or_none()
        if stored_tier is not None and stored_tier in FEATURES:
            return stored_tier  # type: ignore[return-value]
        return "power" if settings.ENV == "dev" else "free"
    # ── Feature access ───────────────────────────────────────────────────
    def check_feature(self, tier: BillingTier, feature: str) -> bool:
        """Tell whether ``feature`` is switched on for ``tier``.
        Unknown tiers are judged by the ``free`` row; unknown features are
        off.  Boolean entries are returned as-is; numeric entries count as
        enabled unless they are exactly 0.
        """
        tier_features = FEATURES.get(tier, FEATURES["free"])
        setting = tier_features.get(feature)
        if setting is None:
            return False
        return setting if isinstance(setting, bool) else setting != 0
    def require_feature(self, tier: BillingTier, feature: str, tier_name: str = "") -> None:
        """Raise ``HTTP 403`` unless ``tier`` has ``feature`` enabled.
        ``tier_name`` only affects the wording of the error detail.
        """
        if self.check_feature(tier, feature):
            return
        if tier_name:
            detail = f"Feature '{feature}' requires {tier_name} tier or above."
        else:
            detail = f"Feature '{feature}' is not available on your current tier."
        raise HTTPException(status_code=status.HTTP_403_FORBIDDEN, detail=detail)
    # ── Rate limiting ────────────────────────────────────────────────────
    def get_rate_limit(self, tier: BillingTier) -> int:
        """Return the requests-per-minute cap for ``tier`` (``free`` cap if unknown)."""
        fallback = RATE_LIMITS["free"]
        return RATE_LIMITS.get(tier, fallback)
 # Module-level singleton shared across the app.
 tier_manager = TierManager()
--- a/services/chat/app/init.py
+++ b/services/chat/app/init.py
--- a/app/config/settings.py
+++ b/app/config/settings.py
@@ -0,0 +1,53 @@
 from typing import Literal
 from pydantic_settings import BaseSettings, SettingsConfigDict
 class Settings(BaseSettings):
    # Application configuration.  Values are read from the environment and
    # from a UTF-8 ``.env`` file (see ``model_config`` at the bottom); the
    # literals below are the defaults used when a variable is not set.
    DATABASE_URL: str = "postgresql+asyncpg://postgres:postgres@localhost:5432/adiuva"
    # Signing key and algorithm for the JWTs issued and verified by the app.
    # The default secret is a placeholder and must be overridden outside
    # local development.
    JWT_SECRET: str = "change-me-in-production"
    JWT_ALGORITHM: str = "HS256"
    JWT_ACCESS_TOKEN_EXPIRE_MINUTES: int = 30
    JWT_REFRESH_TOKEN_EXPIRE_DAYS: int = 30
    # Stripe credentials; empty by default.
    STRIPE_SECRET_KEY: str = ""
    STRIPE_WEBHOOK_SECRET: str = ""
    # LLM provider API keys; empty by default.
    OPENAI_API_KEY: str = ""
    ANTHROPIC_API_KEY: str = ""
    GOOGLE_API_KEY: str = ""
    CEREBRAS_API_KEY: str = ""
    # Model identifiers — presumably main chat, routing and embedding models
    # respectively; confirm against app.core.llm.
    LLM_MODEL: str = "gpt-4o"
    LLM_ROUTER_MODEL: str = "gpt-4o-mini"
    LLM_EMBED_MODEL: str = "text-embedding-3-small"
    # GitHub Copilot OAuth token storage directory.
    # Leave empty to use the LiteLLM default (~/.config/litellm/github_copilot).
    # In Docker, set this to a path backed by a named volume so tokens survive restarts.
    GITHUB_COPILOT_TOKEN_DIR: str = ""
    # OAuth client credentials — used for Gmail and Microsoft (Outlook/Teams) flows.
    GMAIL_CLIENT_ID: str = ""
    GMAIL_CLIENT_SECRET: str = ""
    MS_CLIENT_ID: str = ""
    MS_CLIENT_SECRET: str = ""
    # MS_TENANT_ID: set to 'common' to allow multi-tenant (personal + work accounts).
    MS_TENANT_ID: str = "common"
    # Fernet key (URL-safe base64, 32-byte key) for at-rest encryption of OAuth
    # tokens stored in cloud_agent_configs.oauth_token_encrypted.
    # Generate with: from cryptography.fernet import Fernet; Fernet.generate_key()
    OAUTH_ENCRYPTION_KEY: str = ""
    # Allowed CORS origins — presumably consumed by the CORS middleware set up
    # elsewhere; verify against the app factory.
    CORS_ORIGINS: list[str] = ["app://.", "http://localhost:3000", "http://localhost:5173"]
    # Langfuse credentials and host; the keys are empty by default.
    LANGFUSE_SECRET_KEY: str = ""
    LANGFUSE_PUBLIC_KEY: str = ""
    LANGFUSE_HOST: str = "https://cloud.langfuse.com"
    # Deployment environment.  "dev" makes the tier lookup fall back to the
    # unlimited "power" tier when a user has no subscription row.
    ENV: Literal["dev", "prod"] = "dev"
    model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8")
 settings = Settings()
--- a/services/ws-gateway/app/init.py
+++ b/services/ws-gateway/app/init.py
--- a/app/core/agent_registry.py
+++ b/app/core/agent_registry.py
@@ -0,0 +1,30 @@
 """Minimal agent base types retained for compatibility with batch runners."""
 from __future__ import annotations
 from abc import ABC, abstractmethod
 from typing import Any
 class BaseAgent(ABC):
    """Common base for non-chat agents still using the old base contract.

    Subclasses must implement :meth:`get_name` and :meth:`get_description`.
    The constructor stores the per-run context on the instance:

    * ``user_id`` — stored as given (defaults to an empty string).
    * ``shared_memory`` — the caller's dict is kept by reference, so writes
      made through the agent are visible to the caller; a new empty dict is
      created only when ``None`` is passed.
    * ``vector_store_context`` — likewise kept by reference; a new empty list
      is created only when ``None`` is passed.
    """
    def __init__(
        self,
        user_id: str = "",
        shared_memory: dict[str, Any] | None = None,
        vector_store_context: list[str] | None = None,
    ) -> None:
        self.user_id = user_id
        # Test against None explicitly: ``shared_memory or {}`` would swap a
        # caller-supplied *empty* dict for a fresh one and silently break the
        # sharing the caller asked for.
        self.shared_memory: dict[str, Any] = (
            {} if shared_memory is None else shared_memory
        )
        self.vector_store_context: list[str] = (
            [] if vector_store_context is None else vector_store_context
        )
    @abstractmethod
    def get_name(self) -> str:
        """Return the agent's name (supplied by the concrete subclass)."""
        ...
    @abstractmethod
    def get_description(self) -> str:
        """Return the agent's description (supplied by the concrete subclass)."""
        ...
    @property
    def skills(self) -> list[str]:
        """Skills of the agent; the base implementation returns an empty list."""
        return []
--- a/services/batch-agent/app/agent_runner.py
+++ b/services/batch-agent/app/agent_runner.py
@@ -1,12 +1,27 @@
-"""Agent run orchestrator — adapted for Batch Agent Service.
+"""Agent run orchestrator.
-Key changes from monolith app/core/agent_runner.py:
+Drives two agent types:
-  - No DeviceConnectionManager — tool calls go through Redis ws_context.
+
-  - set_current_user / clear_current_user replace set_client_executor.
+* **Local directory agent** — two-step execution per file:
-  - run_local_agent accepts a serialized dict (from Redis / REST) instead
+  Step 1 (Classification) uses code to fetch all projects and asks the LLM
-    of SQLAlchemy model objects.
+  to identify which project the file belongs to and which domains are relevant.
-  - _finalize_run writes to PostgreSQL via shared.db.async_session.
+  Step 2 (Processing) fetches existing entities for that project/domains via
-  - Cloud agent import path changed to app.integrations.
+  code and runs an LLM with tools — existing data in context enforces
  update-first naturally.
 * **Cloud connector agent** — fetches data from third-party APIs (Gmail,
  Teams, Outlook) and pushes extracted items to Electron.
 Usage
 -----
 Background tasks are spawned with ``asyncio.create_task()``::
    asyncio.create_task(run_local_agent(user_id, config, run_log, device_manager))
    asyncio.create_task(trigger_pending_runs(user_id, device_id, device_manager))
 The ``trigger_pending_runs`` function is called by the device WS endpoint
 when Electron sends ``device_hello``, so any overdue runs fire immediately
 when the device reconnects.
 """
 from __future__ import annotations
@@ -18,37 +33,48 @@ import uuid
 from datetime import datetime, timedelta, timezone
 from typing import Any
 from croniter import croniter
 from langchain_core.messages import AIMessage, HumanMessage, SystemMessage, ToolMessage
 from sqlalchemy import select
 from app.agents.filesystem_agent import FILESYSTEM_TOOLS
-from shared.agents.note_agent import NOTE_TOOLS
+from app.agents.note_agent import NOTE_TOOLS
-from shared.agents.project_agent import PROJECT_TOOLS
+from app.agents.project_agent import PROJECT_TOOLS
-from shared.agents.task_agent import TASK_TOOLS
+from app.agents.task_agent import TASK_TOOLS
-from shared.agents.timeline_agent import TIMELINE_TOOLS
+from app.agents.timeline_agent import TIMELINE_TOOLS
-from shared.llm import get_llm
+from app.config.settings import settings
-from shared.ws_context import execute_on_client, set_current_user, clear_current_user
+from app.core.device_manager import DeviceConnectionManager
-import app.tracing as tracing
+from app.core.langfuse_client import extract_usage, get_langfuse, get_prompt_or_fallback
-from shared.db import async_session
+from app.core.llm import get_llm
-from shared.models import AgentRunLog, CloudAgentConfig, LocalAgentConfig
+from app.core.ws_context import clear_client_executor, execute_on_client, set_client_executor
-from shared.redis import redis_client, ws_out_channel
+from app.db import async_session
 from app.models import AgentRunLog, CloudAgentConfig, LocalAgentConfig
 logger = logging.getLogger(__name__)
 # ── Concurrency guard ─────────────────────────────────────────────────────
 # Tracks agent IDs that currently have a run in progress.
 # Prevents multiple simultaneous runs of the same agent within a single process.
 _running_agents: set[str] = set()
 def is_agent_running(agent_id: str) -> bool:
    """Report whether a run for *agent_id* is currently registered.

    This is a plain membership test against the module-level
    ``_running_agents`` set; the set itself is not modified.
    """
    already_active = agent_id in _running_agents
    return already_active
 # ── Timeouts ───────────────────────────────────────────────────────────────
 # Max seconds to wait for a single tool-call round-trip (FE → BE).
 _TOOL_CALL_TIMEOUT: int = 30
 # Max LLM reasoning steps for Step 2 processing.
 _MAX_PROCESSING_STEPS: int = 12
 # Max directory recursion depth during scan.
 _MAX_SCAN_DEPTH: int = 5
 # ── Data-type to tool mapping ─────────────────────────────────────────────
 # NOTE: "projects" is intentionally excluded — project creation/assignment is
 # handled in code by the runner, never delegated to the Step 2 LLM.
 _DATA_TYPE_TOOLS: dict[str, list[Any]] = {
    "tasks": TASK_TOOLS,
    "notes": NOTE_TOOLS,
@@ -76,7 +102,7 @@ _DOMAIN_DESCRIPTIONS: dict[str, str] = {
    ),
 }
-_STEP1_SYSTEM_PROMPT = """\
+_BATCH_FILE_CLASSIFIER_PROMPT = """\
 You are a file classifier for a freelance project management tool.
 Your job is to match a file to an existing project and identify which data domains to extract.
@@ -107,7 +133,7 @@ Respond ONLY with a JSON object — no markdown, no explanation:
 # ── Step 2: Processing prompt ─────────────────────────────────────────────
-_PROCESSING_SYSTEM_PROMPT = """\
+_BATCH_PROCESSING_PROMPT = """\
 You are a data extraction assistant for a freelance project management tool.
 Your task: extract structured data from the file content and persist it using the available tools.
@@ -134,9 +160,9 @@ Domains to extract: {data_types}
 {custom_prompt_section}
 """
-# ── Cloud processing prompt ───────────────────────────────────────────────
+# ── Cloud processing prompt (kept separate for cloud agent) ───────────────
-_CLOUD_PROCESSING_PROMPT = """\
+_BATCH_CLOUD_PROCESSING_PROMPT = """\
 You are a data extraction and management assistant for a freelance project
 management tool.
@@ -167,6 +193,56 @@ and what you created.
 """
 # ── Cron helper ────────────────────────────────────────────────────────────
 def _is_overdue(schedule_cron: str, last_run_at: datetime | None) -> bool:
    """Tell whether a cron-scheduled run is due.

    A schedule that has never run (``last_run_at is None``) is due as soon as
    its expression can be parsed.  Otherwise the run is due once the first
    cron tick after ``last_run_at`` is no longer in the future; a naive
    ``last_run_at`` is interpreted as UTC.

    Any exception raised while handling the expression is logged as a warning
    and reported as "not due", so an unparseable schedule never fires.
    """
    try:
        current = datetime.now(timezone.utc)
        if last_run_at is not None:
            # Naive timestamps are treated as UTC before being compared.
            anchor = (
                last_run_at
                if last_run_at.tzinfo is not None
                else last_run_at.replace(tzinfo=timezone.utc)
            )
            upcoming: datetime = croniter(schedule_cron, anchor).get_next(datetime)
            return current >= upcoming
        # Never ran before: building the iterator only validates the expression.
        croniter(schedule_cron, current)
        return True
    except Exception as exc:
        logger.warning("agent_runner: cannot parse cron %r: %s", schedule_cron, exc)
        return False
 # ── WS executor for agent context ─────────────────────────────────────────
 def _make_agent_executor(
    user_id: str,
    device_mgr: DeviceConnectionManager,
    run_context: dict | None = None,
 ) -> Any:
    """Build the async callback handed to ``set_client_executor()``.

    The returned coroutine function turns a payload into a ``tool_call``
    frame, sends it through *device_mgr* for *user_id*, and waits up to
    ``_TOOL_CALL_TIMEOUT`` seconds for the matching pending call to resolve.

    When *run_context* is truthy it is attached to every frame under the
    ``run_context`` key.  The payload dict is modified in place.
    """
    async def _executor(payload: dict) -> dict:
        # Stamp the frame type first, then the optional run attribution.
        frame_fields: dict = {"type": "tool_call"}
        if run_context:
            frame_fields["run_context"] = run_context
        payload.update(frame_fields)
        # Register the pending call before sending the frame.
        pending = device_mgr.create_pending_call(user_id, payload["id"])
        await device_mgr.send_frame(user_id, payload)
        reply = await asyncio.wait_for(pending, timeout=_TOOL_CALL_TIMEOUT)
        return reply
    return _executor
 # ── LLM tool-calling loop ─────────────────────────────────────────────────
@@ -194,11 +270,13 @@ async def _run_agent_with_tools(
    user_message: str,
    tools: list[Any],
    max_steps: int,
-    langfuse_handler: Any | None = None,
+    user_id: str = "",
    langfuse_prompt: Any = None,
    agent_name: str = "batch-agent",
 ) -> str:
    """Run an LLM agent with tool-calling, returning the final text response."""
-    callbacks = [langfuse_handler] if langfuse_handler else None
+    lf = get_langfuse()
-    llm = get_llm(callbacks=callbacks)
+    llm = get_llm()
    llm_with_tools = llm.bind_tools(tools)
    messages: list[Any] = [
        SystemMessage(content=system_prompt),
@@ -207,12 +285,42 @@ async def _run_agent_with_tools(
    tool_map = {tool_def.name: tool_def for tool_def in tools}
    _span_ctx = (
        lf.start_as_current_observation(
            as_type="span",
            name=agent_name,
            user_id=user_id or None,
            input=user_message,
        )
        if lf else None
    )
    _span = _span_ctx.__enter__() if _span_ctx else None
    try:
        for _ in range(max_steps):
            _gen_ctx = (
                lf.start_as_current_observation(
                    as_type="generation",
                    name=f"{agent_name}-llm",
                    model=settings.LLM_MODEL,
                    prompt=langfuse_prompt,
                    input=messages,
                )
                if lf else None
            )
            _gen = _gen_ctx.__enter__() if _gen_ctx else None
            response: AIMessage = await llm_with_tools.ainvoke(messages)
            if _gen_ctx:
                _gen.update(output=_as_text(response.content), usage=extract_usage(response))
                _gen_ctx.__exit__(None, None, None)
            messages.append(response)
            if not response.tool_calls:
-            return _as_text(response.content)
+                final_text = _as_text(response.content)
                if _span:
                    _span.update(output=final_text)
                return final_text
            for call in response.tool_calls:
                call_id = str(call.get("id", ""))
@@ -238,13 +346,22 @@ async def _run_agent_with_tools(
                messages.append(ToolMessage(content=str(tool_output), tool_call_id=call["id"]))
        final = await llm.ainvoke(messages)
-    return _as_text(final.content)
+        final_text = _as_text(final.content)
        if _span:
            _span.update(output=final_text)
        return final_text
    finally:
        if _span_ctx:
            _span_ctx.__exit__(None, None, None)
        if lf:
            lf.flush()
 # ── Tool list builder ─────────────────────────────────────────────────────
 def _build_processing_tools(data_types: list[str]) -> list[Any]:
    """Build the tool list for processing based on user's data_types selection."""
    tools: list[Any] = list(FILESYSTEM_TOOLS)
    for dt in data_types:
        dt_tools = _DATA_TYPE_TOOLS.get(dt)
@@ -261,6 +378,12 @@ async def _scan_directories(
    extensions: list[str],
    last_run_at: datetime | None,
 ) -> list[str]:
    """Walk directories via WS tool calls and return filtered file paths.
    Recursion is capped at ``_MAX_SCAN_DEPTH``.  Files are filtered by
    extension (if configured) and by modification date (if ``last_run_at``
    is set).  Fails open: if metadata cannot be read, the file is included.
    """
    all_files: list[str] = []
    ext_set = {e.lstrip(".").lower() for e in extensions} if extensions else set()
@@ -292,6 +415,7 @@ async def _scan_directories(
    if last_run_at is None:
        return all_files
    # Filter by modification date.
    last_run_ms = int(last_run_at.timestamp() * 1000)
    filtered: list[str] = []
    for file_path in all_files:
@@ -308,7 +432,7 @@ async def _scan_directories(
            if mod_ms > last_run_ms:
                filtered.append(file_path)
        except Exception:
-            filtered.append(file_path)
+            filtered.append(file_path)  # fail-open
    return filtered
@@ -317,6 +441,7 @@ async def _scan_directories(
 async def _fetch_projects() -> list[dict]:
    """Fetch all projects from the Electron client via WS."""
    try:
        result = await execute_on_client(action="select", table="projects")
        return result.get("rows", [])
@@ -334,6 +459,7 @@ _DOMAIN_TABLE: dict[str, str] = {
 async def _fetch_domain_entities(domain: str, project_id: str) -> list[dict]:
    """Fetch existing rows for a domain, scoped to a project where applicable."""
    table = _DOMAIN_TABLE.get(domain)
    if not table:
        return []
@@ -353,6 +479,12 @@ async def _fetch_domain_entities(domain: str, project_id: str) -> list[dict]:
 def _format_entities_for_context(domain: str, rows: list[dict]) -> str:
    """Format existing entity rows as a readable context block for the LLM.
    Includes enough detail per record for the LLM to make a confident
    update-vs-create decision without overwhelming the context.
    Note content is truncated to 200 chars to stay within token budget.
    """
    if not rows:
        return f"No existing {domain}."
    lines: list[str] = []
@@ -399,9 +531,14 @@ async def _classify_file(
    file_content: str,
    projects: list[dict],
    config_data_types: list[str],
    langfuse_handler: Any | None = None,
    custom_system_prompt: str | None = None,
 ) -> tuple[str, list[str], str | None]:
    """Call the LLM to classify a file by project and relevant domains.
    Returns ``(project_id_or_"new", domains, new_project_name_or_None)``.
    - ``project_id`` is an existing project UUID, or ``"new"`` when no match found.
    - ``new_project_name`` is only set when ``project_id == "new"``.
    Falls back to ``("new", config_data_types, None)`` on any error.
    """
    fallback: tuple[str, list[str], str | None] = ("new", list(config_data_types), None)
    if not file_content.strip():
@@ -422,34 +559,42 @@ async def _classify_file(
        if d in _DOMAIN_DESCRIPTIONS
    )
-    if custom_system_prompt:
+    step1_template, step1_prompt_obj = get_prompt_or_fallback(
-        # Fixture-provided prompt takes absolute priority
+        "batch_file_classifier", _BATCH_FILE_CLASSIFIER_PROMPT
        system = custom_system_prompt.format_map(
            {"domain_definitions": domain_definitions, "projects_list": projects_list}
    )
-    else:
+    system = step1_template.format(
-        system = tracing.compile_prompt(
+        domain_definitions=domain_definitions,
-            "batch_file_classifier",
+        projects_list=projects_list,
            fallback=_STEP1_SYSTEM_PROMPT,
            variables={
                "domain_definitions": domain_definitions,
                "projects_list": projects_list,
            },
    )
-    llm = get_llm(callbacks=[langfuse_handler] if langfuse_handler else None)
+    lf = get_langfuse()
-    try:
+    llm = get_llm()
-        response = await llm.ainvoke([
+    classifier_messages = [
        SystemMessage(content=system),
        HumanMessage(content=f"File: {file_path}\n\nContent:\n{file_content[:4000]}"),
-        ])
+    ]
    try:
        if lf:
            with lf.start_as_current_observation(
                as_type="generation",
                name="step1-classifier",
                model=settings.LLM_ROUTER_MODEL,
                prompt=step1_prompt_obj,
                input=classifier_messages,
            ) as gen:
                response = await llm.ainvoke(classifier_messages)
                gen.update(output=_as_text(response.content), usage=extract_usage(response))
        else:
            response = await llm.ainvoke(classifier_messages)
        raw = _as_text(response.content).strip()
        # Strip markdown fences if the model wraps the JSON.
        if raw.startswith("```"):
            raw = raw.split("```")[1]
            if raw.startswith("json"):
                raw = raw[4:]
        parsed = json.loads(raw.strip())
        raw_project_id: str = str(parsed.get("project_id") or "new")
        # Reject hallucinated UUIDs — only accept ids that exist in the fetched list.
        project_id = raw_project_id if raw_project_id in valid_project_ids else "new"
        new_project_name: str | None = (
            str(parsed["new_project_name"]).strip() or None
@@ -473,104 +618,114 @@ async def _classify_file(
 # ── Local agent runner (two-step per file) ────────────────────────────────
-async def run_local_agent(user_id: str, trigger_data: dict[str, Any], *, langfuse_handler: Any | None = None) -> None:
+async def run_local_agent(
-    """Execute a local directory agent run.
+    user_id: str,
    config: LocalAgentConfig,
    run_log: AgentRunLog,
    device_mgr: DeviceConnectionManager,
    run_context: dict | None = None,
 ) -> None:
    """Execute a local directory agent run using a two-step approach per file.
-    In the microservice world, trigger_data is a serialized dict from
+    Step 1 — Classification (code + 1 LLM call per file, no tools):
-    the REST route (forwarded via Redis), containing the agent config
+        Code scans directories and fetches all projects via WS.
-    fields and run_context.
+        For each file, LLM identifies the project and relevant domains.
-    set_current_user() must be called BEFORE this function.
+    Step 2 — Processing (code + 1 LLM call per file, with tools):
        Code fetches existing entities for the identified project/domains.
        LLM receives file content + existing entities in context and uses
        tools to update existing records or create new ones.
    """
-    run_context: dict = trigger_data.get("run_context", {})
+    run_id = run_log.id
-    agent_id = run_context.get("agent_id", str(uuid.uuid4()))
+    agent_id = (run_context or {}).get("agent_id") or config.id
    run_id = run_context.get("run_id")
    _running_agents.add(agent_id)
-    # Extract config from trigger payload
+    # ── Device online check ─────────────────────────────────────────
-    directory_paths: list[str] = trigger_data.get("directory_paths", [])
+    target_device_id = config.device_id.strip() if isinstance(config.device_id, str) else ""
-    if not directory_paths:
+    is_online = (
-        directory = trigger_data.get("directory", "")
+        device_mgr.is_online(user_id, target_device_id)
-        if directory:
+        if target_device_id
-            directory_paths = [directory]
+        else device_mgr.is_online(user_id)
    )
-    data_types: list[str] = trigger_data.get("data_types", [])
+    if not is_online:
-    file_extensions: list[str] = trigger_data.get("file_extensions", [])
+        logger.info(
-    prompt_template: str = trigger_data.get("prompt_template", "")
+            "agent_runner: skip run=%s — device %r offline for user=%s",
-    last_run_at_raw = trigger_data.get("last_run_at")
+            run_id,
-    last_run_at: datetime | None = None
+            target_device_id or "<any>",
-    if last_run_at_raw:
+            user_id,
-        if isinstance(last_run_at_raw, str):
+        )
-            last_run_at = datetime.fromisoformat(last_run_at_raw)
+        await _finalize_run(
-        elif isinstance(last_run_at_raw, (int, float)):
+            run_log,
-            last_run_at = datetime.fromtimestamp(last_run_at_raw / 1000, tz=timezone.utc)
+            status="error",
            errors=[f"Device {target_device_id or '<any>'!r} is not connected"],
        )
        return
    # ── Set up WS executor for tools ────────────────────────────────
    executor = _make_agent_executor(user_id, device_mgr, run_context)
    set_client_executor(executor)
    errors: list[str] = []
    items_processed = 0
    items_created = 0
    custom_section = (
-        f"User instructions:\n{prompt_template}"
+        f"User instructions:\n{config.prompt_template}"
-        if prompt_template
+        if config.prompt_template
        else ""
    )
    # Create or load run log
    run_log_id = run_id
    if not run_log_id:
        async with async_session() as db:
            run_log = AgentRunLog(
                agent_id=agent_id,
                agent_type="local",
                user_id=user_id,
                status="running",
            )
            db.add(run_log)
            await db.commit()
            await db.refresh(run_log)
            run_log_id = run_log.id
    try:
-        # ── Scan directories ─────────────────────────────────────────
+        # ── Code: scan directories ───────────────────────────────────
-        logger.info("agent_runner: run=%s scanning directories user=%s", run_log_id, user_id)
+        logger.info("agent_runner: run=%s scanning directories user=%s", run_id, user_id)
        file_paths = await _scan_directories(
-            paths=directory_paths,
+            paths=config.directory_paths,
-            extensions=file_extensions,
+            extensions=config.file_extensions or [],
-            last_run_at=last_run_at,
+            last_run_at=config.last_run_at,
        )
        logger.info(
-            "agent_runner: run=%s found %d file(s) after filtering", run_log_id, len(file_paths)
+            "agent_runner: run=%s found %d file(s) after filtering", run_id, len(file_paths)
        )
        if not file_paths:
-            await _finalize_run(run_log_id, status="success", items_processed=0, items_created=0)
+            await _finalize_run(run_log, status="success", items_processed=0, items_created=0)
            return
-        # ── Fetch all projects once ──────────────────────────────────
+        # ── Code: fetch all projects once ────────────────────────────
        projects = await _fetch_projects()
        for file_path in file_paths:
            try:
                # Read file content via code.
                file_result = await execute_on_client(
                    action="read_file_content", data={"path": file_path}
                )
                file_content: str = file_result.get("content", "")
                if not file_content:
                    logger.debug("agent_runner: run=%s skipping empty file %r", run_id, file_path)
                    continue
                items_processed += 1
-                # Step 1 — classify file
+                # Step 1 — classify file.
                project_id, domains, new_project_name = await _classify_file(
                    file_path=file_path,
                    file_content=file_content,
                    projects=projects,
-                    config_data_types=data_types,
+                    config_data_types=config.data_types,
-                    langfuse_handler=langfuse_handler,
+                )
                logger.info(
                    "agent_runner: run=%s file=%r → project=%s new_name=%r domains=%s",
                    run_id,
                    file_path,
                    project_id,
                    new_project_name,
                    domains,
                )
-                # Step 2 — resolve project_id, fetch entities, process
+                # Step 2 — resolve project_id via CODE, then fetch entities.
                # Project creation is NEVER delegated to the Step 2 LLM.
                if project_id == "new":
                    proj_name = new_project_name or "Untitled Project"
                    try:
@@ -581,10 +736,18 @@ async def run_local_agent(user_id: str, trigger_data: dict[str, Any], *, langfus
                        )
                        created = proj_result.get("row", {})
                        effective_project_id = created.get("id", "standalone")
                        # Add to local list so subsequent files can match it.
                        if "id" in created:
                            projects.append(created)
                        logger.info(
                            "agent_runner: run=%s created project %r id=%s",
                            run_id, proj_name, effective_project_id,
                        )
                    except Exception as exc:
-                        logger.warning("agent_runner: run=%s create project failed: %s", run_log_id, exc)
+                        logger.warning(
                            "agent_runner: run=%s failed to create project %r: %s",
                            run_id, proj_name, exc,
                        )
                        effective_project_id = "standalone"
                        proj_name = "unknown"
                    project_context = (
@@ -600,6 +763,7 @@ async def run_local_agent(user_id: str, trigger_data: dict[str, Any], *, langfus
                        "Always set projectId to this id on every record you create."
                    )
                # "projects" domain is never passed to Step 2 — handled above in code.
                domains = [d for d in domains if d != "projects"]
                existing_blocks: list[str] = []
@@ -609,15 +773,14 @@ async def run_local_agent(user_id: str, trigger_data: dict[str, Any], *, langfus
                existing_context = "\n\n".join(existing_blocks)
-                system_prompt = tracing.compile_prompt(
+                step2_template, step2_prompt_obj = get_prompt_or_fallback(
-                    "batch_processing",
+                    "batch_processing", _BATCH_PROCESSING_PROMPT
-                    fallback=_PROCESSING_SYSTEM_PROMPT,
+                )
-                    variables={
+                system_prompt = step2_template.format(
-                        "existing_context": existing_context,
+                    existing_context=existing_context,
-                        "project_context": project_context,
+                    project_context=project_context,
-                        "data_types": ", ".join(domains),
+                    data_types=", ".join(domains),
-                        "custom_prompt_section": custom_section,
+                    custom_prompt_section=custom_section,
                    },
                )
                processing_tools = _build_processing_tools(domains)
@@ -630,22 +793,29 @@ async def run_local_agent(user_id: str, trigger_data: dict[str, Any], *, langfus
                    ),
                    tools=processing_tools,
                    max_steps=_MAX_PROCESSING_STEPS,
-                    langfuse_handler=langfuse_handler,
+                    user_id=user_id,
                    langfuse_prompt=step2_prompt_obj,
                    agent_name="step2-processor",
                )
                logger.info(
                    "agent_runner: run=%s file=%r result=%s",
-                    run_log_id, file_path, result_text[:200],
+                    run_id,
                    file_path,
                    result_text[:200],
                )
            except Exception as exc:
                errors.append(f"Error processing '{file_path}': {exc}")
-                logger.error("agent_runner: run=%s file=%r failed: %s", run_log_id, file_path, exc)
+                logger.error(
                    "agent_runner: run=%s file=%r failed: %s", run_id, file_path, exc
                )
    except Exception as exc:
        errors.append(f"Agent run failed: {exc}")
-        logger.error("agent_runner: run=%s failed: %s", run_log_id, exc)
+        logger.error("agent_runner: run=%s failed: %s", run_id, exc)
    finally:
        _running_agents.discard(agent_id)
        clear_client_executor()
    # ── Finalise ────────────────────────────────────────────────────
    if errors and items_processed == 0:
@@ -656,24 +826,32 @@ async def run_local_agent(user_id: str, trigger_data: dict[str, Any], *, langfus
        final_status = "success"
    await _finalize_run(
-        run_log_id,
+        run_log,
        status=final_status,
        items_processed=items_processed,
        items_created=items_created,
        errors=errors,
    )
    logger.info(
        "agent_runner: run=%s done status=%s processed=%d errors=%d",
        run_id,
        final_status,
        items_processed,
        len(errors),
    )
-    # Notify Electron that the run is complete via Redis
+    # Notify Electron that the run is complete.
-    if run_context:
+    if run_context and device_mgr.is_online(user_id):
        try:
-            channel = ws_out_channel(user_id)
+            await device_mgr.send_frame(user_id, {
            await redis_client.publish(channel, json.dumps({
                "type": "run_complete",
                "run_context": run_context,
                "status": final_status,
-            }))
+            })
        except Exception as exc:
-            logger.warning("agent_runner: run=%s failed to send run_complete: %s", run_log_id, exc)
+            logger.warning(
                "agent_runner: run=%s failed to send run_complete: %s", run_id, exc
            )
 # ── Cloud agent runner ─────────────────────────────────────────────────────
@@ -681,41 +859,49 @@ async def run_local_agent(user_id: str, trigger_data: dict[str, Any], *, langfus
 _CLOUD_DEFAULT_LOOKBACK_DAYS: int = 7
-async def run_cloud_agent(user_id: str, config_id: str, *, langfuse_handler: Any | None = None) -> None:
+async def run_cloud_agent(
-    """Execute a cloud connector agent run.
+    user_id: str,
    config: CloudAgentConfig,
    run_log: AgentRunLog,
    device_mgr: DeviceConnectionManager,
 ) -> None:
    """Execute a cloud connector agent run end-to-end.
-    Loads the CloudAgentConfig from DB, decrypts OAuth tokens, fetches
+    Steps:
    messages from the provider, and runs LLM extraction.
-    set_current_user() must be called BEFORE this function.
+    1. Verify the user's device is online.
    2. Decrypt the stored OAuth token from ``config.oauth_token_encrypted``.
    3. Instantiate the provider client (Gmail or MS Graph).
    4. Fetch messages/emails since ``config.last_run_at`` (or 7 days ago for
       the first run) applying ``config.filter_config`` filters.
    5. For each message/email call the LLM to extract structured items.
    6. Push each item to Electron as an ``insert`` tool-call.
    7. If the provider refreshed its access token, re-encrypt and write it
       back to ``config.oauth_token_encrypted``.
    8. Persist the run outcome via ``_finalize_run``.
    """
-    from app.integrations import decrypt_token, encrypt_token, get_provider
+    run_id = run_log.id
-    async with async_session() as db:
+    # ── 1. Device online check ─────────────────────────────────────────
-        result = await db.execute(
+    if not device_mgr.is_online(user_id):
-            select(CloudAgentConfig).where(CloudAgentConfig.id == config_id)
+        logger.info(
            "agent_runner: skip cloud run=%s — no device online for user=%s",
            run_id,
            user_id,
        )
        await _finalize_run(
            run_log,
            status="error",
            errors=["No connected device — cloud agent results cannot be delivered"],
        )
        config = result.scalar_one_or_none()
        if config is None:
            logger.error("agent_runner: cloud config %s not found", config_id)
        return
-        # Create run log
+    # ── 2. Decrypt OAuth token ─────────────────────────────────────────
-        run_log = AgentRunLog(
+    from app.integrations import decrypt_token, encrypt_token, get_provider
            agent_id=config.id,
            agent_type="cloud",
            user_id=user_id,
            status="running",
        )
        db.add(run_log)
        await db.commit()
        await db.refresh(run_log)
        run_log_id = run_log.id
    # ── Decrypt OAuth token ────────────────────────────────────────
    if not config.oauth_token_encrypted:
        await _finalize_run(
-            run_log_id,
+            run_log,
            status="error",
            errors=[f"No OAuth token stored for cloud agent '{config.name}'"],
        )
@@ -724,21 +910,22 @@ async def run_cloud_agent(user_id: str, config_id: str, *, langfuse_handler: Any
    try:
        credentials_info = decrypt_token(config.oauth_token_encrypted)
    except ValueError as exc:
        logger.error("agent_runner: failed to decrypt OAuth token for agent %s: %s", config.id, exc)
        await _finalize_run(
-            run_log_id,
+            run_log,
            status="error",
            errors=[f"Failed to decrypt OAuth token: {exc}"],
        )
        return
-    # ── Instantiate provider ──────────────────────────────────────
+    # ── 3. Instantiate provider client ────────────────────────────────
    try:
        provider = get_provider(config.provider, credentials_info)
    except ValueError as exc:
-        await _finalize_run(run_log_id, status="error", errors=[str(exc)])
+        await _finalize_run(run_log, status="error", errors=[str(exc)])
        return
-    # ── Fetch messages ────────────────────────────────────────────
+    # ── 4. Fetch messages ─────────────────────────────────────────────
    since: datetime | None = config.last_run_at
    if since is None:
        since = datetime.now(timezone.utc) - timedelta(days=_CLOUD_DEFAULT_LOOKBACK_DAYS)
@@ -747,28 +934,32 @@ async def run_cloud_agent(user_id: str, config_id: str, *, langfuse_handler: Any
    errors: list[str] = []
    items_processed = 0
    items_created = 0
    try:
        if config.provider == "gmail":
-            raw_messages = await provider.fetch_messages(
+            raw_messages = await provider.fetch_messages(  # type: ignore[union-attr]
                filter_config=config.filter_config,
                since=since,
            )
        elif config.provider == "outlook":
-            raw_messages = await provider.fetch_emails(
+            raw_messages = await provider.fetch_emails(  # type: ignore[union-attr]
                filter_config=config.filter_config,
                since=since,
            )
        elif config.provider == "teams":
-            raw_messages = await provider.fetch_messages(
+            raw_messages = await provider.fetch_messages(  # type: ignore[union-attr]
                filter_config=config.filter_config,
                since=since,
            )
        else:
            raw_messages = []
    except RuntimeError as exc:
        logger.error(
            "agent_runner: provider fetch failed for cloud agent %s: %s", config.id, exc
        )
        await _finalize_run(
-            run_log_id,
+            run_log,
            status="error",
            errors=[f"Provider fetch failed: {exc}"],
            update_config_last_run=True,
@@ -778,11 +969,17 @@ async def run_cloud_agent(user_id: str, config_id: str, *, langfuse_handler: Any
        return
    logger.info(
-        "agent_runner: cloud agent %s fetched %d item(s) from %s",
+        "agent_runner: cloud agent %s fetched %d item(s) from %s for user=%s",
-        config.id, len(raw_messages), config.provider,
+        config.id,
        len(raw_messages),
        config.provider,
        user_id,
    )
-    # ── Extract + insert via LLM ─────────────────────────────────
+    # ── 5–6. Extract + insert via LLM with tools ─────────────────────
    executor = _make_agent_executor(user_id, device_mgr)
    set_client_executor(executor)
    try:
        processing_tools = _build_processing_tools(config.data_types)
        custom_section = (
@@ -797,15 +994,14 @@ async def run_cloud_agent(user_id: str, config_id: str, *, langfuse_handler: Any
                continue
            items_processed += 1
-            processing_prompt = tracing.compile_prompt(
+            cloud_template, cloud_prompt_obj = get_prompt_or_fallback(
-                "batch_cloud_processing",
+                "batch_cloud_processing", _BATCH_CLOUD_PROCESSING_PROMPT
-                fallback=_CLOUD_PROCESSING_PROMPT,
+            )
-                variables={
+            processing_prompt = cloud_template.format(
-                    "data_types": ", ".join(config.data_types),
+                data_types=", ".join(config.data_types),
-                    "project_context": "Determine the appropriate project from the message context.",
+                project_context="Determine the appropriate project from the message context.",
-                    "file_list": f"Message from {config.provider} (id: {msg.id})",
+                file_list=f"Message from {config.provider} (id: {msg.id})",
-                    "custom_prompt_section": custom_section,
+                custom_prompt_section=custom_section,
                },
            )
            try:
@@ -814,14 +1010,16 @@ async def run_cloud_agent(user_id: str, config_id: str, *, langfuse_handler: Any
                    user_message=f"Process this message content:\n\n{content_text[:8000]}",
                    tools=processing_tools,
                    max_steps=_MAX_PROCESSING_STEPS,
-                    langfuse_handler=langfuse_handler,
+                    user_id=user_id,
                    langfuse_prompt=cloud_prompt_obj,
                    agent_name="cloud-processor",
                )
            except Exception as exc:
                errors.append(f"LLM processing error for message {msg.id!r}: {exc}")
-    except Exception as exc:
+    finally:
-        errors.append(f"Agent run failed: {exc}")
+        clear_client_executor()
-    # ── Persist refreshed token ───────────────────────────────────
+    # ── 7. Persist refreshed token (if any) ───────────────────────────
    refreshed = getattr(provider, "refreshed_credentials", None)
    if refreshed:
        try:
@@ -834,11 +1032,16 @@ async def run_cloud_agent(user_id: str, config_id: str, *, langfuse_handler: Any
                if cfg_row:
                    cfg_row.oauth_token_encrypted = new_encrypted
                    await db.commit()
            logger.debug("agent_runner: refreshed OAuth token persisted for agent %s", config.id)
        except Exception as exc:
-            logger.warning("agent_runner: failed to persist refreshed token: %s", exc)
+            logger.warning(
                "agent_runner: failed to persist refreshed token for agent %s: %s",
                config.id,
                exc,
            )
-    # ── Finalise ──────────────────────────────────────────────────
+    # ── 8. Finalise ────────────────────────────────────────────────────
-    if errors and items_processed == 0:
+    if errors and items_created == 0:
        final_status = "error"
    elif errors:
        final_status = "partial"
@@ -846,22 +1049,50 @@ async def run_cloud_agent(user_id: str, config_id: str, *, langfuse_handler: Any
        final_status = "success"
    await _finalize_run(
-        run_log_id,
+        run_log,
        status=final_status,
        items_processed=items_processed,
-        items_created=0,
+        items_created=items_created,
        errors=errors,
        update_config_last_run=True,
        config_id=config.id,
        config_type="cloud",
    )
    logger.info(
        "agent_runner: cloud run=%s done status=%s processed=%d created=%d errors=%d",
        run_id,
        final_status,
        items_processed,
        items_created,
        len(errors),
    )
 # ── Pending-run trigger ─────────────────────────────────────────────────────
 async def trigger_pending_runs(
    user_id: str,
    device_id: str,
    device_mgr: DeviceConnectionManager,
 ) -> None:
    """Record that the pending-run scan is skipped for a connecting device.

    No agent runs are dispatched here: the function only emits an info log
    naming the user and device, then returns.  ``device_mgr`` is accepted
    but not used by the current body.

    NOTE(review): the previous docstring stated this is scheduled as a
    background task from the device WS endpoint on ``device_hello`` — the
    call site is not visible here, confirm against the caller.
    """
    skip_template = (
        "agent_runner: pending-run scan skipped for user=%s device=%s (client-owned agent config)"
    )
    logger.info(skip_template, user_id, device_id)
 # ── Internal helper ─────────────────────────────────────────────────────────
 async def _finalize_run(
-    run_log_id: int | str,
+    run_log: AgentRunLog,
    *,
    status: str,
    items_processed: int = 0,
@@ -871,18 +1102,11 @@ async def _finalize_run(
    config_id: str | None = None,
    config_type: str | None = None,
 ) -> None:
-    """Persist the run outcome and optionally update last_run_at on the config."""
+    """Persist the run outcome and optionally update ``last_run_at`` on the config."""
    now = datetime.now(timezone.utc)
    try:
        async with async_session() as db:
-            result = await db.execute(
+            managed = await db.merge(run_log)
                select(AgentRunLog).where(AgentRunLog.id == run_log_id)
            )
            managed = result.scalar_one_or_none()
            if managed is None:
                logger.warning("agent_runner: run_log %s not found for finalization", run_log_id)
                return
            managed.status = status
            managed.items_processed = items_processed
            managed.items_created = items_created
@@ -907,4 +1131,6 @@ async def _finalize_run(
            await db.commit()
    except Exception as exc:
-        logger.error("agent_runner: failed to finalize run_log=%s: %s", run_log_id, exc)
+        logger.error(
            "agent_runner: failed to finalize run_log=%s: %s", run_log.id, exc
        )
--- a/services/chat/app/deep_agent.py
+++ b/services/chat/app/deep_agent.py
@@ -1,8 +1,4 @@
-"""Single-agent runners for home and floating chat contexts.
+"""Single-agent runners for home and floating chat contexts."""
 Adapted from app/core/deep_agent.py for the Chat Service.
 Import paths changed to use local app modules and shared/.
 """
 from __future__ import annotations
@@ -16,22 +12,23 @@ from typing import Any, Literal
 from langchain_core.messages import AIMessage, HumanMessage, SystemMessage, ToolMessage
 from langchain_core.tools import tool
-from shared.agents.note_agent import NOTE_TOOLS
+from app.agents.note_agent import NOTE_TOOLS
-from shared.agents.project_agent import PROJECT_TOOLS
+from app.agents.project_agent import PROJECT_TOOLS
-from shared.agents.task_agent import TASK_TOOLS
+from app.agents.task_agent import TASK_TOOLS
-from shared.agents.timeline_agent import TIMELINE_TOOLS
+from app.agents.timeline_agent import TIMELINE_TOOLS
-from shared.llm import get_llm
+from app.core.langfuse_client import extract_usage, get_langfuse, get_prompt_or_fallback
-from app.memory_middleware import MemoryMiddleware
+from app.core.llm import get_llm
-from shared.ws_context import clear_tool_result_collector, execute_on_client, set_tool_result_collector
+from app.config.settings import settings
-from app import tracing
+from app.core.memory_middleware import MemoryMiddleware
-from shared.db import async_session
+from app.core.ws_context import clear_tool_result_collector, execute_on_client, set_tool_result_collector
 from app.db import async_session
 logger = logging.getLogger(__name__)
 FloatingDomainType = Literal["task", "timeline", "project", "node"]
 FloatingDomainSection = Literal["task", "timeline", "note"]
-_HOME_SINGLE_AGENT_SYSTEM = (
+_HOME_SYSTEM_PROMPT = (
    "You are the home assistant with direct access to all tools: tasks, projects, notes, timelines, and memory tools. "
    "Always use tools for factual data retrieval before answering. "
    "When the user asks to remember, forget, or update what you know about them, use memory tools. "
@@ -44,7 +41,7 @@ _HOME_SINGLE_AGENT_SYSTEM = (
    "For upcoming tasks, after tag lines add a short recommendation based on due date and priority."
 )
-_FLOATING_SINGLE_AGENT_SYSTEM = (
+_FLOATING_SYSTEM_PROMPT = (
    "You are the floating assistant with direct access to all tools: tasks, projects, notes, timelines, and memory tools. "
    "Stay focused on the floating scope in context.scope and answer concisely. "
    "Return plain text only. Do not output XML/HTML-like tags such as <task>, <project>, <note>, <timeline>, or any bracketed id tag wrappers. "
@@ -53,7 +50,7 @@ _FLOATING_SINGLE_AGENT_SYSTEM = (
    "If context.context.resolved_project_id exists, use it as project_id for scoped list calls. "
 )
-_FLOATING_DOMAIN_CLASSIFIER_SYSTEM = (
+_FLOATING_DOMAIN_CLASSIFIER_PROMPT = (
    "You are a strict domain classifier for websocket floating requests. "
    "Return ONLY a JSON object with keys: type, id, section. "
    "Allowed type values: task, timeline, project, node. "
@@ -245,6 +242,7 @@ def _strip_floating_markup(text: str) -> str:
        return text
    cleaned = _strip_floating_markup_fragment(text)
    # Collapse excessive spaces introduced by tag/id removal while preserving lines.
    lines = [re.sub(r"[ \t]{2,}", " ", line).strip() for line in cleaned.splitlines()]
    return "\n".join(line for line in lines if line)
@@ -283,6 +281,7 @@ class _FloatingStreamSanitizer:
        return _strip_floating_markup_fragment(safe_text)
    def finalize(self) -> str:
        # Drop dangling unfinished wrappers at the very end.
        tail = re.sub(r"<[^>\n]*$", "", self._pending)
        tail = re.sub(r"\[[^\]\n]*$", "", tail)
        self._pending = ""
@@ -528,9 +527,7 @@ def _infer_floating_domain_rule_based(message: str, context: dict[str, Any]) ->
    return {"type": "task", "id": None, "section": None}
-async def _infer_floating_domain(
+async def _infer_floating_domain(message: str, context: dict[str, Any]) -> dict[str, str | None]:
    message: str, context: dict[str, Any], *, langfuse_handler: Any | None = None,
 ) -> dict[str, str | None]:
    resolved_project_id = context.get("resolved_project_id") if isinstance(context, dict) else None
    project_id = resolved_project_id if isinstance(resolved_project_id, str) and resolved_project_id else None
@@ -540,14 +537,9 @@ async def _infer_floating_domain(
    }
    try:
-        classifier_prompt = _get_system_prompt(
+        llm = get_llm()
-            "floating_domain_classifier", _FLOATING_DOMAIN_CLASSIFIER_SYSTEM,
+        classifier_messages = [
-        )
+            SystemMessage(content=_FLOATING_DOMAIN_CLASSIFIER_PROMPT),
        callbacks = _build_callbacks(langfuse_handler)
        llm = get_llm(callbacks=callbacks)
        response = await llm.ainvoke(
            [
                SystemMessage(content=classifier_prompt),
            HumanMessage(
                content=(
                    f"Message:\n{message}\n\n"
@@ -555,7 +547,22 @@ async def _infer_floating_domain(
                )
            ),
        ]
        lf = get_langfuse()
        _, classifier_prompt_obj = get_prompt_or_fallback(
            "floating_domain_classifier", _FLOATING_DOMAIN_CLASSIFIER_PROMPT
        )
        if lf:
            with lf.start_as_current_observation(
                as_type="generation",
                name="floating-classifier",
                model=settings.LLM_MODEL,
                prompt=classifier_prompt_obj,
                input=classifier_messages,
            ) as gen:
                response = await llm.ainvoke(classifier_messages)
                gen.update(output=_as_text(response.content), usage=extract_usage(response))
        else:
            response = await llm.ainvoke(classifier_messages)
        parsed = _parse_json_object(_as_text(response.content))
        if parsed is not None:
            domain = _normalize_domain_payload(parsed, project_id)
@@ -573,19 +580,6 @@ async def _infer_floating_domain(
    return _infer_floating_domain_rule_based(message, context)
 def _get_system_prompt(langfuse_name: str, fallback: str) -> str:
    """Fetch a managed prompt from Langfuse, falling back to the hardcoded string."""
    managed = tracing.get_prompt(langfuse_name, fallback=None)
    return managed if managed is not None else fallback
 def _build_callbacks(langfuse_handler: Any | None) -> list[Any] | None:
    """Return a callbacks list if a Langfuse handler is available."""
    if langfuse_handler is None:
        return None
    return [langfuse_handler]
 async def _run_single_agent(
    *,
    user_id: str,
@@ -593,11 +587,12 @@ async def _run_single_agent(
    message: str,
    context: dict[str, Any],
    max_steps: int = 6,
-    langfuse_handler: Any | None = None,
+    langfuse_prompt: Any = None,
    agent_name: str = "agent",
 ) -> str:
    trace_id = _trace_id_from_context(context)
-    callbacks = _build_callbacks(langfuse_handler)
+    lf = get_langfuse()
-    llm = get_llm(callbacks=callbacks)
+    llm = get_llm()
    tools = _all_tools_for_user(user_id, trace_id)
    model_context = _context_for_model(context)
    logger.info("deep_agent: run_single_agent_start trace=%s user=%s", trace_id or "-", user_id)
@@ -615,9 +610,37 @@ async def _run_single_agent(
    tool_calls_count = 0
    collected: list[dict[str, Any]] = []
    set_tool_result_collector(collected)
    _span_ctx = (
        lf.start_as_current_observation(
            as_type="span",
            name=agent_name,
            user_id=user_id,
            session_id=trace_id,
            input=message,
        )
        if lf else None
    )
    _span = _span_ctx.__enter__() if _span_ctx else None
    try:
        for _ in range(max_steps):
            _gen_ctx = (
                lf.start_as_current_observation(
                    as_type="generation",
                    name=f"{agent_name}-llm",
                    model=settings.LLM_MODEL,
                    prompt=langfuse_prompt,
                    input=messages,
                )
                if lf else None
            )
            _gen = _gen_ctx.__enter__() if _gen_ctx else None
            response: AIMessage = await llm_with_tools.ainvoke(messages)
            if _gen_ctx:
                _gen.update(output=_as_text(response.content), usage=extract_usage(response))
                _gen_ctx.__exit__(None, None, None)
            messages.append(response)
            if not response.tool_calls:
@@ -629,6 +652,8 @@ async def _run_single_agent(
                    tool_calls_count,
                    len(final_text),
                )
                if _span:
                    _span.update(output=final_text)
                return final_text
            tool_map = {tool_def.name: tool_def for tool_def in tools}
@@ -668,9 +693,15 @@ async def _run_single_agent(
            tool_calls_count,
            len(final_text),
        )
        if _span:
            _span.update(output=final_text)
        return final_text
    finally:
        clear_tool_result_collector()
        if _span_ctx:
            _span_ctx.__exit__(None, None, None)
        if lf:
            lf.flush()
 async def _run_single_agent_stream(
@@ -680,11 +711,12 @@ async def _run_single_agent_stream(
    message: str,
    context: dict[str, Any],
    max_steps: int = 6,
-    langfuse_handler: Any | None = None,
+    langfuse_prompt: Any = None,
    agent_name: str = "agent",
 ) -> AsyncGenerator[tuple[str, Any], None]:
    trace_id = _trace_id_from_context(context)
-    callbacks = _build_callbacks(langfuse_handler)
+    lf = get_langfuse()
-    llm = get_llm(callbacks=callbacks)
+    llm = get_llm()
    tools = _all_tools_for_user(user_id, trace_id)
    model_context = _context_for_model(context)
    logger.info("deep_agent: run_single_agent_stream_start trace=%s user=%s", trace_id or "-", user_id)
@@ -703,9 +735,38 @@ async def _run_single_agent_stream(
    streamed_chars = 0
    collected: list[dict[str, Any]] = []
    set_tool_result_collector(collected)
    _span_ctx = (
        lf.start_as_current_observation(
            as_type="span",
            name=f"{agent_name}-stream",
            user_id=user_id,
            session_id=trace_id,
            input=message,
        )
        if lf else None
    )
    _span = _span_ctx.__enter__() if _span_ctx else None
    streamed_text: list[str] = []
    try:
        for _ in range(max_steps):
            _gen_ctx = (
                lf.start_as_current_observation(
                    as_type="generation",
                    name=f"{agent_name}-llm",
                    model=settings.LLM_MODEL,
                    prompt=langfuse_prompt,
                    input=messages,
                )
                if lf else None
            )
            _gen = _gen_ctx.__enter__() if _gen_ctx else None
            response: AIMessage = await llm_with_tools.ainvoke(messages)
            if _gen_ctx:
                _gen.update(output=_as_text(response.content), usage=extract_usage(response))
                _gen_ctx.__exit__(None, None, None)
            messages.append(response)
            if not response.tool_calls:
@@ -714,13 +775,16 @@ async def _run_single_agent_stream(
                    token = _as_text(getattr(chunk, "content", ""))
                    if token:
                        streamed_chars += len(token)
                        streamed_text.append(token)
                        emitted_any = True
                        yield "token", token
                # Some providers return final text in `response.content` but stream no chunks.
                if not emitted_any:
                    fallback_text = _as_text(response.content)
                    if fallback_text:
                        streamed_chars += len(fallback_text)
                        streamed_text.append(fallback_text)
                        yield "token", fallback_text
                logger.info(
                    "deep_agent: run_single_agent_stream_end trace=%s user=%s tool_calls=%d response_chars=%d",
@@ -729,6 +793,8 @@ async def _run_single_agent_stream(
                    tool_calls_count,
                    streamed_chars,
                )
                if _span:
                    _span.update(output="".join(streamed_text))
                return
            tool_map = {tool_def.name: tool_def for tool_def in tools}
@@ -763,6 +829,7 @@ async def _run_single_agent_stream(
            token = _as_text(getattr(chunk, "content", ""))
            if token:
                streamed_chars += len(token)
                streamed_text.append(token)
                yield "token", token
        logger.info(
            "deep_agent: run_single_agent_stream_end trace=%s user=%s tool_calls=%d response_chars=%d fallback=1",
@@ -771,33 +838,45 @@ async def _run_single_agent_stream(
            tool_calls_count,
            streamed_chars,
        )
        if _span:
            _span.update(output="".join(streamed_text))
    finally:
        clear_tool_result_collector()
        if _span_ctx:
            _span_ctx.__exit__(None, None, None)
        if lf:
            lf.flush()
-async def run_home(user_id: str, message: str, context: dict[str, Any], *, langfuse_handler: Any | None = None) -> str:
+async def run_home(user_id: str, message: str, context: dict[str, Any]) -> str:
    prepared_context = await _prepare_context(message, context)
-    system_prompt = _get_system_prompt("home_system", _HOME_SINGLE_AGENT_SYSTEM)
+    system_prompt, langfuse_prompt = get_prompt_or_fallback(
        "home_system", _HOME_SYSTEM_PROMPT
    )
    response = await _run_single_agent(
        user_id=user_id,
        system_prompt=system_prompt,
        message=message,
        context=prepared_context,
-        langfuse_handler=langfuse_handler,
+        langfuse_prompt=langfuse_prompt,
        agent_name="home-agent",
    )
    return _normalize_tagged_list_lines(response, message)
-async def run_floating(user_id: str, message: str, context: dict[str, Any], *, langfuse_handler: Any | None = None) -> tuple[str, dict[str, str | None]]:
+async def run_floating(user_id: str, message: str, context: dict[str, Any]) -> tuple[str, dict[str, str | None]]:
    prepared_context = await _prepare_context(message, context)
-    domain = await _infer_floating_domain(message, prepared_context, langfuse_handler=langfuse_handler)
+    domain = await _infer_floating_domain(message, prepared_context)
-    system_prompt = _get_system_prompt("floating_system", _FLOATING_SINGLE_AGENT_SYSTEM)
+    system_prompt, langfuse_prompt = get_prompt_or_fallback(
        "floating_system", _FLOATING_SYSTEM_PROMPT
    )
    response = await _run_single_agent(
        user_id=user_id,
        system_prompt=system_prompt,
        message=message,
        context=prepared_context,
-        langfuse_handler=langfuse_handler,
+        langfuse_prompt=langfuse_prompt,
        agent_name="floating-agent",
    )
    sanitized = _strip_floating_markup(response)
    if not sanitized and response:
@@ -809,18 +888,19 @@ async def run_home_stream(
    user_id: str,
    message: str,
    context: dict[str, Any],
    *,
    langfuse_handler: Any | None = None,
 ) -> AsyncGenerator[tuple[str, Any], None]:
    prepared_context = await _prepare_context(message, context)
-    system_prompt = _get_system_prompt("home_system", _HOME_SINGLE_AGENT_SYSTEM)
+    system_prompt, langfuse_prompt = get_prompt_or_fallback(
        "home_system", _HOME_SYSTEM_PROMPT
    )
    text_chunks: list[str] = []
    async for event in _run_single_agent_stream(
        user_id=user_id,
        system_prompt=system_prompt,
        message=message,
        context=prepared_context,
-        langfuse_handler=langfuse_handler,
+        langfuse_prompt=langfuse_prompt,
        agent_name="home-agent",
    ):
        event_type, data = event
        if event_type != "token":
@@ -837,14 +917,14 @@ async def run_floating_stream(
    user_id: str,
    message: str,
    context: dict[str, Any],
    *,
    langfuse_handler: Any | None = None,
 ) -> AsyncGenerator[tuple[str, Any], None]:
    prepared_context = await _prepare_context(message, context)
-    domain = await _infer_floating_domain(message, prepared_context, langfuse_handler=langfuse_handler)
+    domain = await _infer_floating_domain(message, prepared_context)
    yield "floating_domain", domain
-    system_prompt = _get_system_prompt("floating_system", _FLOATING_SINGLE_AGENT_SYSTEM)
+    system_prompt, langfuse_prompt = get_prompt_or_fallback(
        "floating_system", _FLOATING_SYSTEM_PROMPT
    )
    sanitizer = _FloatingStreamSanitizer()
    emitted_sanitized = False
    raw_chunks: list[str] = []
@@ -853,7 +933,8 @@ async def run_floating_stream(
        system_prompt=system_prompt,
        message=message,
        context=prepared_context,
-        langfuse_handler=langfuse_handler,
+        langfuse_prompt=langfuse_prompt,
        agent_name="floating-agent",
    ):
        event_type, data = event
        if event_type != "token":
--- a/app/core/device_manager.py
+++ b/app/core/device_manager.py
@@ -0,0 +1,151 @@
 """Device connection manager.
 Maintains in-memory state for all active Electron → backend WebSocket
 connections.  One connection per user (latest replaces previous).
 The manager handles the **tool-call round-trip** pattern:
  - Backend sends ``tool_call`` frame → Electron executes the action →
    returns ``tool_result`` frame.
  - ``create_pending_call`` registers a Future keyed by ``call_id``.
  - ``resolve_pending_call`` fulfils the Future; callers awaiting it
    receive the result dict from Electron.
 This pattern is used by all tools (CRUD, file-system, etc.) via
 ``execute_on_client()`` in ``ws_context.py``.
 The ``device_manager`` module-level singleton is imported by both the
 device WS route and the agent runner.
 """
 from __future__ import annotations
 import asyncio
 import json
 import logging
 from dataclasses import dataclass, field
 from fastapi import WebSocket
 logger = logging.getLogger(__name__)
@dataclass
 class DeviceConnection:
    """State for a single connected Electron device.

    Instances are created by ``DeviceConnectionManager.register`` and hold
    everything the manager tracks for one live connection.
    """
    # WebSocket the manager writes JSON frames to (see ``send_frame``).
    ws: WebSocket
    # Identifier of the connected device, compared in ``is_online``.
    device_id: str
    # Futures indexed by tool_call id — resolved when tool_result arrives.
    # A fresh dict per instance (``default_factory``), never shared.
    pending_calls: dict[str, asyncio.Future[dict]] = field(default_factory=dict)
 class DeviceConnectionManager:
    """Registry of active Electron WebSocket connections, keyed by user id.

    At most one connection is kept per user; registering again replaces the
    previous one and cancels the futures that were waiting on it.

    Thread/task safety note: asyncio is single-threaded by design.  All
    mutations happen inside await-points on the main event loop, so no
    locking is required for the in-memory dicts.
    """

    def __init__(self) -> None:
        # user_id -> the currently active connection for that user.
        self._connections: dict[str, DeviceConnection] = {}

    @staticmethod
    def _cancel_pending(conn: DeviceConnection) -> None:
        """Cancel every not-yet-completed future still waiting on *conn*."""
        # Iterate over a snapshot so that cleanup callbacks attached to the
        # futures can never mutate the dict while it is being walked.
        for fut in tuple(conn.pending_calls.values()):
            if not fut.done():
                fut.cancel()

    # ── Registration ──────────────────────────────────────────────────
    def register(self, user_id: str, device_id: str, ws: WebSocket) -> None:
        """Store the active connection for *user_id*, replacing any previous one.

        Futures still pending on a replaced connection are cancelled, so
        their awaiters see ``asyncio.CancelledError``.
        """
        old = self._connections.get(user_id)
        if old is not None:
            logger.info(
                "device_manager: replacing existing connection for user=%s device=%s",
                user_id,
                old.device_id,
            )
            # Cancel any futures that were waiting on the old connection.
            self._cancel_pending(old)
        self._connections[user_id] = DeviceConnection(ws=ws, device_id=device_id)
        logger.info(
            "device_manager: registered user=%s device=%s", user_id, device_id
        )

    def unregister(self, user_id: str) -> None:
        """Remove the connection for *user_id* and cancel any pending futures.

        No-ops if the user has no registered connection.
        """
        conn = self._connections.pop(user_id, None)
        if conn is None:
            return
        self._cancel_pending(conn)
        logger.info("device_manager: unregistered user=%s", user_id)

    # ── Presence queries ──────────────────────────────────────────────
    def get_ws(self, user_id: str) -> WebSocket | None:
        """Return the active WebSocket for *user_id*, or ``None`` if offline."""
        conn = self._connections.get(user_id)
        return conn.ws if conn else None

    def is_online(self, user_id: str, device_id: str | None = None) -> bool:
        """Return ``True`` if the user has an active connection.

        If *device_id* is provided also checks that it matches the connected device.
        """
        conn = self._connections.get(user_id)
        if conn is None:
            return False
        return device_id is None or conn.device_id == device_id

    # ── Frame sending ─────────────────────────────────────────────────
    async def send_frame(self, user_id: str, frame: dict) -> None:
        """Send *frame* as a JSON text message to the device.

        Raises ``RuntimeError`` if the user is not connected.  Errors raised
        by the WebSocket itself are propagated unchanged.
        """
        conn = self._connections.get(user_id)
        if conn is None:
            raise RuntimeError(
                f"send_frame: user {user_id!r} is not connected"
            )
        await conn.ws.send_text(json.dumps(frame))

    # ── Tool-call round-trip ──────────────────────────────────────────
    def create_pending_call(
        self, user_id: str, call_id: str
    ) -> asyncio.Future[dict]:
        """Register a Future that will be resolved when the tool_result arrives.

        The entry is dropped from the connection's ``pending_calls`` as soon
        as the future completes for any reason (result, cancellation by a
        timed-out caller, ...), so abandoned calls do not accumulate for the
        lifetime of the connection.

        Raises ``RuntimeError`` if the user is not connected.
        """
        conn = self._connections.get(user_id)
        if conn is None:
            raise RuntimeError(
                f"create_pending_call: user {user_id!r} is not connected"
            )
        try:
            # Preferred inside coroutines/callbacks: binds to the loop that
            # will actually await the future.
            loop = asyncio.get_running_loop()
        except RuntimeError:
            # No loop is running (synchronous caller): keep the previous
            # behaviour instead of failing.
            loop = asyncio.get_event_loop()
        fut: asyncio.Future[dict] = loop.create_future()
        pending = conn.pending_calls
        pending[call_id] = fut

        def _forget(done_fut: asyncio.Future[dict]) -> None:
            # Only remove our own entry: the id may already have been popped
            # by ``resolve_pending_call`` or re-registered with a new future.
            if pending.get(call_id) is done_fut:
                del pending[call_id]

        fut.add_done_callback(_forget)
        return fut

    def resolve_pending_call(
        self, user_id: str, call_id: str, result: dict
    ) -> None:
        """Fulfil the Future registered under *call_id* with the Electron result.

        No-ops if the user is offline, or if the call_id is unknown or its
        future is already done (already timed out or cancelled).
        """
        conn = self._connections.get(user_id)
        if conn is None:
            return
        fut = conn.pending_calls.pop(call_id, None)
        if fut is not None and not fut.done():
            fut.set_result(result)
 # Module-level singleton — import this everywhere.
 device_manager = DeviceConnectionManager()
--- a/app/core/langfuse_client.py
+++ b/app/core/langfuse_client.py
@@ -0,0 +1,114 @@
 """Langfuse observability — singleton client and prompt helpers.
 If LANGFUSE_SECRET_KEY / LANGFUSE_PUBLIC_KEY are not set,
 all helpers are no-ops so the app works without Langfuse configured.
 Usage
 -----
 Tracing::
    from app.core.langfuse_client import get_langfuse
    lf = get_langfuse()
    if lf:
        with lf.start_as_current_observation(as_type="span", name="my-agent") as span:
            span.update(input=user_message)
            # ... do work ...
            span.update(output=result)
        lf.flush()
 Prompt management::
    from app.core.langfuse_client import get_prompt_or_fallback
    text, prompt_obj = get_prompt_or_fallback("home_system", FALLBACK_PROMPT)
    # Use text as the system prompt; pass prompt_obj to generations for linking.
 Linking a prompt to a generation::
    with lf.start_as_current_observation(
        as_type="generation",
        name="llm-call",
        model="gpt-4o",
        prompt=prompt_obj,   # links generation → prompt version in the UI
        input=messages,
    ) as gen:
        response = await llm.ainvoke(messages)
        gen.update(output=response.content, usage=_usage(response))
 """
 from __future__ import annotations
 import logging
 from typing import Any
 # Module-level logger named after this module.
 logger = logging.getLogger(__name__)
 # Cached Langfuse client returned by ``get_langfuse()``; stays ``None`` when
 # Langfuse is not configured or its construction failed.
 _client: Any = None
 # Flipped to True on the first ``get_langfuse()`` call, before any attempt to
 # build the client, so a missing or broken configuration is not retried.
 _initialized: bool = False
 def get_langfuse() -> Any | None:
    """Return the process-wide Langfuse client, or ``None`` when unavailable.

    The first call decides the outcome and caches it in the module globals;
    later calls return the cached value without re-reading the settings.
    ``None`` is returned when either Langfuse key is empty or when building
    the client raises.
    """
    global _client, _initialized
    if _initialized:
        return _client
    _initialized = True

    from app.config.settings import settings  # local import to avoid circular deps

    if not (settings.LANGFUSE_SECRET_KEY and settings.LANGFUSE_PUBLIC_KEY):
        logger.debug("langfuse: not configured — observability disabled")
        return None

    try:
        from langfuse import Langfuse

        built = Langfuse(
            secret_key=settings.LANGFUSE_SECRET_KEY,
            public_key=settings.LANGFUSE_PUBLIC_KEY,
            host=settings.LANGFUSE_HOST,
        )
        logger.info("langfuse: client initialized host=%s", settings.LANGFUSE_HOST)
    except Exception as exc:
        # Observability is optional: log and carry on without a client.
        logger.warning("langfuse: failed to initialize: %s", exc)
        built = None
    _client = built
    return _client
 def get_prompt_or_fallback(name: str, fallback: str) -> tuple[str, Any]:
    """Look up the ``production`` prompt *name* in Langfuse, else use *fallback*.

    Returns a ``(prompt_text, prompt_obj)`` pair:

    * ``prompt_text`` — the prompt object's ``.prompt`` attribute when that is
      a string, otherwise *fallback*.  The text is returned as-is; no variable
      substitution is done here.
    * ``prompt_obj`` — the object returned by ``get_prompt``, or ``None`` when
      no Langfuse client is available or the lookup raised.
    """
    client = get_langfuse()
    if client is None:
        return fallback, None
    try:
        prompt_obj = client.get_prompt(name, label="production", fallback=fallback)
        # For text-type prompts .prompt holds the raw template string.
        template = getattr(prompt_obj, "prompt", None)
        text = template if isinstance(template, str) else fallback
        return text, prompt_obj
    except Exception as exc:
        logger.warning("langfuse: get_prompt %r failed: %s — using fallback", name, exc)
        return fallback, None
 def extract_usage(response: Any) -> dict[str, int]:
    """Extract token usage from a LangChain AI message into Langfuse format.

    Args:
        response: Any object; its ``usage_metadata`` attribute is read if
            present and is expected to be a mapping with ``input_tokens``,
            ``output_tokens`` and ``total_tokens`` entries.

    Returns:
        ``{"input": ..., "output": ..., "total": ...}`` as ints, or an empty
        dict when ``usage_metadata`` is missing or empty.  A counter that is
        absent or ``None`` is reported as ``0``.
    """
    meta = getattr(response, "usage_metadata", None)
    if not meta:
        return {}
    # ``or 0`` guards against a counter reported as None: int(None) would raise
    # TypeError, and a tracing helper must not break the LLM call path.
    return {
        label: int(meta.get(f"{label}_tokens") or 0)
        for label in ("input", "output", "total")
    }
--- a/app/core/llm.py
+++ b/app/core/llm.py
@@ -0,0 +1,125 @@
 """LLM factory — centralised model instantiation via LiteLLM.
 Every agent and the orchestrator call ``get_llm()`` or ``get_router_llm()``
 instead of directly constructing a provider-specific class.  The model string
 follows the `LiteLLM model naming convention
 <https://docs.litellm.ai/docs/providers>`_:
 * OpenAI:     ``gpt-4o``, ``gpt-4o-mini``
 * Anthropic:  ``anthropic/claude-3.5-sonnet``
 * Google:     ``gemini/gemini-pro``
 * Ollama:     ``ollama/llama3``
 * Bedrock:    ``bedrock/anthropic.claude-v2``
 Switch providers by changing **LLM_MODEL** / **LLM_ROUTER_MODEL** in ``.env``
 — no code changes required.
 """
 from __future__ import annotations
 import os
 import warnings
 from openai import AsyncOpenAI
 import litellm
 from langchain_openai import ChatOpenAI
 from langchain_litellm import ChatLiteLLM
 from litellm import get_supported_openai_params  # noqa: F401 – validates install
 from app.config.settings import settings
 # Some models (e.g. gpt-5, o-series) reject unsupported params like temperature.
 # Drop them silently instead of raising UnsupportedParamsError.
 litellm.drop_params = True
 # Some provider responses include a plain dict in the `usage` field where a
 # richer Pydantic model is expected. This warning is noisy but non-fatal.
 warnings.filterwarnings(
    "ignore",
    message=r"PydanticSerializationUnexpectedValue\(Expected `ResponseAPIUsage`",
    category=UserWarning,
 )
 def _api_key_for_model(model: str) -> str | None:
    """Pick the API key from settings that matches *model*'s provider prefix.

    Empty keys are normalised to ``None``.  Models without a recognised prefix
    (for example plain ``gpt-4o``) use the OpenAI key.
    """
    if model.startswith("github_copilot/"):
        # GitHub Copilot uses OAuth device-flow tokens managed by LiteLLM.
        # No API key is required; returning None lets LiteLLM handle auth.
        return None
    if model.startswith("anthropic/"):
        key = settings.ANTHROPIC_API_KEY
    elif model.startswith(("gemini/", "google/")):
        key = settings.GOOGLE_API_KEY
    elif model.startswith("cerebras/"):
        key = settings.CEREBRAS_API_KEY
    else:
        # Default: OpenAI-compatible (covers plain model names like "gpt-4o")
        key = settings.OPENAI_API_KEY
    return key or None
 def get_llm(
    *,
    model: str | None = None,
    temperature: float = 0,
 ) -> ChatOpenAI | ChatLiteLLM:
    """Build the LangChain chat model for *model*.

    Model strings containing a ``/`` (``anthropic/…``, ``gemini/…``,
    ``github_copilot/…``) are handed to ``ChatLiteLLM`` so LiteLLM performs
    provider routing and auth.  Plain names such as ``gpt-4o`` get a
    ``ChatOpenAI`` instance with the key chosen by ``_api_key_for_model``.

    Side effect: when ``settings.GITHUB_COPILOT_TOKEN_DIR`` is set it is copied
    into the ``GITHUB_COPILOT_TOKEN_DIR`` environment variable unless that
    variable already has a value.

    Parameters
    ----------
    model:
        LiteLLM model identifier. Defaults to ``settings.LLM_MODEL``.
    temperature:
        Sampling temperature passed to the chat model.
    """
    chosen = model or settings.LLM_MODEL

    token_dir = settings.GITHUB_COPILOT_TOKEN_DIR
    if token_dir:
        # Point LiteLLM to the custom token directory when configured.
        os.environ.setdefault("GITHUB_COPILOT_TOKEN_DIR", token_dir)

    if "/" not in chosen:
        return ChatOpenAI(
            model=chosen,
            temperature=temperature,
            api_key=_api_key_for_model(chosen),
        )
    return ChatLiteLLM(model=chosen, temperature=temperature)
 def get_router_llm(
    *,
    temperature: float = 0,
 ) -> ChatOpenAI | ChatLiteLLM:
    """Build the chat model named by ``settings.LLM_ROUTER_MODEL``.

    Thin wrapper over ``get_llm`` for the intent-classification / routing
    model.
    """
    router_model = settings.LLM_ROUTER_MODEL
    return get_llm(model=router_model, temperature=temperature)
 async def embed(text: str) -> list[float]:
    """Compute the embedding vector of *text* with ``settings.LLM_EMBED_MODEL``.

    Provider-prefixed model names (anything containing ``/``, e.g.
    ``github_copilot/text-embedding-3-small``) go through
    ``litellm.aembedding``.  Plain names use an ``AsyncOpenAI`` client created
    with ``settings.OPENAI_API_KEY``.
    """
    model_name = settings.LLM_EMBED_MODEL
    if "/" not in model_name:
        # Plain OpenAI model name — use the raw AsyncOpenAI client.
        openai_client = AsyncOpenAI(api_key=settings.OPENAI_API_KEY)
        result = await openai_client.embeddings.create(model=model_name, input=text)
        return result.data[0].embedding
    # Provider-prefixed model (Copilot, Bedrock, etc.): LiteLLM applies the
    # provider's own auth mechanism.
    result = await litellm.aembedding(model=model_name, input=[text])
    return result.data[0]["embedding"]
--- a/services/chat/app/memory_middleware.py
+++ b/services/chat/app/memory_middleware.py
@@ -1,7 +1,19 @@
-"""Memory Middleware — adapted for Chat Service.
+"""Memory Middleware — enrich requests with memory context and store interactions.
-Uses shared.models instead of app.models. Otherwise identical to the
+Four-tier memory model (MemGPT-style):
-monolith's app/core/memory_middleware.py.
+  core         — persistent key/value user preferences, always injected
  associative  — semantic similarity search via pgvector (top-k)
  episodic     — recent session summaries (last N)
  proactive    — behavioral patterns above confidence threshold
 All memory content is encrypted at rest using the per-user Fernet key
 stored in User.encryption_key. Decryption happens in-memory only.
 Usage:
    memory = MemoryMiddleware(db_session)
    context = await memory.enrich_context(user_id, message)
    # ... run agent ...
    await memory.store_episode(user_id, session_id, message, response)
 """
 from __future__ import annotations
@@ -14,7 +26,7 @@ from cryptography.fernet import Fernet, InvalidToken
 from sqlalchemy import select
 from sqlalchemy.ext.asyncio import AsyncSession
-from shared.models import (
+from app.models import (
    MemoryAssociative,
    MemoryCore,
    MemoryEpisodic,
@@ -24,16 +36,20 @@ from shared.models import (
 logger = logging.getLogger(__name__)
 # Tuning constants
 _ASSOCIATIVE_TOP_K = 5
 _EPISODIC_RECENT_N = 10
 _PROACTIVE_CONFIDENCE_THRESHOLD = 0.6
 class MemoryMiddleware:
    """Enrich orchestrator context with memory and persist interactions after."""
    def __init__(self, db: AsyncSession) -> None:
        self._db = db
    # ── Public API ────────────────────────────────────────────────────────────
    async def enrich_context(
        self,
        user_id: str,
@@ -41,6 +57,14 @@ class MemoryMiddleware:
        trace_id: str | None = None,
        session_id: str | None = None,
    ) -> dict[str, Any]:
        """Build memory context dict to inject into the orchestrator before LLM call.
        Returns a dict with keys:
          core_memory        — {key: plaintext_value, ...}
          associative_memory — [plaintext_content, ...]  (top-k most recently updated)
          episodic_memory    — [plaintext_summary, ...]  (most recent N)
          proactive_hints    — [plaintext_pattern, ...]  (above threshold)
        """
        fernet = await self._get_fernet(user_id)
        if fernet is None:
            return {}
@@ -50,9 +74,16 @@ class MemoryMiddleware:
        episodic = await self._load_episodic(user_id, fernet, session_id=session_id)
        proactive = await self._load_proactive(user_id, fernet)
        user_dbg = await self._get_user_debug(user_id)
        logger.info(
-            "memory: enrich_context trace=%s user=%s core=%d assoc=%d episodic=%d proactive=%d",
+            "memory: enrich_context trace=%s user=%s tier=%s core=%d associative=%d episodic=%d proactive=%d",
-            trace_id or "-", user_id, len(core), len(associative), len(episodic), len(proactive),
+            trace_id or "-",
            user_id,
            user_dbg.get("tier") or "-",
            len(core),
            len(associative),
            len(episodic),
            len(proactive),
        )
        return {
@@ -63,9 +94,18 @@ class MemoryMiddleware:
        }
    async def store_episode(
-        self, user_id: str, session_id: str, message: str, response: str,
+        self,
        user_id: str,
        session_id: str,
        message: str,
        response: str,
        trace_id: str | None = None,
    ) -> None:
        """Summarise and store a completed interaction in episodic memory.
        The summary is a simple heuristic concatenation (no LLM call) to keep
        latency low. Full LLM summarisation can be added in a later step.
        """
        fernet = await self._get_fernet(user_id)
        if fernet is None:
            return
@@ -82,68 +122,113 @@ class MemoryMiddleware:
        self._db.add(row)
        try:
            await self._db.commit()
            user_dbg = await self._get_user_debug(user_id)
            logger.info(
                "memory: store_episode trace=%s user=%s tier=%s session=%s",
                trace_id or "-",
                user_id,
                user_dbg.get("tier") or "-",
                session_id,
            )
        except Exception as exc:
            logger.error("memory: store_episode failed user=%s: %s", user_id, exc)
            await self._db.rollback()
    async def update_core(self, user_id: str, key: str, value: str, trace_id: str | None = None) -> None:
        """Upsert a core memory key/value for a user."""
        fernet = await self._get_fernet(user_id)
        if fernet is None:
            return
        encrypted = _encrypt(fernet, value)
        result = await self._db.execute(
-            select(MemoryCore).where(MemoryCore.user_id == user_id, MemoryCore.key == key)
+            select(MemoryCore).where(
                MemoryCore.user_id == user_id,
                MemoryCore.key == key,
            )
        )
        existing = result.scalar_one_or_none()
        if existing is not None:
            existing.value_encrypted = encrypted
        else:
            self._db.add(MemoryCore(
-                id=str(uuid.uuid4()), user_id=user_id, key=key, value_encrypted=encrypted,
+                id=str(uuid.uuid4()),
                user_id=user_id,
                key=key,
                value_encrypted=encrypted,
            ))
        try:
            await self._db.commit()
            user_dbg = await self._get_user_debug(user_id)
            logger.info(
                "memory: update_core trace=%s user=%s tier=%s key=%s",
                trace_id or "-",
                user_id,
                user_dbg.get("tier") or "-",
                key,
            )
        except Exception as exc:
            logger.error("memory: update_core failed user=%s key=%s: %s", user_id, key, exc)
            await self._db.rollback()
    async def list_core_blocks(self, user_id: str) -> list[dict[str, str]]:
        """Return core memory as editable blocks (label/value)."""
        fernet = await self._get_fernet(user_id)
        if fernet is None:
            return []
        result = await self._db.execute(
-            select(MemoryCore).where(MemoryCore.user_id == user_id).order_by(MemoryCore.key.asc())
+            select(MemoryCore)
            .where(MemoryCore.user_id == user_id)
            .order_by(MemoryCore.key.asc())
        )
        rows = result.scalars().all()
        out: list[dict[str, str]] = []
-        for row in result.scalars().all():
+        for row in rows:
            plaintext = _safe_decrypt(fernet, row.value_encrypted)
            if plaintext is not None:
                out.append({"label": row.key, "value": plaintext})
        logger.debug("memory: list_core_blocks user=%s count=%d", user_id, len(out))
        return out
    async def get_core_block(self, user_id: str, label: str) -> str | None:
        """Return a single core memory block value by label."""
        fernet = await self._get_fernet(user_id)
        if fernet is None:
            return None
        result = await self._db.execute(
-            select(MemoryCore).where(MemoryCore.user_id == user_id, MemoryCore.key == label)
+            select(MemoryCore).where(
                MemoryCore.user_id == user_id,
                MemoryCore.key == label,
            )
        )
        row = result.scalar_one_or_none()
        if row is None:
            logger.debug("memory: get_core_block user=%s label=%s found=0", user_id, label)
            return None
-        return _safe_decrypt(fernet, row.value_encrypted)
+        value = _safe_decrypt(fernet, row.value_encrypted)
        logger.debug("memory: get_core_block user=%s label=%s found=%d", user_id, label, 1 if value is not None else 0)
        return value
    async def delete_core(self, user_id: str, label: str) -> bool:
        """Delete a core memory block by label. Returns True if deleted."""
        result = await self._db.execute(
-            select(MemoryCore).where(MemoryCore.user_id == user_id, MemoryCore.key == label)
+            select(MemoryCore).where(
                MemoryCore.user_id == user_id,
                MemoryCore.key == label,
            )
        )
        row = result.scalar_one_or_none()
        if row is None:
            logger.debug("memory: delete_core user=%s label=%s found=0", user_id, label)
            return False
        await self._db.delete(row)
        try:
            await self._db.commit()
            logger.info("memory: delete_core user=%s label=%s", user_id, label)
            return True
        except Exception as exc:
            logger.error("memory: delete_core failed user=%s label=%s: %s", user_id, label, exc)
@@ -151,47 +236,64 @@ class MemoryMiddleware:
            return False
    async def append_core(self, user_id: str, label: str, content: str) -> None:
        """Append content to a core block, creating it if missing."""
        current = await self.get_core_block(user_id, label)
        if current is None:
            await self.update_core(user_id, label, content)
            logger.info("memory: append_core user=%s label=%s created=1", user_id, label)
            return
        await self.update_core(user_id, label, f"{current}\n{content}")
        logger.info("memory: append_core user=%s label=%s created=0", user_id, label)
    async def replace_core(self, user_id: str, label: str, old: str, new: str) -> bool:
        """Replace one exact string inside a core block. Returns False if not found."""
        current = await self.get_core_block(user_id, label)
        if current is None or old not in current:
            logger.debug("memory: replace_core user=%s label=%s changed=0", user_id, label)
            return False
        await self.update_core(user_id, label, current.replace(old, new, 1))
        logger.info("memory: replace_core user=%s label=%s changed=1", user_id, label)
        return True
    async def insert_archival(self, user_id: str, content: str, source: str = "manual") -> None:
        """Insert a long-term archival memory entry."""
        fernet = await self._get_fernet(user_id)
        if fernet is None:
            return
        encrypted = _encrypt(fernet, content)
        row = MemoryAssociative(
-            id=str(uuid.uuid4()), user_id=user_id,
+            id=str(uuid.uuid4()),
-            content_encrypted=encrypted, embedding=None,
+            user_id=user_id,
-            entity_type=source, entity_id=None,
+            content_encrypted=encrypted,
            embedding=None,
            entity_type=source,
            entity_id=None,
        )
        self._db.add(row)
        try:
            await self._db.commit()
            logger.info("memory: insert_archival user=%s source=%s", user_id, source)
        except Exception as exc:
            logger.error("memory: insert_archival failed user=%s: %s", user_id, exc)
            await self._db.rollback()
    async def search_archival(self, user_id: str, query: str, top_k: int = 5) -> list[str]:
        """Search archival memory (keyword fallback; semantic ranking can replace this)."""
        fernet = await self._get_fernet(user_id)
        if fernet is None:
            return []
        result = await self._db.execute(
-            select(MemoryAssociative).where(MemoryAssociative.user_id == user_id)
+            select(MemoryAssociative)
-            .order_by(MemoryAssociative.updated_at.desc()).limit(100)
+            .where(MemoryAssociative.user_id == user_id)
            .order_by(MemoryAssociative.updated_at.desc())
            .limit(100)
        )
        rows = result.scalars().all()
        needle = query.strip().lower()
        out: list[str] = []
-        for row in result.scalars().all():
+        for row in rows:
            plaintext = _safe_decrypt(fernet, row.content_encrypted)
            if plaintext is None:
                continue
@@ -199,19 +301,25 @@ class MemoryMiddleware:
                out.append(plaintext)
            if len(out) >= max(top_k, 1):
                break
        logger.info("memory: search_archival user=%s query=%s hits=%d", user_id, query[:80], len(out))
        return out
    async def search_recall(self, user_id: str, query: str, top_k: int = 5) -> list[str]:
        """Search recall memory (episodic summaries) by keyword."""
        fernet = await self._get_fernet(user_id)
        if fernet is None:
            return []
        result = await self._db.execute(
-            select(MemoryEpisodic).where(MemoryEpisodic.user_id == user_id)
+            select(MemoryEpisodic)
-            .order_by(MemoryEpisodic.created_at.desc()).limit(100)
+            .where(MemoryEpisodic.user_id == user_id)
            .order_by(MemoryEpisodic.created_at.desc())
            .limit(100)
        )
        rows = result.scalars().all()
        needle = query.strip().lower()
        out: list[str] = []
-        for row in result.scalars().all():
+        for row in rows:
            plaintext = _safe_decrypt(fernet, row.summary_encrypted)
            if plaintext is None:
                continue
@@ -219,11 +327,13 @@ class MemoryMiddleware:
                out.append(plaintext)
            if len(out) >= max(top_k, 1):
                break
        logger.info("memory: search_recall user=%s query=%s hits=%d", user_id, query[:80], len(out))
        return out
-    # ── Private ───────────────────────────────────────────────────────
+    # ── Private helpers ───────────────────────────────────────────────────────
    async def _get_fernet(self, user_id: str) -> Fernet | None:
        """Load the user's Fernet key from DB. Returns None if missing."""
        result = await self._db.execute(select(User).where(User.id == user_id))
        user = result.scalar_one_or_none()
        if user is None or not user.encryption_key:
@@ -231,38 +341,68 @@ class MemoryMiddleware:
            return None
        return Fernet(user.encryption_key.encode())
    async def _get_user_debug(self, user_id: str) -> dict[str, str | None]:
        """Load lightweight user debug fields for trace logs."""
        result = await self._db.execute(select(User).where(User.id == user_id))
        user = result.scalar_one_or_none()
        if user is None:
            return {"tier": None}
        return {
            "tier": user.tier,
        }
    async def _load_core(self, user_id: str, fernet: Fernet) -> dict[str, str]:
        result = await self._db.execute(
            select(MemoryCore).where(MemoryCore.user_id == user_id)
        )
        rows = result.scalars().all()
        out: dict[str, str] = {}
-        for row in result.scalars().all():
+        for row in rows:
            plaintext = _safe_decrypt(fernet, row.value_encrypted)
            if plaintext is not None:
                out[row.key] = plaintext
        return out
-    async def _load_associative(self, user_id: str, message: str, fernet: Fernet) -> list[str]:
+    async def _load_associative(
        self, user_id: str, message: str, fernet: Fernet
    ) -> list[str]:
        """Load top-k associative memories.
        Production: uses pgvector cosine similarity on the message embedding.
        Current implementation: recency-based fallback (no external embedding call)
        so tests pass without a live OpenAI key; *message* is not used for ranking yet.
        """
        result = await self._db.execute(
-            select(MemoryAssociative).where(MemoryAssociative.user_id == user_id)
+            select(MemoryAssociative)
-            .order_by(MemoryAssociative.updated_at.desc()).limit(_ASSOCIATIVE_TOP_K)
+            .where(MemoryAssociative.user_id == user_id)
            .order_by(MemoryAssociative.updated_at.desc())
            .limit(_ASSOCIATIVE_TOP_K)
        )
        rows = result.scalars().all()
        out: list[str] = []
-        for row in result.scalars().all():
+        for row in rows:
            plaintext = _safe_decrypt(fernet, row.content_encrypted)
            if plaintext is not None:
                out.append(plaintext)
        return out
-    async def _load_episodic(self, user_id: str, fernet: Fernet, session_id: str | None = None) -> list[str]:
+    async def _load_episodic(
        self,
        user_id: str,
        fernet: Fernet,
        session_id: str | None = None,
    ) -> list[str]:
        query = select(MemoryEpisodic).where(MemoryEpisodic.user_id == user_id)
        if session_id:
            query = query.where(MemoryEpisodic.session_id == session_id)
        result = await self._db.execute(
-            query.order_by(MemoryEpisodic.created_at.desc()).limit(_EPISODIC_RECENT_N)
+            query
            .order_by(MemoryEpisodic.created_at.desc())
            .limit(_EPISODIC_RECENT_N)
        )
        rows = result.scalars().all()
        out: list[str] = []
-        for row in result.scalars().all():
+        for row in rows:
            plaintext = _safe_decrypt(fernet, row.summary_encrypted)
            if plaintext is not None:
                out.append(plaintext)
@@ -270,24 +410,30 @@ class MemoryMiddleware:
    async def _load_proactive(self, user_id: str, fernet: Fernet) -> list[str]:
        result = await self._db.execute(
-            select(MemoryProactive).where(
+            select(MemoryProactive)
            .where(
                MemoryProactive.user_id == user_id,
                MemoryProactive.confidence >= _PROACTIVE_CONFIDENCE_THRESHOLD,
            ).order_by(MemoryProactive.confidence.desc())
            )
            .order_by(MemoryProactive.confidence.desc())
        )
        rows = result.scalars().all()
        out: list[str] = []
-        for row in result.scalars().all():
+        for row in rows:
            plaintext = _safe_decrypt(fernet, row.pattern_encrypted)
            if plaintext is not None:
                out.append(plaintext)
        return out
 # ── Encryption helpers ────────────────────────────────────────────────────────
 def _encrypt(fernet: Fernet, plaintext: str) -> str:
    return fernet.encrypt(plaintext.encode()).decode()
 def _safe_decrypt(fernet: Fernet, ciphertext: str) -> str | None:
    """Decrypt and return plaintext, or None on error (corrupted/wrong key)."""
    try:
        return fernet.decrypt(ciphertext.encode()).decode()
    except (InvalidToken, Exception) as exc:
--- a/services/chat/app/output_formatter.py
+++ b/services/chat/app/output_formatter.py
@@ -1,14 +1,11 @@
-"""Output formatter for deep-agent stream events — Chat Service copy.
+"""Output formatter for deep-agent stream events."""
 Converts (event_type, data) tuples into WebSocket frame Pydantic models.
 """
 from __future__ import annotations
 from collections.abc import AsyncGenerator
 from typing import Any
-from shared.schemas import WsFloatingDomain, WsStreamEnd, WsStreamStart, WsStreamText
+from app.schemas import WsFloatingDomain, WsStreamEnd, WsStreamStart, WsStreamText
 WsFrame = WsStreamStart | WsStreamText | WsStreamEnd | WsFloatingDomain
--- a/app/core/ws_context.py
+++ b/app/core/ws_context.py
@@ -0,0 +1,92 @@
 """WebSocket client executor context.
 Holds a per-request async callback that tools call to execute CRUD
 operations on the Electron client's local SQLite / LanceDB databases.
 The callback sends a `tool_call` WS frame and awaits the `tool_result`.
 """
 from __future__ import annotations
 from contextvars import ContextVar
 from typing import Any, Callable, Coroutine
 from uuid import uuid4
 # Holds the execute callback for the current WS session.
 # Set by the chat WS handler before the orchestrator runs; cleared after.
 _client_executor: ContextVar[Callable[[dict], Coroutine[Any, Any, dict]]] = ContextVar(
    "_client_executor"
 )
 # Optional collector that captures raw execute_on_client results.
 # Set by _tool_loop / _tool_loop_stream to populate ChatAgent.tool_results.
 _tool_result_collector: ContextVar[list[dict] | None] = ContextVar(
    "_tool_result_collector", default=None
 )
 def set_tool_result_collector(lst: list[dict]) -> None:
    """Make *lst* the list that ``execute_on_client`` appends results to.

    The binding is stored in a ``ContextVar``, so it applies to the current
    context only.
    """
    _tool_result_collector.set(lst)
 def clear_tool_result_collector() -> None:
    """Unbind the result collector by resetting the ContextVar to ``None``."""
    _tool_result_collector.set(None)
 def set_client_executor(fn: Callable[[dict], Coroutine[Any, Any, dict]]) -> None:
    """Install *fn* as the callback ``execute_on_client`` awaits in this context.

    *fn* receives the ``tool_call`` payload dict and must return the
    ``tool_result`` dict.
    """
    _client_executor.set(fn)
 def clear_client_executor() -> None:
    """Drop the executor binding by storing ``None`` in the ContextVar.

    Best-effort: any exception raised while clearing is swallowed.  After
    this call ``execute_on_client`` raises ``RuntimeError`` in this context.
    """
    try:
        _client_executor.set(None)  # type: ignore[arg-type]
    except Exception:
        return
 async def execute_on_client(
    action: str,
    table: str | None = None,
    data: dict[str, Any] | None = None,
    filters: dict[str, Any] | None = None,
    vector: list[float] | None = None,
    limit: int | None = None,
 ) -> dict[str, Any]:
    """Run one CRUD/vector operation on the client and return its result.

    A payload with a fresh UUID ``id`` and the given ``action`` is built.
    Optional arguments are added only when they are not ``None``, and
    ``filters`` entries whose value is ``None`` are dropped.  The payload is
    passed to the executor bound for the current context and its return value
    is handed back unchanged.

    When a result collector is bound, ``{"action", "table", "data": result}``
    is appended to it as well.

    Raises:
        RuntimeError: If no executor is set (i.e. called outside a WS session).
    """
    executor = _client_executor.get(None)
    if executor is None:
        raise RuntimeError(
            "execute_on_client() called outside a WebSocket session — "
            "no client executor is set."
        )

    payload: dict[str, Any] = {"id": str(uuid4()), "action": action}
    optional_fields = {
        "table": table,
        "data": data,
        "filters": (
            None
            if filters is None
            else {key: value for key, value in filters.items() if value is not None}
        ),
        "vector": vector,
        "limit": limit,
    }
    # Only explicitly supplied fields are sent to the client.
    payload.update(
        {name: value for name, value in optional_fields.items() if value is not None}
    )

    result = await executor(payload)

    sink = _tool_result_collector.get(None)
    if sink is not None:
        sink.append({"action": action, "table": table, "data": result})
    return result
--- a/shared/db.py
+++ b/shared/db.py
@@ -1,7 +1,15 @@
-"""Database engine, session factory, and declarative base.
+"""Database engine, session factory, and base model.
-All services use the async SQLAlchemy API via ``get_session()``.
+All app code uses the async SQLAlchemy API.  Alembic migrations use the
-Alembic migrations use the synchronous psycopg2 URL (see alembic/env.py).
+synchronous psycopg2 URL for the CLI (see alembic/env.py).
 Usage in routes:
    from app.db import get_session
    from sqlalchemy.ext.asyncio import AsyncSession
    async def my_route(db: AsyncSession = Depends(get_session)):
        result = await db.execute(select(User).where(User.email == email))
        user = result.scalar_one_or_none()
 """
 from __future__ import annotations
@@ -11,7 +19,7 @@ from collections.abc import AsyncGenerator
 from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine
 from sqlalchemy.orm import DeclarativeBase
-from shared.config import settings
+from app.config.settings import settings
 engine = create_async_engine(
    settings.DATABASE_URL,
--- a/services/batch-agent/app/integrations/init.py
+++ b/services/batch-agent/app/integrations/init.py
@@ -1,11 +1,20 @@
 """Cloud provider integration utilities.
 Configuration is read from ``app.config.settings``.
 Provides:
-  * Shared message dataclasses (EmailMessage, ChatMessage)
+  * Shared message dataclasses (``EmailMessage``, ``ChatMessage``) used by
-  * get_provider() — factory for Gmail/MS Graph clients
+    both the Gmail and MS Graph clients and consumed by ``agent_runner``.
-  * encrypt_token() / decrypt_token() — Fernet-based OAuth token encryption
+  * ``get_provider()`` — factory that returns the correct client given a
    provider name and decrypted OAuth credentials dict.
  * ``encrypt_token()`` / ``decrypt_token()`` — Fernet-based at-rest
    encryption for OAuth tokens stored in ``cloud_agent_configs``.
 Encryption rationale
 --------------------
 Unlike user content (which is E2E-encrypted client-side and **never**
 decrypted server-side), OAuth tokens *must* be decrypted server-side
 because the backend makes provider API calls on behalf of the user.
 The Fernet key lives solely in ``OAUTH_ENCRYPTION_KEY`` env var — it
 is never returned to clients.
 """
 from __future__ import annotations
@@ -18,7 +27,7 @@ from typing import TYPE_CHECKING
 from cryptography.fernet import Fernet, InvalidToken
-from shared.config import settings
+from app.config.settings import settings
 if TYPE_CHECKING:
    from app.integrations.gmail import GmailClient
@@ -26,9 +35,13 @@ if TYPE_CHECKING:
 logger = logging.getLogger(__name__)
 # ── Shared message types ──────────────────────────────────────────────────
@dataclass
 class EmailMessage:
    """A single email message fetched from Gmail or Outlook."""
    id: str
    subject: str
    sender: str
@@ -38,6 +51,7 @@ class EmailMessage:
    @property
    def as_text(self) -> str:
        """Return a human-readable text representation for LLM extraction."""
        date_str = self.date.strftime("%Y-%m-%d %H:%M")
        labels_str = f" [{', '.join(self.labels)}]" if self.labels else ""
        return (
@@ -50,6 +64,8 @@ class EmailMessage:
@dataclass
 class ChatMessage:
    """A single Teams chat or channel message fetched from MS Graph."""
    id: str
    content: str
    sender: str
@@ -58,6 +74,7 @@ class ChatMessage:
    @property
    def as_text(self) -> str:
        """Return a human-readable text representation for LLM extraction."""
        date_str = self.date.strftime("%Y-%m-%d %H:%M")
        channel_str = f" [channel: {self.channel}]" if self.channel else ""
        return (
@@ -67,7 +84,15 @@ class ChatMessage:
        )
 # ── Fernet helpers ────────────────────────────────────────────────────────
 def _get_fernet() -> Fernet:
    """Return a ``Fernet`` instance using ``settings.OAUTH_ENCRYPTION_KEY``.
    Raises ``RuntimeError`` if ``OAUTH_ENCRYPTION_KEY`` is not set — callers
    must ensure this is configured before persisting OAuth tokens.
    """
    key = settings.OAUTH_ENCRYPTION_KEY
    if not key:
        raise RuntimeError(
@@ -78,6 +103,15 @@ def _get_fernet() -> Fernet:
 def encrypt_token(token_info: dict) -> str:
    """Fernet-encrypt an OAuth credential dict and return a base64 string.
    Stores the full ``{access_token, refresh_token, token_uri, client_id,
    client_secret, scopes, expiry}`` dict (or equivalent MSAL shape).
    Raises:
        RuntimeError: OAUTH_ENCRYPTION_KEY is not configured.
        ValueError: ``token_info`` is not a non-empty dict.
    """
    if not isinstance(token_info, dict) or not token_info:
        raise ValueError("token_info must be a non-empty dict")
    plaintext = json.dumps(token_info).encode("utf-8")
@@ -85,6 +119,13 @@ def encrypt_token(token_info: dict) -> str:
 def decrypt_token(encrypted: str) -> dict:
    """Decrypt a Fernet-encrypted token string and return the credential dict.
    Raises:
        RuntimeError: OAUTH_ENCRYPTION_KEY is not configured.
        ValueError: The encrypted string is invalid or was encrypted with a
            different key.
    """
    try:
        plaintext = _get_fernet().decrypt(encrypted.encode("utf-8"))
        return json.loads(plaintext)
@@ -92,10 +133,25 @@ def decrypt_token(encrypted: str) -> dict:
        raise ValueError(f"Failed to decrypt OAuth token: {exc}") from exc
 # ── Provider factory ──────────────────────────────────────────────────────
 def get_provider(
    provider: str,
    credentials_info: dict,
 ) -> "GmailClient | MSGraphClient":
    """Return the correct provider client for *provider*.
    Parameters
    ----------
    provider:
        One of ``"gmail"``, ``"outlook"``, ``"teams"``.
    credentials_info:
        Decrypted OAuth credential dict (Google or Microsoft shape).
    Raises:
        ValueError: Unknown provider name.
    """
    if provider == "gmail":
        from app.integrations.gmail import GmailClient
        return GmailClient(credentials_info)
--- a/services/batch-agent/app/integrations/gmail.py
+++ b/services/batch-agent/app/integrations/gmail.py
@@ -1,7 +1,26 @@
 """Gmail API client for cloud agent integration.
-Adapted for Batch Agent Service: import from app.integrations instead of
+Wraps the Google Gmail REST API to fetch email messages matching a
-app.integrations (same relative path within the service).
+``filter_config`` dict.  Uses the official ``google-api-python-client``
 library (synchronous) wrapped in ``asyncio.to_thread()`` to avoid
 blocking the event loop.
 Token refresh is handled transparently: when the stored access token has
 expired, ``google.auth.transport.requests.Request`` will use the refresh
 token to obtain a fresh one.  The caller is responsible for persisting
 any refreshed credentials back to ``CloudAgentConfig.oauth_token_encrypted``
 (see ``agent_runner.run_cloud_agent``).
 Credential dict shape (Google OAuth2):
    {
        "token": "<access_token>",
        "refresh_token": "<refresh_token>",
        "token_uri": "https://oauth2.googleapis.com/token",
        "client_id": "<client_id>",
        "client_secret": "<client_secret>",
        "scopes": ["https://www.googleapis.com/auth/gmail.readonly"],
        "expiry": "2025-01-01T00:00:00Z"  # optional ISO-8601
    }
 """
 from __future__ import annotations
@@ -19,8 +38,13 @@ from app.integrations import EmailMessage
 logger = logging.getLogger(__name__)
 # Gmail search date format — e.g. "after:2025/01/01"
 _GMAIL_DATE_FMT = "%Y/%m/%d"
 # Maximum characters of body text forwarded to the LLM.
 _BODY_TRUNCATE = 8_000
 # Maximum messages retrieved per run (prevents runaway quota usage).
 _MAX_MESSAGES = 200
@@ -28,9 +52,20 @@ def _build_gmail_query(
    filter_config: dict[str, Any] | None,
    since: datetime | None,
 ) -> str:
    """Build a Gmail search query string from *filter_config* and *since*.
    Supported ``filter_config`` keys:
        labels (list[str]):  Gmail label names, e.g. ``["INBOX", "work"]``
        senders (list[str]): Sender addresses or domains to include
        date_range (dict):   ``{from: "<YYYY-MM-DD>", to: "<YYYY-MM-DD>"}``
    A hard ``since`` date (from last run) always overrides ``date_range.from``
    when it is earlier.
    """
    parts: list[str] = []
    cfg = filter_config or {}
    # Labels — joined with OR when multiple given.
    labels: list[str] = cfg.get("labels", [])
    if labels:
        if len(labels) == 1:
@@ -39,14 +74,17 @@ def _build_gmail_query(
            label_expr = " OR ".join(f"label:{lbl}" for lbl in labels)
            parts.append(f"({label_expr})")
    # Senders — each prefixed with "from:".
    senders: list[str] = cfg.get("senders", [])
    for sender in senders:
        parts.append(f"from:{sender}")
    # Date range.
    date_range: dict = cfg.get("date_range", {})
    from_str: str | None = date_range.get("from")
    to_str: str | None = date_range.get("to")
    # Determine effective "from" date: most recent of filter_config.date_range.from and since.
    effective_since: datetime | None = since
    if from_str:
        try:
@@ -72,12 +110,18 @@ def _build_gmail_query(
 def _strip_html(raw_html: str) -> str:
    """Remove HTML tags and decode entities to get plain text."""
    no_tags = re.sub(r"<[^>]+>", " ", raw_html)
    decoded = html.unescape(no_tags)
    return re.sub(r"\s+", " ", decoded).strip()
 def _parse_body(payload: dict[str, Any]) -> str:
    """Recursively extract the plain-text body from a Gmail message payload.
    Prefers ``text/plain``; falls back to ``text/html`` (stripped of tags).
    Returns an empty string if no body can be extracted.
    """
    mime_type: str = payload.get("mimeType", "")
    body: dict = payload.get("body", {})
    parts: list[dict] = payload.get("parts", [])
@@ -95,6 +139,7 @@ def _parse_body(payload: dict[str, Any]) -> str:
            return _strip_html(raw)
        return ""
    # Multipart — prefer text/plain part, fall back to text/html.
    plain_fallback = ""
    for part in parts:
        part_mime = part.get("mimeType", "")
@@ -110,6 +155,7 @@ def _parse_body(payload: dict[str, Any]) -> str:
 def _parse_date(raw: str) -> datetime:
    """Parse an RFC 2822 email date header into a UTC ``datetime``."""
    try:
        parsed = email.utils.parsedate_to_datetime(raw)
        if parsed.tzinfo is None:
@@ -120,6 +166,16 @@ def _parse_date(raw: str) -> datetime:
 class GmailClient:
    """Fetch email messages from a Gmail account via the Gmail REST API.
    Parameters
    ----------
    credentials_info:
        Decrypted OAuth2 credential dict.  Must contain at minimum
        ``token`` (access token) or ``refresh_token`` + ``token_uri`` +
        ``client_id`` + ``client_secret``.
    """
    def __init__(self, credentials_info: dict[str, Any]) -> None:
        from google.oauth2.credentials import Credentials
@@ -144,20 +200,38 @@ class GmailClient:
            expiry=expiry,
        )
    # ── Public API ─────────────────────────────────────────────────────────
    async def fetch_messages(
        self,
        filter_config: dict[str, Any] | None = None,
        since: datetime | None = None,
    ) -> list[EmailMessage]:
        """Fetch emails matching *filter_config*, capped at ``_MAX_MESSAGES``.

        The search query is built from *filter_config* and *since*, then the
        blocking Google API work is handed to ``asyncio.to_thread()`` so the
        event loop is not blocked.

        An expired access token is refreshed as part of the fetch; consult
        ``self.refreshed_credentials`` afterwards to see whether updated
        credentials need to be persisted.
        """
        search_query = _build_gmail_query(filter_config, since)
        logger.debug("gmail: executing search query %r", search_query)
        emails = await asyncio.to_thread(self._fetch_sync, search_query)
        return emails
    @property
    def refreshed_credentials(self) -> dict[str, Any] | None:
        """Return updated credential dict if the access token was refreshed.
        If the credentials were refreshed during ``fetch_messages()``, returns
        a new dict that should be re-encrypted and written back to the DB.
        Returns ``None`` if no refresh occurred.
        """
        creds = self._credentials
        if not creds.valid and creds.expired:
            return None
        # Check whether the token changed from what was stored.
        if creds.token != self._credentials_info.get("token"):
            result = {
                "token": creds.token,
@@ -172,11 +246,15 @@ class GmailClient:
            return result
        return None
    # ── Internal sync worker ───────────────────────────────────────────────
    def _fetch_sync(self, query: str) -> list[EmailMessage]:
        """Synchronous worker — called inside ``asyncio.to_thread()``."""
        import googleapiclient.discovery
        import googleapiclient.errors
        from google.auth.transport.requests import Request
        # Refresh token if needed before building the service.
        if self._credentials.expired and self._credentials.refresh_token:
            try:
                self._credentials.refresh(Request())
@@ -186,8 +264,9 @@ class GmailClient:
        service = googleapiclient.discovery.build(
            "gmail", "v1", credentials=self._credentials, cache_discovery=False
        )
-        user_api = service.users()
+        user_api = service.users()  # type: ignore[attr-defined]
        # ── List matching message IDs ──────────────────────────────────────
        ids: list[str] = []
        page_token: str | None = None
        while len(ids) < _MAX_MESSAGES:
@@ -214,10 +293,12 @@ class GmailClient:
                break
        if not ids:
            logger.debug("gmail: no messages matched query %r", query)
            return []
        logger.info("gmail: fetching %d message(s)", len(ids))
        # ── Fetch individual message details ──────────────────────────────
        messages: list[EmailMessage] = []
        for msg_id in ids:
            try:
@@ -245,8 +326,10 @@ class GmailClient:
                    date=date,
                    labels=labels,
                ))
            except googleapiclient.errors.HttpError as exc:
                logger.warning("gmail: skipping message %s — HTTP error: %s", msg_id, exc)
            except Exception as exc:
-                logger.warning("gmail: skipping message %s: %s", msg_id, exc)
+                logger.warning("gmail: skipping message %s — unexpected error: %s", msg_id, exc)
        logger.info("gmail: returned %d message(s)", len(messages))
        return messages
--- a/services/batch-agent/app/integrations/ms_graph.py
+++ b/services/batch-agent/app/integrations/ms_graph.py
@@ -1,30 +1,52 @@
-"""Microsoft Graph API client for Outlook and Teams.
+"""Microsoft Graph API client for Outlook and Teams cloud agent integration.
-Adapted for Batch Agent Service: import settings from shared.config.
+Handles two data sources:
 * **Outlook email** (``provider="outlook"``) — ``fetch_emails()`` calls
  ``/me/messages`` with an OData ``$filter`` built from ``filter_config``.
 * **Teams messages** (``provider="teams"``) — ``fetch_messages()`` calls
  ``/me/chats/getAllMessages`` filtered by date.
 Authentication uses MSAL ``ConfidentialClientApplication`` to acquire a token
 from a stored refresh token.  The ``httpx.AsyncClient`` (already a project
 dependency) is used for all API calls.
 Credential dict shape (Microsoft OAuth2 / MSAL):
    {
        "access_token":  "<access_token>",
        "refresh_token": "<refresh_token>",
        "token_type":    "Bearer",
        "scope":         "Mail.Read ChannelMessage.Read.All offline_access",
        "expires_in":    3600
    }
 """
 from __future__ import annotations
 import logging
 import re
-from datetime import datetime, timezone
+from datetime import datetime, timedelta, timezone
 from typing import Any
 import httpx
-from shared.config import settings
+from app.config.settings import settings
 from app.integrations import ChatMessage, EmailMessage
 logger = logging.getLogger(__name__)
 _GRAPH_BASE = "https://graph.microsoft.com/v1.0"
 # Max items fetched per run.
 _MAX_EMAILS = 200
 _MAX_MESSAGES = 200
 # Max characters of body forwarded to the LLM.
 _BODY_TRUNCATE = 8_000
 def _strip_html(raw: str) -> str:
    """Strip HTML tags and collapse whitespace."""
    no_tags = re.sub(r"<[^>]+>", " ", raw)
    import html as _html
    decoded = _html.unescape(no_tags)
@@ -32,6 +54,7 @@ def _strip_html(raw: str) -> str:
 def _odata_datetime(dt: datetime) -> str:
    """Format a datetime as an OData datetime literal (UTC, ISO 8601)."""
    utc = dt.astimezone(timezone.utc)
    return utc.strftime("%Y-%m-%dT%H:%M:%SZ")
@@ -40,14 +63,29 @@ def _build_email_filter(
    filter_config: dict[str, Any] | None,
    since: datetime | None,
 ) -> str:
    """Build an OData ``$filter`` expression for the ``/me/messages`` endpoint.
    Supported ``filter_config`` keys:
        senders (list[str]):  Sender email addresses.
        date_range (dict):    ``{from: "<ISO-8601>", to: "<ISO-8601>"}``
        folders (list[str]):  Folder display names (not directly filterable
                              via OData, so ignored here — callers iterate
                              folder IDs separately if needed; listed for
                              completeness).
    A hard ``since`` date always overrides ``date_range.from`` when it is
    earlier.
    """
    clauses: list[str] = []
    cfg = filter_config or {}
    # Senders.
    senders: list[str] = cfg.get("senders", [])
    if senders:
        sender_clauses = [f"from/emailAddress/address eq '{s}'" for s in senders]
        clauses.append("(" + " or ".join(sender_clauses) + ")")
    # Date range.
    date_range: dict = cfg.get("date_range", {})
    from_str: str | None = date_range.get("from")
@@ -79,16 +117,33 @@ def _build_email_filter(
 class MSGraphClient:
    """Fetch emails and Teams messages via the Microsoft Graph REST API.
    Parameters
    ----------
    credentials_info:
        Decrypted MSAL credential dict.
    """
    def __init__(self, credentials_info: dict[str, Any]) -> None:
        self._credentials_info = credentials_info
        self._access_token: str = credentials_info.get("access_token", "")
        self._original_access_token: str = self._access_token
        self._refresh_token: str | None = credentials_info.get("refresh_token")
    # ── Token management ───────────────────────────────────────────────────
    def _auth_headers(self) -> dict[str, str]:
        return {"Authorization": f"Bearer {self._access_token}"}
    async def _refresh_access_token(self) -> None:
        """Use MSAL to exchange the refresh token for a fresh access token.
        Updates ``self._access_token`` and ``self._credentials_info`` in-place.
        Raises:
            RuntimeError: MSAL reports an auth error.
        """
        import msal
        app = msal.ConfidentialClientApplication(
@@ -109,6 +164,7 @@ class MSGraphClient:
            raise RuntimeError(f"MS Graph token refresh failed: {error}")
        self._access_token = result["access_token"]
        # MSAL may issue a new refresh token.
        if "refresh_token" in result:
            self._refresh_token = result["refresh_token"]
            self._credentials_info["refresh_token"] = result["refresh_token"]
@@ -116,10 +172,16 @@ class MSGraphClient:
    @property
    def refreshed_credentials(self) -> dict[str, Any] | None:
        """Return updated credential dict if the access token was refreshed.
        Returns ``None`` if no change was made.
        """
        if self._access_token != self._original_access_token:
            return {**self._credentials_info, "access_token": self._access_token}
        return None
    # ── HTTP helpers ───────────────────────────────────────────────────────
    async def _get(
        self,
        client: httpx.AsyncClient,
@@ -128,8 +190,10 @@ class MSGraphClient:
        *,
        retry_on_401: bool = True,
    ) -> dict[str, Any]:
        """GET *url* with auth; refresh token on 401 and retry once."""
        resp = await client.get(url, params=params, headers=self._auth_headers())
        if resp.status_code == 401 and retry_on_401 and self._refresh_token:
            logger.debug("ms_graph: 401 on %s — refreshing token", url)
            await self._refresh_access_token()
            resp = await client.get(url, params=params, headers=self._auth_headers())
        if resp.status_code == 429:
@@ -137,11 +201,22 @@ class MSGraphClient:
        resp.raise_for_status()
        return resp.json()
    # ── Public API ─────────────────────────────────────────────────────────
    async def fetch_emails(
        self,
        filter_config: dict[str, Any] | None = None,
        since: datetime | None = None,
    ) -> list[EmailMessage]:
        """Return up to ``_MAX_EMAILS`` Outlook messages matching *filter_config*.
        Parameters
        ----------
        filter_config:
            Optional dict with ``senders``, ``date_range``, ``folders`` keys.
        since:
            Hard lower-bound on email date (from last agent run).
        """
        odata_filter = _build_email_filter(filter_config, since)
        params: dict[str, Any] = {
            "$top": 50,
@@ -162,7 +237,7 @@ class MSGraphClient:
                    if len(emails) >= _MAX_EMAILS:
                        break
                url = data.get("@odata.nextLink", "")
-                params = {}
+                params = {}  # nextLink already contains encoded params.
        logger.info("ms_graph: fetched %d Outlook email(s)", len(emails))
        return emails
@@ -172,6 +247,13 @@ class MSGraphClient:
        filter_config: dict[str, Any] | None = None,
        since: datetime | None = None,
    ) -> list[ChatMessage]:
        """Return up to ``_MAX_MESSAGES`` Teams messages matching *filter_config*.
        Fetches from ``/me/chats/getAllMessages`` (personal + group chats).
        The ``filter_config.channels`` key is checked as a text-filter on
        the channel name post-fetch (the API doesn't support channel OData
        filter directly on ``getAllMessages``).
        """
        cfg = filter_config or {}
        channel_filter: list[str] = [c.lower() for c in cfg.get("channels", [])]
        params: dict[str, Any] = {"$top": 50}
@@ -186,9 +268,11 @@ class MSGraphClient:
                try:
                    data = await self._get(client, url, params if url.startswith(_GRAPH_BASE) else None)
                except httpx.HTTPStatusError as exc:
                    # getAllMessages requires specific licensing; degrade gracefully.
                    if exc.response.status_code in (403, 404):
                        logger.warning(
-                            "ms_graph: /me/chats/getAllMessages not available (%d)",
+                            "ms_graph: /me/chats/getAllMessages not available (%d) — "
                            "check Teams license or permissions",
                            exc.response.status_code,
                        )
                        break
@@ -208,6 +292,8 @@ class MSGraphClient:
        logger.info("ms_graph: fetched %d Teams message(s)", len(messages))
        return messages
    # ── Parsers ────────────────────────────────────────────────────────────
    @staticmethod
    def _parse_email(item: dict[str, Any]) -> EmailMessage:
        subject: str = item.get("subject", "(no subject)") or "(no subject)"
--- a/app/main.py
+++ b/app/main.py
@@ -0,0 +1,68 @@
 from contextlib import asynccontextmanager
 import logging
 from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
 # Configure root logging at import time, before the application modules
 # below are imported.
 # NOTE(review): presumably so that records logged while those modules are
 # being imported already use this format — confirm.
 logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(name)s: %(message)s",
 )
 # Raise the threshold of SQLAlchemy's engine and pool loggers to WARNING so
 # their INFO/DEBUG output is suppressed despite the INFO root level.
 logging.getLogger("sqlalchemy.engine").setLevel(logging.WARNING)
 logging.getLogger("sqlalchemy.pool").setLevel(logging.WARNING)
 from app.api.middleware.rate_limit import TierRateLimitMiddleware
 from app.api.middleware.sanitizer import SanitizerMiddleware
 from app.config.settings import settings
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan hook: load agent tools, then release the DB pool.

    Startup imports ``app.agents`` for its import-time side effects.
    Shutdown disposes the SQLAlchemy engine's connection pool.

    Parameters
    ----------
    app:
        The FastAPI application this lifespan is attached to (unused here).
    """
    # Startup: ensure agent tool modules are loaded.  The alias keeps the
    # import from rebinding the ``app`` parameter to the ``app`` package.
    import app.agents as _agent_tools  # noqa: F401

    try:
        yield
    finally:
        # Shutdown: dispose SQLAlchemy connection pool.  Done in ``finally``
        # so the pool is released even when an exception or cancellation is
        # raised into this generator at the ``yield``.
        from app.db import engine

        await engine.dispose()
 def create_app() -> FastAPI:
    """Build and return the configured FastAPI application.

    Sets the API metadata (interactive docs are served only when
    ``settings.ENV == "dev"``), registers the middleware stack, mounts every
    route module under ``/api/v1`` and adds a health-check endpoint.
    """
    docs_path = "/docs" if settings.ENV == "dev" else None
    api = FastAPI(
        title="Adiuva Cloud API",
        version="0.1.0",
        docs_url=docs_path,
        redoc_url=None,
        lifespan=lifespan,
    )

    # Middleware registration order matters: Starlette inserts each new
    # middleware at position 0, so the one added last ends up outermost.
    # Request flow:  TierRateLimit → Sanitizer → CORS → Router
    # Response flow: Router → CORS → Sanitizer → TierRateLimit
    api.add_middleware(
        CORSMiddleware,
        allow_origins=settings.CORS_ORIGINS,
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )
    api.add_middleware(SanitizerMiddleware)
    api.add_middleware(TierRateLimitMiddleware)

    # Route modules are imported here, inside the factory, and mounted in a
    # fixed order under the shared version prefix.
    from app.api.routes import agents, auth, billing, chat, device_ws

    for route_module in (auth, chat, billing, agents, device_ws):
        api.include_router(route_module.router, prefix="/api/v1")

    @api.get("/api/v1/health", tags=["health"])
    async def health() -> dict:
        return {"status": "ok", "version": api.version}

    return api
 # Module-level application instance, built once when this module is imported.
 # NOTE(review): presumably the object the ASGI server is pointed at
 # (e.g. ``app.main:app``) — confirm against the deployment config.
 app = create_app()
--- a/shared/models.py
+++ b/shared/models.py
@@ -1,14 +1,19 @@
 """SQLAlchemy ORM models for all persistent tables.
-Centralized here so that Alembic migrations and all services share
+Only auth, billing, agent config, and memory data live here.
-the same model definitions.  Each service only queries the tables it owns.
+User content (notes, tasks, etc.) lives exclusively on the client.
-Ownership:
+Table inventory:
-  Auth Service      → users, refresh_tokens, subscriptions
+  users               — account credentials + tier
-  Chat Service      → memory_core, memory_associative, memory_episodic, memory_proactive
+  refresh_tokens      — hashed refresh token store
-  Batch Agent       → local_agent_configs, cloud_agent_configs, agent_run_logs
+  subscriptions       — Stripe subscription records
-  Billing Service   → subscriptions (shared write with Auth)
+  local_agent_configs — per-device batch agent configs
-  (excluded MVP)    → storage_records, backup_metadata, plugins, plugin_*, revenue_events
+  cloud_agent_configs — OAuth-backed cloud agent configs
  agent_run_logs      — execution history for all agents
  memory_core         — per-user persistent key/value preferences (encrypted)
  memory_associative  — per-user semantic memory with embeddings (encrypted)
  memory_episodic     — per-user session summaries (encrypted)
  memory_proactive    — per-user behavioral patterns (encrypted)
 """
 from __future__ import annotations
@@ -17,7 +22,6 @@ import uuid
 from datetime import datetime, timezone
 from sqlalchemy import (
    BigInteger,
    Boolean,
    DateTime,
    Enum,
@@ -27,13 +31,12 @@ from sqlalchemy import (
    JSON,
    String,
    Text,
    UniqueConstraint,
    Uuid,
    func,
 )
 from sqlalchemy.orm import Mapped, mapped_column, relationship
-from shared.db import Base
+from app.db import Base
 # ── Helpers ──────────────────────────────────────────────────────────────
@@ -49,14 +52,12 @@ def _now() -> datetime:
 # ── Enum types ────────────────────────────────────────────────────────────
 TierEnum = Enum("free", "pro", "power", "team", name="billing_tier")
 PluginStatusEnum = Enum("pending_review", "approved", "rejected", name="plugin_status")
 ReviewDecisionEnum = Enum("approved", "rejected", name="review_decision")
 AgentTypeEnum = Enum("local", "cloud", name="agent_type")
 AgentStatusEnum = Enum("running", "success", "error", "partial", name="agent_run_status")
 CloudProviderEnum = Enum("gmail", "teams", "outlook", name="cloud_provider")
-# ── Auth models ───────────────────────────────────────────────────────────
+# ── Models ────────────────────────────────────────────────────────────────
 class User(Base):
@@ -71,6 +72,8 @@ class User(Base):
    password_hash: Mapped[str] = mapped_column(String(255), nullable=False)
    tier: Mapped[str] = mapped_column(TierEnum, nullable=False, default="free")
    stripe_customer_id: Mapped[str | None] = mapped_column(String(255), nullable=True)
    # Per-user Fernet key (base64-urlsafe, 44 chars). Generated on registration.
    # Used to encrypt/decrypt all memory rows for this user.
    encryption_key: Mapped[str | None] = mapped_column(String(64), nullable=True)
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), nullable=False, server_default=func.now()
@@ -126,159 +129,6 @@ class Subscription(Base):
    user: Mapped[User] = relationship(back_populates="subscription")
 # ── Storage models (excluded from MVP, kept for Alembic) ──────────────
 class StorageRecord(Base):
    __tablename__ = "storage_records"
    id: Mapped[str] = mapped_column(
        Uuid(as_uuid=False), primary_key=True, default=_uuid
    )
    user_id: Mapped[str] = mapped_column(
        Uuid(as_uuid=False), ForeignKey("users.id", ondelete="CASCADE"), nullable=False, index=True
    )
    table_name: Mapped[str] = mapped_column(String(100), nullable=False)
    s3_key: Mapped[str] = mapped_column(String(500), nullable=False)
    checksum: Mapped[str] = mapped_column(String(64), nullable=False)
    size_bytes: Mapped[int] = mapped_column(Integer, nullable=False)
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), nullable=False, server_default=func.now()
    )
    updated_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), nullable=False, server_default=func.now(), onupdate=func.now()
    )
 class BackupMetadata(Base):
    __tablename__ = "backup_metadata"
    id: Mapped[str] = mapped_column(
        Uuid(as_uuid=False), primary_key=True, default=_uuid
    )
    user_id: Mapped[str] = mapped_column(
        Uuid(as_uuid=False), ForeignKey("users.id", ondelete="CASCADE"), nullable=False, index=True
    )
    s3_key: Mapped[str] = mapped_column(String(500), nullable=False)
    version: Mapped[int] = mapped_column(Integer, nullable=False)
    timestamp: Mapped[int] = mapped_column(BigInteger, nullable=False)
    checksum: Mapped[str] = mapped_column(String(64), nullable=False)
    size_bytes: Mapped[int] = mapped_column(Integer, nullable=False)
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), nullable=False, server_default=func.now()
    )
 # ── Plugin models (excluded from MVP, kept for Alembic) ───────────────
 class Plugin(Base):
    __tablename__ = "plugins"
    id: Mapped[str] = mapped_column(String(255), primary_key=True)
    name: Mapped[str] = mapped_column(String(255), nullable=False)
    description: Mapped[str] = mapped_column(Text, nullable=False, default="")
    version: Mapped[str] = mapped_column(String(50), nullable=False, default="1.0.0")
    author_id: Mapped[str | None] = mapped_column(
        Uuid(as_uuid=False), ForeignKey("users.id", ondelete="SET NULL"), nullable=True
    )
    author_name: Mapped[str] = mapped_column(String(255), nullable=False, default="")
    category: Mapped[str] = mapped_column(String(100), nullable=False, default="")
    price_cents: Mapped[int] = mapped_column(Integer, nullable=False, default=0)
    permissions: Mapped[str] = mapped_column(Text, nullable=False, default="[]")
    status: Mapped[str] = mapped_column(PluginStatusEnum, nullable=False, default="pending_review")
    s3_package_key: Mapped[str | None] = mapped_column(String(500), nullable=True)
    install_count: Mapped[int] = mapped_column(Integer, nullable=False, default=0)
    avg_rating: Mapped[float] = mapped_column(Float, nullable=False, default=0.0)
    rejection_reason: Mapped[str | None] = mapped_column(Text, nullable=True)
    submitted_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), nullable=False, server_default=func.now()
    )
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), nullable=False, server_default=func.now()
    )
    installations: Mapped[list[PluginInstallation]] = relationship(
        back_populates="plugin", cascade="all, delete-orphan"
    )
    reviews: Mapped[list[PluginReview]] = relationship(
        back_populates="plugin", cascade="all, delete-orphan"
    )
    revenue_events: Mapped[list[RevenueEvent]] = relationship(
        back_populates="plugin", cascade="all, delete-orphan"
    )
 class PluginInstallation(Base):
    __tablename__ = "plugin_installations"
    __table_args__ = (UniqueConstraint("plugin_id", "user_id", name="uq_plugin_user"),)
    id: Mapped[str] = mapped_column(
        Uuid(as_uuid=False), primary_key=True, default=_uuid
    )
    plugin_id: Mapped[str] = mapped_column(
        String(255), ForeignKey("plugins.id", ondelete="CASCADE"), nullable=False, index=True
    )
    user_id: Mapped[str] = mapped_column(
        Uuid(as_uuid=False), ForeignKey("users.id", ondelete="CASCADE"), nullable=False, index=True
    )
    installed_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), nullable=False, server_default=func.now()
    )
    plugin: Mapped[Plugin] = relationship(back_populates="installations")
 class PluginReview(Base):
    __tablename__ = "plugin_reviews"
    id: Mapped[str] = mapped_column(
        Uuid(as_uuid=False), primary_key=True, default=_uuid
    )
    plugin_id: Mapped[str] = mapped_column(
        String(255), ForeignKey("plugins.id", ondelete="CASCADE"), nullable=False, index=True
    )
    reviewer_id: Mapped[str | None] = mapped_column(
        Uuid(as_uuid=False), ForeignKey("users.id", ondelete="SET NULL"), nullable=True
    )
    decision: Mapped[str] = mapped_column(ReviewDecisionEnum, nullable=False)
    notes: Mapped[str | None] = mapped_column(Text, nullable=True)
    reviewed_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), nullable=False, server_default=func.now()
    )
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), nullable=False, server_default=func.now()
    )
    plugin: Mapped[Plugin] = relationship(back_populates="reviews")
 class RevenueEvent(Base):
    __tablename__ = "revenue_events"
    id: Mapped[str] = mapped_column(
        Uuid(as_uuid=False), primary_key=True, default=_uuid
    )
    plugin_id: Mapped[str] = mapped_column(
        String(255), ForeignKey("plugins.id", ondelete="CASCADE"), nullable=False, index=True
    )
    user_id: Mapped[str] = mapped_column(
        Uuid(as_uuid=False), ForeignKey("users.id", ondelete="CASCADE"), nullable=False, index=True
    )
    amount_cents: Mapped[int] = mapped_column(Integer, nullable=False, default=0)
    developer_share_cents: Mapped[int] = mapped_column(Integer, nullable=False, default=0)
    stripe_transfer_id: Mapped[str | None] = mapped_column(String(255), nullable=True)
    paid_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True), nullable=True)
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), nullable=False, server_default=func.now()
    )
    plugin: Mapped[Plugin] = relationship(back_populates="revenue_events")
 # ── Agent models ──────────────────────────────────────────────────────────
 class LocalAgentConfig(Base):
    __tablename__ = "local_agent_configs"
@@ -353,6 +203,8 @@ class AgentRunLog(Base):
    id: Mapped[str] = mapped_column(
        Uuid(as_uuid=False), primary_key=True, default=_uuid
    )
    # Plain string — not a FK because it references either local_agent_configs or cloud_agent_configs
    # depending on agent_type. Query by (agent_id, agent_type) to locate the source config.
    agent_id: Mapped[str] = mapped_column(String(255), nullable=False, index=True)
    agent_type: Mapped[str] = mapped_column(AgentTypeEnum, nullable=False)
    user_id: Mapped[str] = mapped_column(
@@ -381,11 +233,15 @@ class AgentRunLog(Base):
    )
-# ── Memory models ─────────────────────────────────────────────────────────
+# ── Memory models ─────────────────────────────────────────────────────────────
 class MemoryCore(Base):
-    """Per-user persistent key/value preferences, encrypted at rest."""
+    """Per-user persistent key/value preferences, encrypted at rest.
    Examples: preferred_language, timezone, work_style.
    Decrypted in-memory only using User.encryption_key.
    """
    __tablename__ = "memory_core"
@@ -402,7 +258,11 @@ class MemoryCore(Base):
 class MemoryAssociative(Base):
-    """Per-user semantic memory: encrypted content + pgvector embedding."""
+    """Per-user semantic memory: encrypted content + pgvector embedding for similarity search.
    Production: ``embedding`` column is ``vector(1536)`` via pgvector.
    Tests (SQLite): stored as JSON list.
    """
    __tablename__ = "memory_associative"
@@ -412,6 +272,7 @@ class MemoryAssociative(Base):
        nullable=False, index=True,
    )
    content_encrypted: Mapped[str] = mapped_column(Text, nullable=False)
    # JSON-encoded float list in SQLite tests; vector(1536) in Postgres via migration.
    embedding: Mapped[list | None] = mapped_column(JSON, nullable=True)
    entity_type: Mapped[str | None] = mapped_column(String(100), nullable=True)
    entity_id: Mapped[str | None] = mapped_column(String(255), nullable=True)
@@ -421,7 +282,10 @@ class MemoryAssociative(Base):
 class MemoryEpisodic(Base):
-    """Per-user session summaries, encrypted at rest."""
+    """Per-user session summaries, encrypted at rest.
    One row per session interaction; used to recall recent conversations.
    """
    __tablename__ = "memory_episodic"
@@ -438,7 +302,11 @@ class MemoryEpisodic(Base):
 class MemoryProactive(Base):
-    """Per-user inferred behavioral patterns, encrypted at rest."""
+    """Per-user inferred behavioral patterns, encrypted at rest.
    Confidence in [0.0, 1.0]; only patterns above threshold are injected.
    Source: 'inferred' (from episodes) or 'explicit' (user-stated).
    """
    __tablename__ = "memory_proactive"
--- a/shared/schemas.py
+++ b/shared/schemas.py
@@ -1,7 +1,6 @@
 """Pydantic schemas — API request/response contracts.
-Shared across all services. Mirrors the TypeScript types from
+Mirrors the TypeScript types from the Electron app (src/shared/api-types.ts).
 the Electron app (src/shared/api-types.ts).
 """
 from __future__ import annotations
@@ -51,88 +50,6 @@ class ChatResponse(BaseModel):
    response: str
 # ── Backup ───────────────────────────────────────────────────────────
 class BackupMetadata(BaseModel):
    """Metadata describing a backup: version, timestamp, checksum and chunk count."""
    version: int
    timestamp: int  # integer timestamp; unit not visible here — presumably epoch ms, TODO confirm
    checksum: str
    chunk_count: int
 # ── Cloud Storage (E2E encrypted blobs) ──────────────────────────────
 class StorageRecord(BaseModel):
    """A cloud-storage record: a binary ``blob`` with its owner, checksum and timestamps."""
    id: str
    user_id: str
    table: str  # presumably the logical table the blob belongs to — TODO confirm
    blob: bytes
    checksum: str
    created_at: int  # integer timestamps; unit not visible here — TODO confirm
    updated_at: int
 class StorageRecordCreate(BaseModel):
    """Request payload for creating a storage record: target table, blob and checksum."""
    table: str
    blob: bytes
    checksum: str
 class StorageRecordUpdate(BaseModel):
    """Request payload for updating a storage record: replacement blob and its checksum."""
    blob: bytes
    checksum: str
 # ── Cloud Vector Store (E2E encrypted vectors) ────────────────────────
 class VectorItem(BaseModel):
    """A single vector-store entry: identifier, binary ``blob`` and checksum."""
    id: str
    blob: bytes
    checksum: str
 class VectorUpsertRequest(BaseModel):
    """Request payload carrying a list of ``VectorItem`` entries to upsert."""
    vectors: list[VectorItem]
 class VectorSearchRequest(BaseModel):
    """Request payload for a vector search: a binary query and a result limit."""
    query_blob: bytes
    top_k: int = 10  # defaults to 10 when omitted
 class VectorSearchResult(BaseModel):
    """One search hit: entry id, float score and the entry's binary ``blob``."""
    id: str
    score: float  # score scale/direction is not visible here — TODO confirm
    blob: bytes
 class VectorSearchResponse(BaseModel):
    """Response payload wrapping the list of ``VectorSearchResult`` hits."""
    results: list[VectorSearchResult]
 # ── Plugin Marketplace ────────────────────────────────────────────────
 class PluginManifest(BaseModel):
    """Descriptor of a marketplace plugin: identity, authorship, permissions and price."""
    id: str
    name: str
    description: str
    version: str
    author: str
    permissions: list[str]
    category: str
    price_cents: int = 0  # defaults to 0 when omitted; presumably meaning "free" — TODO confirm
 class PluginListResponse(BaseModel):
    """Response payload for a plugin listing: manifests plus ``total`` and ``page`` counters."""
    plugins: list[PluginManifest]
    total: int
    page: int  # whether pages are 0- or 1-based is not visible here — TODO confirm
 class PluginInstallRequest(BaseModel):
    """Request payload naming the plugin to install."""
    plugin_id: str
 # ── WebSocket Frame Protocol ──────────────────────────────────────────
 class WsFrameType(str, Enum):
@@ -210,10 +127,11 @@ class WsDeviceHello(BaseModel):
    agent_ids: list[str] = Field(default_factory=list)
 # ── WebSocket v3 Frame Models ─────────────────────────────────────────
 class WsFloatingScope(BaseModel):
-    """Scope for a floating request."""
+    """Scope for a floating request — narrows the agent to a specific entity."""
    type: Literal["task", "project", "note", "timeline"]
    id: str | None = None
@@ -295,7 +213,7 @@ class AgentCreationCheckResponse(BaseModel):
 class AgentTriggerRequest(BaseModel):
    directory: str = Field(min_length=1)
    device_id: str = Field(default="")
-    agent_id: str | None = None
+    agent_id: str | None = None  # FE stable agent ID (electron-store UUID)
    what_to_extract: list[str] = Field(min_length=1)
    actions_by_type: dict[str, list[str]] | None = None
    batch_interval: str = Field(min_length=1)
@@ -315,3 +233,7 @@ class AgentRunLogResponse(BaseModel):
    errors: list[str]
    started_at: int
    completed_at: int | None
 # ── Chatbot Journey ───────────────────────────────────────────────────
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,34 +1,27 @@
 # ── Adiuva Microservices ─────────────────────────────────────────────
 # docker compose up --build
 # docker compose up --build auth ws-gateway chat   # subset
 services:
-
+  app:
-  # ═══════════════════════════════════════════════════════════════════
+    build: .
  #  Infrastructure
  # ═══════════════════════════════════════════════════════════════════
  traefik:
    image: traefik:v3.1
    ports:
-      - "80:80"
+      - "8080:8000"
-      - "443:443"
+    env_file:
-      - "8080:8080"   # dashboard (dev only)
+      - path: .env
        required: false
    environment:
-      CF_DNS_API_TOKEN: ${CF_DNS_API_TOKEN:-}
+      DATABASE_URL: postgresql+asyncpg://postgres:postgres@db:5432/adiuva
      GITHUB_COPILOT_TOKEN_DIR: /root/.config/litellm/github_copilot
    volumes:
-      - /var/run/docker.sock:/var/run/docker.sock:ro
+      - copilot_tokens:/root/.config/litellm/github_copilot
-      - ./traefik/traefik.yml:/etc/traefik/traefik.yml:ro
+    depends_on:
-      - ./traefik/dynamic:/etc/traefik/dynamic:ro
+      db:
-      - traefik_acme:/etc/traefik/acme
+        condition: service_healthy
    restart: unless-stopped
  db:
    image: pgvector/pgvector:pg16
    environment:
-      POSTGRES_USER: ${POSTGRES_USER:-postgres}
+      POSTGRES_USER: postgres
-      POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-postgres}
+      POSTGRES_PASSWORD: postgres
-      POSTGRES_DB: ${POSTGRES_DB:-adiuva}
+      POSTGRES_DB: adiuva
    volumes:
      - postgres_data:/var/lib/postgresql/data
    healthcheck:
@@ -38,161 +31,11 @@ services:
      retries: 5
    restart: unless-stopped
-  redis:
+  # Optional Redis for future rate-limit or caching needs
-    image: redis:7-alpine
+  # redis:
-    command: redis-server --maxmemory 256mb --maxmemory-policy allkeys-lru
+  #   image: redis:7-alpine
    volumes:
      - redis_data:/data
    healthcheck:
      test: ["CMD", "redis-cli", "ping"]
      interval: 5s
      timeout: 3s
      retries: 5
    restart: unless-stopped
  # ── Optional infrastructure (uncomment as needed) ────────────────
  # minio:
  #   image: minio/minio:latest
  #   command: server /data --console-address ":9001"
  #   ports:
  #     - "9000:9000"
  #     - "9001:9001"
  #   environment:
  #     MINIO_ROOT_USER: minioadmin
  #     MINIO_ROOT_PASSWORD: minioadmin
  #   volumes:
  #     - minio_data:/data
  #   healthcheck:
  #     test: ["CMD", "mc", "ready", "local"]
  #     interval: 5s
  #     timeout: 5s
  #     retries: 5
  #   restart: unless-stopped
  # qdrant:
  #   image: qdrant/qdrant:latest
  #   ports:
  #     - "6333:6333"
  #     - "6334:6334"
  #   volumes:
  #     - qdrant_data:/qdrant/storage
  #   restart: unless-stopped
  # ═══════════════════════════════════════════════════════════════════
  #  Migrations (run once, then exit)
  # ═══════════════════════════════════════════════════════════════════
  migrate:
    build:
      context: .
      dockerfile: Dockerfile
    command: ["python", "-m", "alembic", "upgrade", "head"]
    env_file:
      - path: .env
        required: false
    environment:
      DATABASE_URL: postgresql+asyncpg://${POSTGRES_USER:-postgres}:${POSTGRES_PASSWORD:-postgres}@db:5432/${POSTGRES_DB:-adiuva}
    depends_on:
      db:
        condition: service_healthy
    restart: "no"
  # ═══════════════════════════════════════════════════════════════════
  #  Application Services
  # ═══════════════════════════════════════════════════════════════════
  auth:
    build:
      context: .
      dockerfile: services/auth/Dockerfile
    env_file:
      - path: .env
        required: false
    environment:
      DATABASE_URL: postgresql+asyncpg://${POSTGRES_USER:-postgres}:${POSTGRES_PASSWORD:-postgres}@db:5432/${POSTGRES_DB:-adiuva}
      REDIS_URL: redis://redis:6379/0
    depends_on:
      db:
        condition: service_healthy
      migrate:
        condition: service_completed_successfully
    restart: unless-stopped
  ws-gateway:
    build:
      context: .
      dockerfile: services/ws-gateway/Dockerfile
    env_file:
      - path: .env
        required: false
    environment:
      DATABASE_URL: postgresql+asyncpg://${POSTGRES_USER:-postgres}:${POSTGRES_PASSWORD:-postgres}@db:5432/${POSTGRES_DB:-adiuva}
      REDIS_URL: redis://redis:6379/0
    depends_on:
      redis:
        condition: service_healthy
      auth:
        condition: service_started
    restart: unless-stopped
  chat:
    build:
      context: .
      dockerfile: services/chat/Dockerfile
    env_file:
      - path: .env
        required: false
    environment:
      DATABASE_URL: postgresql+asyncpg://${POSTGRES_USER:-postgres}:${POSTGRES_PASSWORD:-postgres}@db:5432/${POSTGRES_DB:-adiuva}
      REDIS_URL: redis://redis:6379/0
    depends_on:
      db:
        condition: service_healthy
      redis:
        condition: service_healthy
      migrate:
        condition: service_completed_successfully
    restart: unless-stopped
  batch-agent:
    build:
      context: .
      dockerfile: services/batch-agent/Dockerfile
    env_file:
      - path: .env
        required: false
    environment:
      DATABASE_URL: postgresql+asyncpg://${POSTGRES_USER:-postgres}:${POSTGRES_PASSWORD:-postgres}@db:5432/${POSTGRES_DB:-adiuva}
      REDIS_URL: redis://redis:6379/0
    depends_on:
      db:
        condition: service_healthy
      redis:
        condition: service_healthy
      migrate:
        condition: service_completed_successfully
    restart: unless-stopped
  billing:
    build:
      context: .
      dockerfile: services/billing/Dockerfile
    env_file:
      - path: .env
        required: false
    environment:
      DATABASE_URL: postgresql+asyncpg://${POSTGRES_USER:-postgres}:${POSTGRES_PASSWORD:-postgres}@db:5432/${POSTGRES_DB:-adiuva}
    depends_on:
      db:
        condition: service_healthy
      migrate:
        condition: service_completed_successfully
    restart: unless-stopped
 volumes:
  postgres_data:
-  redis_data:
+  copilot_tokens:
  traefik_acme:
  # minio_data:
  # qdrant_data:
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,36 @@
 fastapi>=0.115.0
 uvicorn[standard]>=0.34.0
 gunicorn>=22.0.0
 langchain>=0.3.0
 langchain-openai>=0.3.0
 langchain-litellm>=0.1.0
 litellm>=1.50.0
 pydantic>=2.10.0
 pydantic-settings>=2.7.0
 python-jose[cryptography]>=3.3.0
 stripe>=11.0.0
 boto3>=1.35.0
 slowapi>=0.1.9
 sqlalchemy>=2.0.0
 asyncpg>=0.30.0
 alembic>=1.14.0
 bcrypt>=4.2.0
 python-dotenv>=1.0.0
 httpx>=0.28.0
 websockets>=14.0
 psycopg2-binary>=2.9.0
 pytest>=8.0.0
 pytest-asyncio>=0.24.0
 aiosqlite>=0.20.0
 moto[s3]>=5.0.0
 pinecone>=5.0.0
 qdrant-client>=1.7.0
 croniter>=3.0.0
 google-api-python-client>=2.130.0
 google-auth>=2.29.0
 google-auth-oauthlib>=1.2.0
 google-auth-httplib2>=0.2.0
 msal>=1.28.0
 cryptography>=42.0.0
 langfuse>=2.0.0
 ruff>=0.8.0
--- a/services/auth/.env.example
+++ b/services/auth/.env.example
@@ -1,19 +0,0 @@
 # ── Auth Service ──────────────────────────────────────────────────────────────
 # This file contains env vars specific to the Auth Service.
 # Shared vars (DATABASE_URL, REDIS_URL, etc.) come from the root .env
 # or from docker-compose environment.
 # ── JWT RS256 Keys ────────────────────────────────────────────────────────────
 # Generate keypair:
 #   openssl genpkey -algorithm RSA -out private.pem -pkeyopt rsa_keygen_bits:2048
 #   openssl rsa -in private.pem -pubout -out public.pem
 #
 # Paste PEM content with literal \n for newlines:
 #   JWT_PRIVATE_KEY=-----BEGIN PRIVATE KEY-----\nMIIEvQ...
 #   JWT_PUBLIC_KEY=-----BEGIN PUBLIC KEY-----\nMIIBIj...
 # PRIVATE KEY — used to SIGN JWTs. NEVER share outside this service.
 JWT_PRIVATE_KEY=
 # PUBLIC KEY — used to VERIFY JWTs.
 JWT_PUBLIC_KEY=
--- a/services/auth/Dockerfile
+++ b/services/auth/Dockerfile
@@ -1,36 +0,0 @@
 # ── builder ──────────────────────────────────────────────────────────────────
 FROM python:3.12-slim AS builder
 WORKDIR /build
 # Install shared + service deps in one layer
 COPY services/auth/requirements.txt ./requirements.txt
 RUN pip install --upgrade pip && \
    pip install --no-cache-dir --prefix=/install -r requirements.txt
 # ── runtime ──────────────────────────────────────────────────────────────────
 FROM python:3.12-slim AS runtime
 RUN addgroup --system appgroup && adduser --system --ingroup appgroup appuser
 WORKDIR /app
 COPY --from=builder /install /usr/local
 # Copy shared module (available to all services)
 COPY shared/ shared/
 # Copy service source
 COPY services/auth/app/ app/
 RUN chown -R appuser:appgroup /app
 USER appuser
 EXPOSE 8000
 CMD ["gunicorn", "app.main:app", \
     "-k", "uvicorn.workers.UvicornWorker", \
     "--bind", "0.0.0.0:8000", \
     "--workers", "2", \
     "--timeout", "30"]
--- a/services/auth/README.md
+++ b/services/auth/README.md
@@ -1,16 +0,0 @@
 # Auth Service
 Owns: user registration, login, JWT RS256 issuance, token refresh, `/me` endpoint.
 ## Tables owned
 - `users`
 - `refresh_tokens`
 - `subscriptions` (read; Billing Service writes)
 ## Endpoints
 - `POST /auth/register`
 - `POST /auth/login`
 - `POST /auth/refresh`
 - `GET /auth/me`
 - `PUT /auth/me`
 - `GET /auth/verify` (ForwardAuth for Traefik)
--- a/services/auth/app/config.py
+++ b/services/auth/app/config.py
@@ -1,34 +0,0 @@
 """Auth Service — local configuration.
 Contains secrets that ONLY the Auth Service needs (e.g., JWT private key).
 These are NOT in shared/config.py to prevent other services from accessing them.
 """
 from pydantic import field_validator
 from pydantic_settings import BaseSettings, SettingsConfigDict
 class AuthSettings(BaseSettings):
    """Settings holding the RS256 key pair, loaded from the environment / ``.env``.

    Both keys default to an empty string. PEM values may be supplied on a
    single line with literal ``\\n`` sequences; these are expanded to real
    newlines before validation.
    """
    model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8")
    # PEM-encoded RS256 private key, used to SIGN JWTs.
    # Create one with:
    #   openssl genpkey -algorithm RSA -out private.pem -pkeyopt rsa_keygen_bits:2048
    # and export it with newlines written as \n:
    #   JWT_PRIVATE_KEY="-----BEGIN PRIVATE KEY-----\nMIIEv..."
    JWT_PRIVATE_KEY: str = ""
    # PEM-encoded RS256 public key, used to VERIFY JWTs.
    # Derive it from the private key with:
    #   openssl rsa -in private.pem -pubout -out public.pem
    JWT_PUBLIC_KEY: str = ""
    @field_validator("JWT_PRIVATE_KEY", "JWT_PUBLIC_KEY", mode="before")
    @classmethod
    def _expand_pem_newlines(cls, v: str) -> str:
        """Turn literal ``\\n`` sequences into newlines; pass other values through."""
        if not isinstance(v, str):
            return v
        return v.replace(r"\n", "\n") if r"\n" in v else v
 auth_settings = AuthSettings()
--- a/services/auth/app/main.py
+++ b/services/auth/app/main.py
@@ -1,62 +0,0 @@
 """Auth Service — JWT issuance, user management, ForwardAuth verification.
 Standalone FastAPI service extracted from the adiuva-api monolith.
 Owns: users, refresh_tokens, subscriptions (read).
 """
 import sys
 from contextlib import asynccontextmanager
 from pathlib import Path
 # Ensure the repo root is on sys.path so "shared" is importable.
 # In Docker, COPY shared/ puts it at /app/shared/ (already importable).
 # In local dev, we need to add the repo root (``parents[3]``: three directories above app/).
 _repo_root = str(Path(__file__).resolve().parents[3])
 if _repo_root not in sys.path:
    sys.path.insert(0, _repo_root)
 from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
 from shared.config import settings
@asynccontextmanager
 async def lifespan(app: FastAPI):
    """Application lifespan hook: no startup work; dispose the shared DB engine on shutdown."""
    yield
    # Everything below runs at shutdown. ``engine`` is imported here, not at module import.
    from shared.db import engine
    await engine.dispose()
 def create_app() -> FastAPI:
    """Build and return the Auth Service FastAPI application.

    Configures CORS from ``settings.CORS_ORIGINS``, mounts the auth and
    ForwardAuth-verify routers under ``/api/v1`` and registers a health
    endpoint at ``/api/v1/health``.
    """
    app = FastAPI(
        title="Adiuva Auth Service",
        version="0.1.0",
        # Interactive docs are only served when ENV is "dev".
        docs_url="/docs" if settings.ENV == "dev" else None,
        redoc_url=None,
        lifespan=lifespan,
    )
    app.add_middleware(
        CORSMiddleware,
        allow_origins=settings.CORS_ORIGINS,
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )
    # Routers are imported inside the factory rather than at module import time.
    from app.routes import router
    from app.verify import router as verify_router
    app.include_router(router, prefix="/api/v1")
    app.include_router(verify_router, prefix="/api/v1")
    @app.get("/api/v1/health", tags=["health"])
    async def health() -> dict:
        """Return a static status payload including the app version."""
        return {"status": "ok", "service": "auth", "version": app.version}
    return app
 app = create_app()
--- a/services/auth/app/verify.py
+++ b/services/auth/app/verify.py
@@ -1,66 +0,0 @@
 """ForwardAuth verification endpoint for Traefik.
 Traefik calls GET /api/v1/auth/verify on every request to a protected
 service.  This endpoint validates the JWT from the Authorization header
 and returns identity headers that Traefik injects into downstream requests.
 Downstream services NEVER validate JWTs themselves — they trust the
 X-User-Id, X-User-Email, X-User-Tier headers injected by Traefik.
 """
 from __future__ import annotations
 from fastapi import APIRouter, Request, Response
 from fastapi import status as http_status
 from jose import JWTError, jwt
 from sqlalchemy import select
 from shared.config import settings
 from shared.db import async_session
 from shared.models import Subscription
 from app.config import auth_settings
 router = APIRouter(tags=["auth"])
@router.get("/auth/verify")
 async def verify(request: Request) -> Response:
    """Validate JWT and return identity headers for Traefik ForwardAuth.
    Returns 200 with X-User-* headers on success, 401 on failure.
    Traefik copies response headers to the downstream request.
    """
    auth_header = request.headers.get("Authorization", "")
    if not auth_header.startswith("Bearer "):
        return Response(status_code=http_status.HTTP_401_UNAUTHORIZED)
    token = auth_header[7:]  # strip "Bearer "
    try:
        payload = jwt.decode(
            token, auth_settings.JWT_PUBLIC_KEY, algorithms=["RS256"]
        )
        user_id: str | None = payload.get("sub")
        email: str | None = payload.get("email")
        if not user_id or not email:
            return Response(status_code=http_status.HTTP_401_UNAUTHORIZED)
    except JWTError:
        return Response(status_code=http_status.HTTP_401_UNAUTHORIZED)
    # Live tier lookup from subscriptions table
    async with async_session() as db:
        result = await db.execute(
            select(Subscription.tier).where(Subscription.user_id == user_id)
        )
        default_tier = "power" if settings.ENV == "dev" else "free"
        tier: str = result.scalar_one_or_none() or default_tier
    return Response(
        status_code=http_status.HTTP_200_OK,
        headers={
            "X-User-Id": user_id,
            "X-User-Email": email,
            "X-User-Tier": tier,
        },
    )
--- a/services/auth/requirements.txt
+++ b/services/auth/requirements.txt
@@ -1,11 +0,0 @@
 fastapi>=0.115.0
 uvicorn[standard]>=0.34.0
 gunicorn>=22.0.0
 pydantic>=2.10.0
 pydantic-settings>=2.7.0
 python-jose[cryptography]>=3.3.0
 sqlalchemy>=2.0.0
 asyncpg>=0.30.0
 bcrypt>=4.2.0
 cryptography>=42.0.0
 python-dotenv>=1.0.0
--- a/services/batch-agent/Dockerfile
+++ b/services/batch-agent/Dockerfile
@@ -1,36 +0,0 @@
 # ── builder ──────────────────────────────────────────────────────────────────
 FROM python:3.12-slim AS builder
 WORKDIR /build
 COPY services/batch-agent/requirements.txt ./requirements.txt
 RUN pip install --upgrade pip && \
    pip install --no-cache-dir --prefix=/install -r requirements.txt
 # ── runtime ──────────────────────────────────────────────────────────────────
 FROM python:3.12-slim AS runtime
 RUN addgroup --system appgroup && adduser --system --ingroup appgroup appuser
 WORKDIR /app
 COPY --from=builder /install /usr/local
 # Shared module
 COPY shared/ shared/
 # Service source
 COPY services/batch-agent/app/ app/
 RUN chown -R appuser:appgroup /app
 USER appuser
 EXPOSE 8000
 # Batch runs are long-lived — use a longer timeout than chat (300s vs 120s)
 CMD ["gunicorn", "app.main:app", \
     "-k", "uvicorn.workers.UvicornWorker", \
     "--bind", "0.0.0.0:8000", \
     "--workers", "2", \
     "--timeout", "300"]
--- a/services/batch-agent/README.md
+++ b/services/batch-agent/README.md
@@ -1,23 +0,0 @@
 # Batch Agent Service
 Owns: agent_runner, journey builder, filesystem_agent, integrations (Gmail, MS Graph).
 ## Tables owned
 - `local_agent_configs`
 - `cloud_agent_configs`
 - `agent_run_logs`
 ## Endpoints
 - `GET /agents/catalog`
 - `POST /agents/can-create`
 - `POST /agents/trigger`
 - `GET /agents/{id}/history`
 ## Redis channels
 - Subscribe: `batch:request:{user_id}`
 - Publish: `ws:out:{user_id}` (journey replies + tool calls)
 - BRPOP: `tool:result:{call_id}` (30s timeout)
 - SET+EX: `journey:{user_id}` (session state, TTL 1800s)
 ## TODO
 - [ ] Integrate Langfuse tracing (reuse `services/chat/app/tracing.py` pattern — `trace_span()`, `get_langfuse_callback()`, prompt management). Each batch agent run should create a trace with input/output, link prompts, and pass the LangChain `CallbackHandler` to LLM calls.
--- a/services/batch-agent/app/agents/init.py
+++ b/services/batch-agent/app/agents/init.py
@@ -1 +0,0 @@
 """Batch Agent Service domain agents and filesystem tools."""
--- a/services/batch-agent/app/llm.py
+++ b/services/batch-agent/app/llm.py
@@ -1,76 +0,0 @@
 """LLM factory — centralised model instantiation via LiteLLM.
 Identical to services/chat/app/llm.py. Uses shared.config.settings.
 """
 from __future__ import annotations
 import os
 import warnings
 from openai import AsyncOpenAI
 import litellm
 from langchain_openai import ChatOpenAI
 from langchain_litellm import ChatLiteLLM
 from shared.config import settings
 litellm.drop_params = True
 warnings.filterwarnings(
    "ignore",
    message=r"PydanticSerializationUnexpectedValue\(Expected `ResponseAPIUsage`",
    category=UserWarning,
 )
 def _api_key_for_model(model: str) -> str | None:
    """Pick the API key setting matching *model*'s provider prefix.

    ``github_copilot/`` models always yield ``None``. Models without a
    recognised prefix use ``settings.OPENAI_API_KEY``. An empty setting is
    reported as ``None``.
    """
    if model.startswith("github_copilot/"):
        return None
    # Prefix groups -> name of the settings attribute holding the key.
    # Attributes are looked up lazily so only the matching one is read.
    key_attr_by_prefixes = {
        ("anthropic/",): "ANTHROPIC_API_KEY",
        ("gemini/", "google/"): "GOOGLE_API_KEY",
        ("cerebras/",): "CEREBRAS_API_KEY",
        ("github/",): "GITHUB_TOKEN",
    }
    for prefixes, attr_name in key_attr_by_prefixes.items():
        if model.startswith(prefixes):
            return getattr(settings, attr_name) or None
    return settings.OPENAI_API_KEY or None
 def get_llm(
    *,
    model: str | None = None,
    temperature: float = 0,
    callbacks: list | None = None,
 ) -> ChatOpenAI | ChatLiteLLM:
    """Instantiate a chat model for *model* (default: ``settings.LLM_MODEL``).

    Model names containing ``/`` are handed to ``ChatLiteLLM``; all others
    go to ``ChatOpenAI`` with the key chosen by ``_api_key_for_model``.
    Non-empty GitHub settings are exported to the process environment first,
    without overriding values that are already set.
    """
    model = model or settings.LLM_MODEL
    # The settings attribute and the environment variable share the same name.
    for env_name in ("GITHUB_COPILOT_TOKEN_DIR", "GITHUB_TOKEN"):
        configured = getattr(settings, env_name)
        if configured:
            os.environ.setdefault(env_name, configured)
    if "/" not in model:
        return ChatOpenAI(
            model=model,
            temperature=temperature,
            api_key=_api_key_for_model(model),
            callbacks=callbacks,
        )
    return ChatLiteLLM(model=model, temperature=temperature, callbacks=callbacks)
 async def embed(text: str) -> list[float]:
    """Return the embedding of *text* using ``settings.LLM_EMBED_MODEL``.

    Model names containing ``/`` (which includes every ``github_copilot/``
    model) are embedded through LiteLLM; all others use the OpenAI client
    with ``settings.OPENAI_API_KEY``.
    """
    embed_model = settings.LLM_EMBED_MODEL
    if "/" in embed_model:
        litellm_result = await litellm.aembedding(model=embed_model, input=[text])
        return litellm_result.data[0]["embedding"]
    openai_client = AsyncOpenAI(api_key=settings.OPENAI_API_KEY)
    openai_result = await openai_client.embeddings.create(model=embed_model, input=text)
    return openai_result.data[0].embedding
--- a/services/batch-agent/app/main.py
+++ b/services/batch-agent/app/main.py
@@ -1,79 +0,0 @@
 """Batch Agent Service — FastAPI application.
 Owns: agent_runner (local directory + cloud connectors), journey builder,
 filesystem_agent, integrations (Gmail, MS Graph).
 Communicates with WS Gateway via Redis:
  - Subscribes to  batch:request:{user_id}  (journey_start, journey_message)
  - Publishes to   ws:out:{user_id}         (journey replies + tool calls)
  - BRPOP on       tool:result:{call_id}    (tool-call round-trip, 30s timeout)
  - SET+EX on      journey:{user_id}        (journey session state, TTL 1800s)
 """
 from __future__ import annotations
 import asyncio
 import logging
 import sys
 from pathlib import Path
 # Ensure the repo root is on sys.path so ``shared`` is importable when
 # running locally (in Docker the COPY already places it at /app/shared/).
 _repo_root = str(Path(__file__).resolve().parents[3])
 if _repo_root not in sys.path:
    sys.path.insert(0, _repo_root)
 from contextlib import asynccontextmanager
 from typing import AsyncGenerator
 from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
 from app.redis_consumer import start_consumer
 from app.routes import router
 logger = logging.getLogger(__name__)
@asynccontextmanager
 async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
    """Application lifespan hook for the Batch Agent Service.

    Startup: initialise Langfuse tracing and launch the Redis consumer as a
    background task. Shutdown: cancel and await the consumer, shut down
    tracing, dispose the DB engine and close the Redis client, in that order.
    """
    # Initialise Langfuse tracing (no-op if keys are missing)
    from app.tracing import init_langfuse
    init_langfuse()
    logger.info("batch-agent: starting Redis consumer")
    task = asyncio.create_task(start_consumer())
    yield
    # Everything below runs at application shutdown.
    task.cancel()
    try:
        # Wait for the consumer to finish unwinding before closing shared clients.
        await task
    except asyncio.CancelledError:
        pass
    from app.tracing import shutdown as shutdown_langfuse
    shutdown_langfuse()
    from shared.db import engine
    await engine.dispose()
    from shared.redis import redis_client
    await redis_client.aclose()
    logger.info("batch-agent: Redis consumer stopped")
 # Module-level ASGI application (the Dockerfile CMD points gunicorn at ``app.main:app``).
 app = FastAPI(title="Adiuva Batch Agent Service", lifespan=lifespan)
 # NOTE(review): CORS accepts any origin here, whereas the Auth Service restricts
 # origins to ``settings.CORS_ORIGINS`` — confirm the wildcard is intentional.
 app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["GET", "POST"],
    allow_headers=["*"],
 )
 # Routes are mounted without an extra prefix (the router carries its own).
 app.include_router(router)
@app.get("/health")
 async def health() -> dict[str, str]:
    """Health endpoint: return a static status payload identifying this service."""
    return {"status": "ok", "service": "batch-agent"}
--- a/services/batch-agent/app/redis_consumer.py
+++ b/services/batch-agent/app/redis_consumer.py
@@ -1,183 +0,0 @@
 """Redis consumer for the Batch Agent Service.
 Subscribes to  batch:request:*  (pattern) and dispatches:
  - journey_start   → handle_journey_start
  - journey_message → handle_journey_message
  - agent_trigger   → run_local_agent / run_cloud_agent
 Results are published back to  ws:out:{user_id}  via Redis.
 """
 from __future__ import annotations
 import asyncio
 import json
 import logging
 from typing import Any
 from shared.redis import redis_client, batch_request_channel, ws_out_channel
 import app.tracing as tracing
 from shared.ws_context import set_current_user, clear_current_user
 logger = logging.getLogger(__name__)
 async def _publish_to_user(user_id: str, payload: dict[str, Any]) -> None:
    """JSON-encode *payload* and publish it on the WS outbound channel of *user_id*."""
    await redis_client.publish(ws_out_channel(user_id), json.dumps(payload))
 async def _handle_journey_start(user_id: str, data: dict[str, Any]) -> None:
    """Handle a journey_start request from WS Gateway.

    Runs ``handle_journey_start`` inside a ``journey_start`` trace span and
    publishes its reply to the user's outbound channel. On any exception a
    terminal ``journey_reply`` frame (``done=True``) carrying the error text
    is published instead. The current-user context is set for the duration
    of the call and always cleared afterwards.
    """
    # Imported lazily, inside the handler rather than at module import.
    from app.journey import handle_journey_start
    session_id = data.get("session_id", "")
    set_current_user(user_id)
    try:
        with tracing.trace_span(
            name="journey_start",
            user_id=user_id,
            session_id=session_id,
            input=data.get("directory", ""),
            metadata={"data_types": data.get("data_types", [])},
            tags=["journey"],
        ) as span:
            langfuse_handler = tracing.get_langfuse_callback()
            reply = await handle_journey_start(user_id, data, langfuse_handler=langfuse_handler)
            tracing.link_prompt_to_trace(span, "journey_system")
            # Only the first 500 characters of the reply are recorded on the span.
            span.update(output=reply.get("message", "")[:500])
            await _publish_to_user(user_id, reply)
        tracing.flush()
    except Exception as exc:
        # NOTE(review): the raw exception text is sent to the client below and no
        # traceback is logged — confirm that both are intended.
        logger.error("batch-agent: journey_start failed user=%s: %s", user_id, exc)
        await _publish_to_user(user_id, {
            "type": "journey_reply",
            "session_id": session_id,
            "message": f"Journey setup failed: {exc}",
            "done": True,
            "prompt_template": None,
        })
    finally:
        clear_current_user()
 async def _handle_journey_message(user_id: str, data: dict[str, Any]) -> None:
    """Handle a journey_message from WS Gateway.

    Runs ``handle_journey_message`` inside a ``journey_message`` trace span
    and publishes its reply to the user's outbound channel. On any exception
    a terminal ``journey_reply`` frame (``done=True``) carrying the error
    text is published instead. The current-user context is set for the
    duration of the call and always cleared afterwards.
    """
    # Imported lazily, inside the handler rather than at module import.
    from app.journey import handle_journey_message
    session_id = data.get("session_id", "")
    set_current_user(user_id)
    try:
        with tracing.trace_span(
            name="journey_message",
            user_id=user_id,
            session_id=session_id,
            # Only the first 200 characters of the user message are traced.
            input=data.get("message", "")[:200],
            tags=["journey"],
        ) as span:
            langfuse_handler = tracing.get_langfuse_callback()
            reply = await handle_journey_message(user_id, data, langfuse_handler=langfuse_handler)
            tracing.link_prompt_to_trace(span, "journey_system")
            # Only the first 500 characters of the reply are recorded on the span.
            span.update(output=reply.get("message", "")[:500])
            await _publish_to_user(user_id, reply)
        tracing.flush()
    except Exception as exc:
        # NOTE(review): the raw exception text is sent to the client below and no
        # traceback is logged — confirm that both are intended.
        logger.error("batch-agent: journey_message failed user=%s: %s", user_id, exc)
        await _publish_to_user(user_id, {
            "type": "journey_reply",
            "session_id": session_id,
            "message": f"Journey processing failed: {exc}",
            "done": True,
            "prompt_template": None,
        })
    finally:
        clear_current_user()
 async def _handle_agent_trigger(user_id: str, data: dict[str, Any]) -> None:
    """Handle an agent_trigger request from the REST route (forwarded via Redis).

    Runs ``run_local_agent`` inside an ``agent_trigger`` trace span. On any
    exception a ``run_complete`` frame with ``status="error"`` and the
    original ``run_context`` is published to the user. The current-user
    context is set for the duration of the call and always cleared afterwards.
    """
    # Imported lazily, inside the handler rather than at module import.
    from app.agent_runner import run_local_agent
    run_context = data.get("run_context", {})
    agent_id = run_context.get("agent_id", "")
    set_current_user(user_id)
    try:
        with tracing.trace_span(
            name="agent_trigger",
            user_id=user_id,
            # The run id (when present) is passed as the trace id.
            trace_id=run_context.get("run_id"),
            input={"agent_id": agent_id, "directory": data.get("directory", "")},
            metadata={"data_types": data.get("data_types", [])},
            tags=["batch", "agent_run"],
        ) as span:
            langfuse_handler = tracing.get_langfuse_callback()
            await run_local_agent(user_id, data, langfuse_handler=langfuse_handler)
            tracing.link_prompt_to_trace(span, "batch_processing")
            span.update(output={"status": "completed"})
        tracing.flush()
    except Exception as exc:
        logger.error("batch-agent: agent_trigger failed user=%s: %s", user_id, exc)
        # No success frame is published by this function; presumably
        # run_local_agent reports completion itself — TODO confirm.
        await _publish_to_user(user_id, {
            "type": "run_complete",
            "status": "error",
            "run_context": run_context,
        })
    finally:
        clear_current_user()
 async def _dispatch(user_id: str, message_data: dict[str, Any]) -> None:
    """Send one batch request to the handler selected by its ``type`` field.

    ``device_online`` frames are only logged; unrecognised types produce a
    warning and are otherwise ignored.
    """
    kind = message_data.get("type", "")
    if kind == "journey_start":
        await _handle_journey_start(user_id, message_data)
        return
    if kind == "journey_message":
        await _handle_journey_message(user_id, message_data)
        return
    if kind == "agent_trigger":
        await _handle_agent_trigger(user_id, message_data)
        return
    if kind == "device_online":
        device = message_data.get("device_id", "?")
        logger.info("batch-agent: device_online user=%s device=%s", user_id, device)
        return
    logger.warning("batch-agent: unknown message type %r from user=%s", kind, user_id)
 async def start_consumer() -> None:
    """Subscribe to batch:request:* and dispatch incoming frames.

    Each valid frame is handed to ``_dispatch`` in its own task so that a slow
    handler never blocks the subscription loop. Frames on malformed channel
    names or with invalid JSON payloads are skipped. On cancellation the
    pattern subscription is removed before returning.
    """
    pubsub = redis_client.pubsub()
    await pubsub.psubscribe("batch:request:*")
    logger.info("batch-agent: subscribed to batch:request:*")
    # The event loop keeps only weak references to tasks, so a task nobody
    # else references can be garbage-collected before it finishes. Hold a
    # strong reference here until each dispatch task completes.
    in_flight: set[asyncio.Task[None]] = set()
    try:
        async for message in pubsub.listen():
            if message["type"] != "pmessage":
                continue
            channel: str = message["channel"]
            if isinstance(channel, bytes):
                channel = channel.decode()
            # Extract user_id from channel: batch:request:{user_id}
            parts = channel.split(":", 2)
            if len(parts) < 3:
                continue
            user_id = parts[2]
            raw = message["data"]
            if isinstance(raw, bytes):
                raw = raw.decode()
            try:
                data = json.loads(raw)
            except json.JSONDecodeError:
                logger.warning("batch-agent: invalid JSON on channel %s", channel)
                continue
            # Dispatch in a separate task to avoid blocking the consumer
            dispatch_task = asyncio.create_task(_dispatch(user_id, data))
            in_flight.add(dispatch_task)
            dispatch_task.add_done_callback(in_flight.discard)
    except asyncio.CancelledError:
        logger.info("batch-agent: consumer shutting down")
    finally:
        await pubsub.punsubscribe("batch:request:*")
--- a/services/batch-agent/app/routes.py
+++ b/services/batch-agent/app/routes.py
@@ -1,208 +0,0 @@
 """Agent REST routes — catalog, billing checks, trigger.
 Adapted for Batch Agent Service: uses shared.db, shared.models, shared.schemas.
 Agent trigger dispatches via Redis to the consumer instead of spawning
 an in-process background task.
 """
 from __future__ import annotations
 import json
 import uuid
 from datetime import datetime, timezone
 from fastapi import APIRouter, Header, HTTPException, status
 from sqlalchemy import func, select
 from sqlalchemy.ext.asyncio import AsyncSession
 from shared.db import async_session
 from shared.models import AgentRunLog
 from shared.redis import redis_client, batch_request_channel
 from app.agent_runner import is_agent_running
 router = APIRouter(prefix="/agents", tags=["agents"])
 # ── Tier feature limits ───────────────────────────────────────────────
 # Mirrors app/billing/tier_manager.py FEATURES dict.
 # Per-tier caps read by the helpers and routes in this module:
 #   "batch_active"       — compared against the caller's active-agent count
 #   "batch_runs_per_day" — compared against today's AgentRunLog row count
 # A value of -1 is treated as "no limit" by every reader in this module, and
 # every lookup site falls back to the "free" entry for an unknown tier name.
 FEATURES: dict[str, dict] = {
    "free":  {"batch_active": 1,  "batch_runs_per_day": 3},
    "pro":   {"batch_active": 5,  "batch_runs_per_day": 20},
    "power": {"batch_active": 20, "batch_runs_per_day": 100},
    "team":  {"batch_active": -1, "batch_runs_per_day": -1},
 }
 def _dt_ms(dt: datetime) -> int:
    return int(dt.timestamp() * 1000)
 def _dt_ms_opt(dt: datetime | None) -> int | None:
    return int(dt.timestamp() * 1000) if dt else None
 def _to_data_types(values: list[str]) -> list[str]:
    normalize = {
        "task": "tasks",           "tasks": "tasks",
        "note": "notes",           "notes": "notes",
        "timeline": "timelines",   "timelines": "timelines",   "timelineEvents": "timelines",
        "project": "projects",     "projects": "projects",
    }
    seen: set[str] = set()
    result: list[str] = []
    for v in values:
        mapped = normalize.get(v)
        if mapped and mapped not in seen:
            seen.add(mapped)
            result.append(mapped)
    return result
 def _enforce_agent_limit(tier: str, current_count: int) -> int:
    """Reject the request when the tier's active-agent cap is already reached.

    Returns the tier's ``batch_active`` cap (``-1`` meaning no cap).  Unknown
    tiers use the ``free`` entry.

    Raises:
        HTTPException: 403 when the cap is not ``-1`` and *current_count* is
            at or above it.
    """
    tier_features = FEATURES.get(tier, FEATURES["free"])
    limit: int = tier_features["batch_active"]
    capped = limit != -1
    if capped and current_count >= limit:
        message = f"Agent limit ({limit}) reached for your tier. Upgrade to create more."
        raise HTTPException(status_code=status.HTTP_403_FORBIDDEN, detail=message)
    return limit
 async def _enforce_run_frequency(tier: str, user_id: str) -> None:
    """Reject the request when the user has used up today's batch runs.

    "Today" starts at 00:00 UTC.  The number of ``AgentRunLog`` rows for
    *user_id* started since then is compared with the tier's
    ``batch_runs_per_day`` cap; a cap of ``-1`` skips the check (and the
    query) entirely.  Unknown tiers use the ``free`` entry.

    Raises:
        HTTPException: 402 when the count is at or above the cap.
    """
    daily_cap: int = FEATURES.get(tier, FEATURES["free"])["batch_runs_per_day"]
    if daily_cap == -1:
        return
    midnight_utc = datetime.now(timezone.utc).replace(
        hour=0, minute=0, second=0, microsecond=0
    )
    count_query = select(func.count(AgentRunLog.id)).where(
        AgentRunLog.user_id == user_id,
        AgentRunLog.started_at >= midnight_utc,
    )
    async with async_session() as db:
        outcome = await db.execute(count_query)
        runs_today: int = outcome.scalar_one()
    if runs_today >= daily_cap:
        raise HTTPException(
            status_code=status.HTTP_402_PAYMENT_REQUIRED,
            detail=f"Daily batch run limit ({daily_cap}) reached for your tier.",
        )
 # ── Catalog ───────────────────────────────────────────────────────────
@router.get("/catalog")
async def get_agent_catalog(
    x_user_id: str = Header(..., alias="X-User-Id"),
) -> list[dict]:
    """Return the static catalog of agent types.

    Each entry carries ``type``, ``name`` and ``description``.  The
    ``X-User-Id`` header is required by the signature but the value is not
    used in building the response.
    """
    entries = (
        (
            "local_directory",
            "Local Directory Monitor",
            "Watches local directories, extracts data from files using AI",
        ),
        (
            "gmail",
            "Gmail Connector",
            "Scans Gmail inbox, extracts tasks/notes from emails",
        ),
        (
            "teams",
            "Microsoft Teams Connector",
            "Monitors Teams messages, extracts action items",
        ),
        (
            "outlook",
            "Outlook Connector",
            "Scans Outlook inbox, extracts tasks/notes",
        ),
    )
    return [
        {"type": kind, "name": title, "description": summary}
        for kind, title, summary in entries
    ]
 # ── Can-create check ─────────────────────────────────────────────────
@router.post("/can-create")
async def can_create_agent(
    body: dict,
    x_user_id: str = Header(..., alias="X-User-Id"),
    x_user_tier: str = Header("free", alias="X-User-Tier"),
) -> dict:
    """Report whether the caller's tier allows one more active agent.

    ``active_agents`` is read from the request body (default 0) and compared
    with the tier's ``batch_active`` cap; a cap of ``-1`` always allows.
    Unknown tiers use the ``free`` entry.  Nothing is raised or persisted —
    the answer is returned alongside the inputs it was derived from.
    """
    tier_features = FEATURES.get(x_user_tier, FEATURES["free"])
    limit: int = tier_features["batch_active"]
    active_agents = body.get("active_agents", 0)
    # Only compare against the cap when one exists.
    allowed = True if limit == -1 else active_agents < limit
    return {
        "allowed": allowed,
        "tier": x_user_tier,
        "active_agents": active_agents,
        "limit": limit,
    }
 # ── Trigger ──────────────────────────────────────────────────────────
@router.post("/trigger", status_code=status.HTTP_202_ACCEPTED)
 async def trigger_agent_run(
    body: dict,
    x_user_id: str = Header(..., alias="X-User-Id"),
    x_user_tier: str = Header("free", alias="X-User-Tier"),
 ) -> dict:
    """Trigger a local agent run — creates run log and dispatches via Redis.

    Steps, in order:
      1. Enforce the tier's active-agent cap (403) and daily run cap (402).
      2. Refuse with 409 if ``is_agent_running`` reports the agent as running.
      3. Insert an ``AgentRunLog`` row with status ``"running"``.
      4. Publish an ``agent_trigger`` message on the user's batch-request
         channel.
      5. Return a run summary with zeroed counters.

    Body keys read: ``active_agents``, ``agent_id``, ``directory``,
    ``what_to_extract``, ``file_extensions``, ``custom_agent_prompt``,
    ``device_id``.  All are optional.
    """
    # The active-agent count is whatever the request body reports.
    active_agents = body.get("active_agents", 0)
    _enforce_agent_limit(x_user_tier, active_agents)
    await _enforce_run_frequency(x_user_tier, x_user_id)
    # A missing or empty agent_id gets a fresh random UUID.
    stable_agent_id = body.get("agent_id") or str(uuid.uuid4())
    # NOTE(review): is_agent_running comes from app.agent_runner; presumably
    # it only knows about runs in this process — confirm.
    if is_agent_running(stable_agent_id):
        raise HTTPException(
            status_code=status.HTTP_409_CONFLICT,
            detail="Agent is already running.",
        )
    # Create run log in DB
    async with async_session() as db:
        run_log = AgentRunLog(
            agent_id=stable_agent_id,
            agent_type="local",
            user_id=x_user_id,
            status="running",
        )
        db.add(run_log)
        await db.commit()
        # Reload the row so generated columns (id, started_at) are populated.
        await db.refresh(run_log)
        run_log_id = run_log.id
    run_context = {
        "type": "agent_batch",
        "run_id": run_log_id,
        "agent_id": stable_agent_id,
    }
    # Dispatch to the Redis consumer for processing
    trigger_data = {
        "type": "agent_trigger",
        "directory": body.get("directory", ""),
        # Single directory wrapped in a list; empty list when none was given.
        "directory_paths": [body.get("directory", "")] if body.get("directory") else [],
        "data_types": _to_data_types(body.get("what_to_extract", [])),
        "file_extensions": body.get("file_extensions", []),
        "prompt_template": body.get("custom_agent_prompt", ""),
        "device_id": body.get("device_id", ""),
        "run_context": run_context,
    }
    channel = batch_request_channel(x_user_id)
    # NOTE(review): the row above is already committed as "running"; nothing
    # here resets it if this publish raises.
    # NOTE(review): json.dumps needs run_log_id to be JSON-serialisable —
    # assumes an int or str primary key; confirm against the model.
    await redis_client.publish(channel, json.dumps(trigger_data))
    return {
        "id": run_log_id,
        "agent_id": stable_agent_id,
        "agent_type": "local",
        "status": "running",
        "items_processed": 0,
        "items_created": 0,
        "errors": [],
        # Read after the session block has exited; assumes refresh() loaded a
        # non-null started_at — TODO confirm the column has a default.
        "started_at": _dt_ms(run_log.started_at),
        "completed_at": None,
    }
--- a/services/batch-agent/app/tracing.py
+++ b/services/batch-agent/app/tracing.py
@@ -1,336 +0,0 @@
 """Langfuse tracing & prompt management for the Batch Agent Service (v4 SDK).
 Provides:
 - ``init_langfuse()`` — initialise the singleton client at startup
 - ``trace_span()`` — context manager that creates a trace + span
 - ``get_langfuse_callback()`` — LangChain callback handler (auto-inherits trace)
 - ``get_prompt()`` — fetch a managed prompt from Langfuse by name
 - ``flush()`` / ``shutdown()`` — lifecycle management
 All functions gracefully degrade to no-ops when Langfuse is not configured,
 so the service works identically with or without observability keys.
 Requires ``langfuse >= 3.0.0`` (v4 / "Fast Preview" SDK).
 """
 from __future__ import annotations
 import logging
 from contextlib import contextmanager
 from typing import Any
 from shared.config import settings
 logger = logging.getLogger(__name__)
 # ── State ────────────────────────────────────────────────────────────────
 # Set to True by init_langfuse() once the Langfuse client was constructed.
 _initialised: bool = False
 # Set to True by init_langfuse() when keys are missing or construction
 # failed.  shutdown() resets both flags to False.
 _disabled: bool = False
 def _is_configured() -> bool:
    """Return True only when both Langfuse API keys are set and non-empty."""
    if not settings.LANGFUSE_SECRET_KEY:
        return False
    return bool(settings.LANGFUSE_PUBLIC_KEY)
 def init_langfuse() -> None:
    """Initialise the Langfuse singleton. Call once at startup.

    Idempotent: returns immediately once either module flag is set.  When the
    keys are missing, or constructing the client raises, tracing is marked as
    disabled instead of propagating the error.
    """
    global _initialised, _disabled
    already_decided = _initialised or _disabled
    if already_decided:
        return
    if _is_configured():
        try:
            # Imported lazily so the service still starts without the package.
            from langfuse import Langfuse
            Langfuse(
                secret_key=settings.LANGFUSE_SECRET_KEY,
                public_key=settings.LANGFUSE_PUBLIC_KEY,
                host=settings.LANGFUSE_HOST,
            )
            _initialised = True
            logger.info("tracing: Langfuse client initialised (host=%s)", settings.LANGFUSE_HOST)
        except Exception as exc:
            _disabled = True
            logger.warning("tracing: failed to initialise Langfuse: %s", exc)
        return
    _disabled = True
    logger.info("tracing: Langfuse keys not set — tracing disabled")
 def _get_client() -> Any | None:
    """Return the singleton Langfuse client, or *None* if disabled.

    Triggers ``init_langfuse()`` on first use.  Any failure to import or
    obtain the client also yields *None*.
    """
    if not _disabled and not _initialised:
        init_langfuse()
    # Covers both "was already disabled" and "init just disabled it".
    if _disabled:
        return None
    try:
        from langfuse import get_client
        client = get_client()
    except Exception:
        return None
    return client
 # ── Null span (no-op when Langfuse is disabled) ─────────────────────────
 class _NullSpan:
    """Drop-in replacement when Langfuse is disabled."""
    def update(self, **_: Any) -> None: ...
    def set_trace_io(self, **_: Any) -> None: ...
    def score_trace(self, **_: Any) -> None: ...
 # ── Trace context manager ───────────────────────────────────────────────
@contextmanager
def trace_span(
    *,
    name: str,
    user_id: str,
    session_id: str | None = None,
    trace_id: str | None = None,
    input: Any = None,
    metadata: dict[str, Any] | None = None,
    tags: list[str] | None = None,
):
    """Context manager that creates a Langfuse trace/span.

    Yields the span object (or a ``_NullSpan`` if Langfuse is disabled or the
    span could not be set up).  A ``CallbackHandler`` created inside this
    block auto-inherits the trace context, so there is no need to pass trace
    IDs manually.

    Only *setup* failures are absorbed.  An exception raised by the caller's
    ``with`` body propagates unchanged: the generator yields exactly once, so
    ``contextmanager`` never sees a second ``yield`` after an exception is
    thrown in (which would surface as ``RuntimeError`` and hide the real
    error).

    Args:
        name: Span name.
        user_id: Propagated as a trace attribute.
        session_id: Propagated as a trace attribute when given.
        trace_id: Optional seed; turned into a trace id with
            ``Langfuse.create_trace_id``.
        input: Recorded as the span input.
        metadata: Recorded as span metadata (``{}`` when omitted).
        tags: Propagated as trace tags (``[]`` when omitted).
    """
    # Local import: the module only imports ``contextmanager`` from contextlib.
    from contextlib import ExitStack

    lf = _get_client()
    if lf is None:
        yield _NullSpan()
        return
    with ExitStack() as stack:
        try:
            from langfuse import Langfuse, propagate_attributes
            trace_ctx: dict[str, str] = {}
            if trace_id is not None:
                trace_ctx["trace_id"] = Langfuse.create_trace_id(seed=trace_id)
            span = stack.enter_context(
                lf.start_as_current_observation(
                    as_type="span",
                    name=name,
                    input=input,
                    metadata=metadata or {},
                    **({"trace_context": trace_ctx} if trace_ctx else {}),
                )
            )
            stack.enter_context(
                propagate_attributes(
                    user_id=user_id,
                    session_id=session_id,
                    tags=tags or [],
                )
            )
        except Exception as exc:
            logger.warning("tracing: trace_span(%s) failed: %s", name, exc)
            # Unwind whatever was entered before the failure.
            stack.close()
            span = _NullSpan()
        # Deliberately outside the try/except: errors from the caller's body
        # travel through the entered Langfuse contexts and out to the caller.
        yield span
 # ── LangChain callback handler ──────────────────────────────────────────
 def get_langfuse_callback() -> Any | None:
    """Return a LangChain ``CallbackHandler`` that auto-inherits the current trace.

    Must be called inside a ``trace_span()`` block for proper linking.
    Returns *None* when Langfuse is disabled, not configured, or the handler
    cannot be constructed.

    Availability is decided through ``_get_client()``, the same gate every
    other helper in this module uses, so a call made before
    ``init_langfuse()`` triggers lazy initialisation instead of building a
    handler while the keys are unset.
    """
    if _get_client() is None:
        return None
    try:
        from langfuse.langchain import CallbackHandler
        return CallbackHandler()
    except Exception as exc:
        logger.warning("tracing: get_langfuse_callback failed: %s", exc)
        return None
 # ── Prompt management ────────────────────────────────────────────────────
 def get_prompt(
    name: str,
    *,
    version: int | None = None,
    label: str | None = None,
    fallback: str | None = None,
    cache_ttl_seconds: int = 300,
 ) -> str | None:
    """Fetch the raw text of a managed Langfuse prompt (no variable compilation).

    ``version`` and ``label`` are forwarded only when given.  Returns
    *fallback* when Langfuse is disabled or the lookup raises.
    """
    client = _get_client()
    if client is None:
        return fallback
    request: dict[str, Any] = {"name": name, "cache_ttl_seconds": cache_ttl_seconds}
    selectors = (("version", version), ("label", label))
    request.update((key, val) for key, val in selectors if val is not None)
    try:
        managed = client.get_prompt(**request)
        return managed.prompt
    except Exception as exc:
        logger.warning("tracing: get_prompt(%s) failed: %s", name, exc)
        return fallback
 def compile_prompt(
    name: str,
    *,
    fallback: str,
    variables: dict[str, str],
    version: int | None = None,
    label: str | None = None,
    cache_ttl_seconds: int = 300,
 ) -> str:
    """Fetch a managed Langfuse prompt and fill in its variables.

    Two placeholder syntaxes are in play:
      - a prompt found in Langfuse is rendered with the SDK's
        ``.compile(**variables)``, which replaces ``{{key}}`` placeholders;
      - when Langfuse is disabled, or fetching/compiling raises, the
        hardcoded *fallback* is rendered with ``str.format(**variables)``,
        i.e. Python ``{key}`` placeholders.

    ``version`` and ``label`` are forwarded only when given.
    """
    client = _get_client()
    if client is None:
        return fallback.format(**variables)
    request: dict[str, Any] = {"name": name, "cache_ttl_seconds": cache_ttl_seconds}
    selectors = (("version", version), ("label", label))
    request.update((key, val) for key, val in selectors if val is not None)
    try:
        managed = client.get_prompt(**request)
        return managed.compile(**variables)
    except Exception as exc:
        logger.warning("tracing: compile_prompt(%s) failed, using fallback: %s", name, exc)
        return fallback.format(**variables)
 def get_prompt_object(
    name: str,
    *,
    version: int | None = None,
    label: str | None = None,
    cache_ttl_seconds: int = 300,
 ) -> Any | None:
    """Fetch the Langfuse prompt *object* itself rather than its text.

    Intended for passing to ``start_observation(prompt=...)`` so the prompt
    is linked to a trace in the Langfuse UI.  ``version`` and ``label`` are
    forwarded only when given.  Returns ``None`` when Langfuse is disabled or
    the lookup raises.
    """
    client = _get_client()
    if client is None:
        return None
    request: dict[str, Any] = {"name": name, "cache_ttl_seconds": cache_ttl_seconds}
    selectors = (("version", version), ("label", label))
    request.update((key, val) for key, val in selectors if val is not None)
    try:
        return client.get_prompt(**request)
    except Exception as exc:
        logger.warning("tracing: get_prompt_object(%s) failed: %s", name, exc)
        return None
 def link_prompt_to_trace(
    span: Any,
    prompt_name: str,
    *,
    version: int | None = None,
    label: str | None = None,
 ) -> None:
    """Attach a managed Langfuse prompt to *span*.

    The prompt object is fetched with ``get_prompt_object`` and handed to
    ``span.update(prompt=...)`` so the prompt version shows up linked in the
    Langfuse UI.  Does nothing when Langfuse is disabled, when *span* is a
    ``_NullSpan``, or when the prompt cannot be fetched; failures are logged,
    never raised.
    """
    if _get_client() is None:
        return
    if isinstance(span, _NullSpan):
        return
    try:
        managed = get_prompt_object(prompt_name, version=version, label=label)
        if managed is None:
            return
        span.update(prompt=managed)
    except Exception as exc:
        logger.warning("tracing: link_prompt_to_trace(%s) failed: %s", prompt_name, exc)
 # ── Scoring helper ───────────────────────────────────────────────────────
 def score_trace(
    trace_id: str,
    name: str,
    value: float,
    *,
    comment: str | None = None,
 ) -> None:
    """Record a named numeric score against a trace.

    Forwards the arguments to the client's ``create_score``.  A no-op when
    Langfuse is disabled; failures are logged, never raised.
    """
    client = _get_client()
    if client is None:
        return
    score = {"trace_id": trace_id, "name": name, "value": value, "comment": comment}
    try:
        client.create_score(**score)
    except Exception as exc:
        logger.warning("tracing: score_trace failed: %s", exc)
 # ── Shutdown ─────────────────────────────────────────────────────────────
 def flush() -> None:
    """Send any buffered Langfuse events now.

    A no-op when Langfuse is disabled; failures are logged, never raised.
    """
    client = _get_client()
    if client is None:
        return
    try:
        client.flush()
    except Exception as exc:
        logger.warning("tracing: flush failed: %s", exc)
 def shutdown() -> None:
    """Flush and close the Langfuse client, then reset the module state.

    The client is only touched when ``init_langfuse()`` actually created one.
    Looking it up unconditionally would lazily construct a brand-new client
    at shutdown just to flush and close it.

    Both state flags are cleared afterwards, so a later call may initialise
    tracing again.  Failures are logged, never raised.
    """
    global _initialised, _disabled
    if _initialised:
        lf = _get_client()
        if lf is not None:
            try:
                lf.flush()
                lf.shutdown()
            except Exception as exc:
                logger.warning("tracing: shutdown failed: %s", exc)
    _initialised = False
    _disabled = False
--- a/services/batch-agent/eval/init.py
+++ b/services/batch-agent/eval/init.py
@@ -1 +0,0 @@
 """Batch Agent E2E evaluation harness."""
--- a/services/batch-agent/eval/main.py
+++ b/services/batch-agent/eval/main.py
@@ -1,5 +0,0 @@
 """Allow running the eval package as ``python -m eval``."""
 from eval.cli import main
 main()
--- a/services/batch-agent/eval/cli.py
+++ b/services/batch-agent/eval/cli.py
@@ -1,285 +0,0 @@
 """CLI entry point for the batch agent evaluation harness.
 Usage::
    # From services/batch-agent/:
    python -m eval run                                # all agent fixtures, default model
    python -m eval run --fixture=classify-invoices    # single fixture
    python -m eval run --models=gpt-4o,gpt-5.3-codex  # multiple models
    python -m eval run --mode=step1                   # only step1 fixtures
    python -m eval run --no-judge                     # skip LLM judge scoring
    python -m eval interactive                        # interactive journey session
    python -m eval interactive --fixture=journey-invoice-setup
    python -m eval interactive --model=gpt-4o
    python -m eval interactive --judge-model=github_copilot/gpt-4o-mini
    python -m eval list                               # list all fixtures
    python -m eval sync                               # sync fixtures to Langfuse datasets
 """
 from __future__ import annotations
 import argparse
 import asyncio
 import logging
 import sys
 from pathlib import Path
 # Ensure the service root and repo root are in sys.path.
 # Service root must come BEFORE repo root so its ``app/`` package
 # shadows the monolith ``app/`` in the repo root.
 # This file lives in <service root>/eval/, so two parents up is the service
 # root and two more is the repository root.
 _SERVICE_ROOT = Path(__file__).resolve().parent.parent
 _REPO_ROOT = _SERVICE_ROOT.parent.parent
 _sr = str(_SERVICE_ROOT)
 _rr = str(_REPO_ROOT)
 # Repo root goes in first (only if absent) ...
 if _rr not in sys.path:
    sys.path.insert(0, _rr)
 # Always force service root to position 0 (python -m may have already
 # added CWD further down the list, which loses to repo root).
 if _sr in sys.path:
    sys.path.remove(_sr)
 # ... and the service root is inserted last so it ends up ahead of it.
 sys.path.insert(0, _sr)
 from eval.config import discover_fixtures, discover_journey_fixtures
 from eval.runner import run_fixture_eval, print_results
 from eval.interactive import run_interactive
 from eval import langfuse_eval
 def _setup_logging(verbose: bool) -> None:
    level = logging.DEBUG if verbose else logging.INFO
    logging.basicConfig(
        level=level,
        format="%(asctime)s %(name)-20s %(levelname)-5s %(message)s",
        datefmt="%H:%M:%S",
    )
    # Quiet noisy libraries
    for name in ("httpx", "httpcore", "openai", "litellm", "urllib3"):
        logging.getLogger(name).setLevel(logging.WARNING)
 def _parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        description="Batch Agent E2E evaluation harness",
        prog="python -m eval",
    )
    sub = parser.add_subparsers(dest="command", required=True)
    # ── run ───────────────────────────────────────────────────────
    run_cmd = sub.add_parser("run", help="Run evaluations")
    run_cmd.add_argument(
        "--fixture", "-f",
        help="Run only the named fixture (default: all)",
    )
    run_cmd.add_argument(
        "--models", "-m",
        default="github_copilot/gpt-5.3-codex",
        help="Comma-separated list of models to test (default: github_copilot/gpt-5.3-codex)",
    )
    run_cmd.add_argument(
        "--mode",
        default=None,
        choices=["step1", "step2", "full"],
        help="Only run fixtures with this mode (default: all)",
    )
    run_cmd.add_argument(
        "--no-judge",
        action="store_true",
        help="Skip LLM-as-judge scoring",
    )
    run_cmd.add_argument(
        "--judge-model",
        default="gpt-4o",
        help="Model for LLM judge (default: gpt-4o)",
    )
    run_cmd.add_argument(
        "--fixtures-dir",
        default=None,
        help="Path to fixtures directory (default: eval/fixtures/)",
    )
    run_cmd.add_argument("-v", "--verbose", action="store_true")
    # ── list ──────────────────────────────────────────────────────
    list_cmd = sub.add_parser("list", help="List available fixtures")
    list_cmd.add_argument("--fixtures-dir", default=None)
    list_cmd.add_argument("-v", "--verbose", action="store_true")
    # ── sync ──────────────────────────────────────────────────────
    sync_cmd = sub.add_parser("sync", help="Sync fixtures to Langfuse datasets")
    sync_cmd.add_argument("--fixture", "-f", default=None, help="Sync only the named fixture")
    sync_cmd.add_argument("--fixtures-dir", default=None)
    sync_cmd.add_argument("-v", "--verbose", action="store_true")
    # ── interactive ───────────────────────────────────────────────
    inter_cmd = sub.add_parser("interactive", help="Interactive journey session (human-in-the-loop)")
    inter_cmd.add_argument(
        "--fixture", "-f",
        help="Journey fixture to use (default: pick interactively)",
    )
    inter_cmd.add_argument(
        "--model", "-m",
        default="github_copilot/gpt-5.3-codex",
        help="Model for the journey AI (default: github_copilot/gpt-5.3-codex)",
    )
    inter_cmd.add_argument(
        "--judge-model",
        default="gpt-4o",
        help="Model for LLM judge (default: gpt-4o)",
    )
    inter_cmd.add_argument(
        "--fixtures-dir",
        default=None,
        help="Path to fixtures directory (default: eval/fixtures/)",
    )
    inter_cmd.add_argument(
        "--data-dir",
        default=None,
        help="Override sample data directory (e.g. path to private test files not in git)",
    )
    inter_cmd.add_argument("-v", "--verbose", action="store_true")
    return parser.parse_args()
 def _fixtures_dir(arg: str | None) -> Path | None:
    if arg:
        return Path(arg)
    return None
 async def _cmd_run(args: argparse.Namespace) -> None:
    """Run the selected agent fixtures against each requested model.

    Fixtures are narrowed first by ``--fixture`` (exact name) and then by
    ``--mode``.  They are evaluated one after another and all results are
    printed together at the end.  A missing fixture set or an unknown fixture
    name prints a message and returns without running anything.
    """
    available = discover_fixtures(_fixtures_dir(args.fixtures_dir))
    if not available:
        print("No fixtures found. Create YAML files in eval/fixtures/.")
        return
    if args.fixture:
        available = [fx for fx in available if fx.name == args.fixture]
        if not available:
            print(f"Fixture '{args.fixture}' not found.")
            return
    model_names = [part.strip() for part in args.models.split(",")]
    # An unset --mode keeps every fixture.
    selected = [fx for fx in available if not args.mode or fx.mode == args.mode]
    collected = []
    for fx in selected:
        collected.extend(
            await run_fixture_eval(
                fx,
                models=model_names,
                use_llm_judge=not args.no_judge,
                judge_model=args.judge_model,
            )
        )
    print_results(collected)
 def _cmd_list(args: argparse.Namespace) -> None:
    """Print a table of agent fixtures and a table of journey fixtures.

    Each table is printed only when it has rows.  When neither kind of
    fixture exists a single "No fixtures found." line is printed instead.
    """
    search_dir = _fixtures_dir(args.fixtures_dir)
    agent_fixtures = discover_fixtures(search_dir)
    journey_fixtures = discover_journey_fixtures(search_dir)
    if not (agent_fixtures or journey_fixtures):
        print("No fixtures found.")
        return
    rule = "-" * 90
    if agent_fixtures:
        print("\n[Agent Fixtures]")
        print(f"{'Name':<30} {'Mode':<6} {'Types':<25} {'Expected'}")
        print(rule)
        for fx in agent_fixtures:
            joined_types = ", ".join(fx.data_types)
            # Expected records and expected classifications are counted together.
            expected_total = len(fx.expected) + len(fx.expected_classification)
            print(f"{fx.name:<30} {fx.mode:<6} {joined_types:<25} {expected_total}")
    if journey_fixtures:
        print("\n[Journey Fixtures]")
        print(f"{'Name':<30} {'Types':<25} {'Messages':<10} {'Criteria'}")
        print(rule)
        for fx in journey_fixtures:
            joined_types = ", ".join(fx.data_types)
            message_count = len(fx.user_messages)
            criteria_count = len(fx.expected_template_criteria)
            print(f"{fx.name:<30} {joined_types:<25} {message_count:<10} {criteria_count}")
    print()
 def _cmd_sync(args: argparse.Namespace) -> None:
    """Push fixtures to Langfuse datasets and report each outcome.

    Agent fixtures are synced first, then journey fixtures; ``--fixture``
    narrows both lists by exact name.  A falsy dataset name returned by the
    sync helper is reported as "Skipped".
    """
    search_dir = _fixtures_dir(args.fixtures_dir)
    agent_fixtures = discover_fixtures(search_dir)
    journey_fixtures = discover_journey_fixtures(search_dir)
    wanted = args.fixture
    if wanted:
        agent_fixtures = [fx for fx in agent_fixtures if fx.name == wanted]
        journey_fixtures = [fx for fx in journey_fixtures if fx.name == wanted]
    if not (agent_fixtures or journey_fixtures):
        print("No fixtures to sync.")
        return

    def _report(fixture_name: str, dataset_name) -> None:
        if dataset_name:
            print(f"Synced: {fixture_name} → {dataset_name}")
        else:
            print(f"Skipped: {fixture_name} (Langfuse not configured)")

    for fx in agent_fixtures:
        _report(fx.name, langfuse_eval.sync_fixture_to_dataset(fx))
    for fx in journey_fixtures:
        _report(fx.name, langfuse_eval.sync_journey_fixture_to_dataset(fx))
 async def _cmd_interactive(args: argparse.Namespace) -> None:
    """Start an interactive journey session for one journey fixture.

    The fixture is chosen by ``--fixture`` (exact name), automatically when
    only one exists, or by prompting for a 1-based number from a printed
    menu.  Anything that is not a number between 1 and the menu length is
    rejected with "Invalid choice.".  That includes ``0`` and negative
    numbers, which would otherwise be taken as Python indexes counting from
    the end of the list and silently select a fixture.
    """
    journey_fixtures = discover_journey_fixtures(_fixtures_dir(args.fixtures_dir))
    if not journey_fixtures:
        print("No journey fixtures found. Create YAML files with type: journey in eval/fixtures/.")
        return
    if args.fixture:
        fixtures = [f for f in journey_fixtures if f.name == args.fixture]
        if not fixtures:
            print(f"Journey fixture '{args.fixture}' not found.")
            return
        fixture = fixtures[0]
    elif len(journey_fixtures) == 1:
        fixture = journey_fixtures[0]
    else:
        # Let user pick
        print("\nAvailable journey fixtures:")
        for i, f in enumerate(journey_fixtures, 1):
            print(f"  {i}. {f.name} — {f.description[:60]}")
        print()
        try:
            number = int(input("Pick a fixture number: ").strip())
        except (ValueError, EOFError, KeyboardInterrupt):
            print("Invalid choice.")
            return
        # Explicit range check: list indexing alone would accept 0 and
        # negative numbers by wrapping around.
        if not 1 <= number <= len(journey_fixtures):
            print("Invalid choice.")
            return
        fixture = journey_fixtures[number - 1]
    await run_interactive(
        fixture,
        model=args.model,
        judge_model=args.judge_model,
        data_dir=Path(args.data_dir).resolve() if args.data_dir else None,
    )
 def main() -> None:
    """CLI entry point: parse arguments, set up logging, run the sub-command.

    ``run`` and ``interactive`` are coroutines and are driven with
    ``asyncio.run``; ``list`` and ``sync`` are called directly.
    """
    args = _parse_args()
    _setup_logging(args.verbose)
    async_commands = {"run": _cmd_run, "interactive": _cmd_interactive}
    plain_commands = {"list": _cmd_list, "sync": _cmd_sync}
    command = args.command
    if command in async_commands:
        asyncio.run(async_commands[command](args))
    elif command in plain_commands:
        plain_commands[command](args)
 if __name__ == "__main__":
    main()
--- a/services/batch-agent/eval/config.py
+++ b/services/batch-agent/eval/config.py
@@ -1,220 +0,0 @@
 """Eval configuration — YAML fixture loader and dataclasses.
 Fixtures come in two families:
 1. **Agent fixtures** — test the batch agent pipeline.
   Three modes controlled by ``mode``:
   ``step1``  — classification prompt only.
   ``step2``  — processing prompt only.
   ``full``   — both steps in sequence.
 2. **Journey fixtures** — test the prompt-template builder conversation
   (unchanged).
 """
 from __future__ import annotations
 import logging
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Any, Literal
 import yaml
 logger = logging.getLogger(__name__)
 EvalMode = Literal["step1", "step2", "full"]
@dataclass
 class ExpectedRecord:
    """A single expected extraction result.
    Only the fields specified are checked — unspecified fields are ignored.
    ``EvalFixture.from_yaml`` builds one instance per entry of the fixture's
    ``expected`` mapping, using the mapping key as ``table`` and the entry
    itself as ``fields``.
    """
    table: str  # tasks | notes | timelines | projects
    fields: dict[str, Any]  # field_name → expected_value
@dataclass
 class ExpectedClassification:
    """Expected output of step-1 classification for one file.
    ``EvalFixture.from_yaml`` builds one instance per item of the fixture's
    ``expected_classification`` list; ``file`` and ``project_id`` are
    required keys there, ``domains`` defaults to an empty list.
    """
    file: str  # relative path to the sample file
    project_id: str  # expected matched project id, or "new"
    domains: list[str]  # expected domain list
    new_project_name: str | None = None  # None when the YAML item omits it
@dataclass
class EvalFixture:
    """A complete test scenario loaded from YAML.

    ``mode`` determines which pipeline steps are exercised:
    - **step1**: only ``_classify_file``
    - **step2**: only the processing LLM + tool loop
    - **full**: both steps in sequence (``run_local_agent``)
    """
    name: str
    description: str
    mode: EvalMode
    directory: str  # relative path to sample files
    data_types: list[str]
    file_extensions: list[str]
    models: list[str]  # if empty, use CLI default
    fixture_path: Path = field(default_factory=lambda: Path("."))
    # ── Step-1 inputs (classification) ───────────────────────────
    domain_definitions: str = ""
    projects_list: list[dict[str, Any]] = field(default_factory=list)
    custom_step1_prompt: str = ""
    # ── Step-2 inputs (processing) ───────────────────────────────
    existing_context: str = ""
    project_context: str = ""
    custom_prompt_section: str = ""
    # ── Seed records for mock executor ───────────────────────────
    seed_records: dict[str, list[dict]] = field(default_factory=dict)
    # ── Expected outputs ─────────────────────────────────────────
    expected_classification: list[ExpectedClassification] = field(default_factory=list)
    expected: list[ExpectedRecord] = field(default_factory=list)

    @property
    def fixture_dir(self) -> Path:
        """Path to the sample files directory.

        ``directory`` is resolved against the folder containing the fixture
        YAML; the result is absolute only if ``fixture_path`` is.
        """
        return self.fixture_path.parent / self.directory

    @classmethod
    def from_yaml(cls, path: Path) -> "EvalFixture":
        """Load a fixture from a YAML file.

        ``name`` is the only required top-level key; every other key falls
        back to the default shown below.  ``expected`` is a mapping of table
        name to a list of records; ``expected_classification`` is a list of
        items that must each carry ``file`` and ``project_id``.

        Raises:
            KeyError: if ``name`` (or ``file``/``project_id`` of a
                classification item) is missing.
        """
        raw = yaml.safe_load(path.read_text(encoding="utf-8"))
        mode: EvalMode = raw.get("mode", "full")
        # Parse expected records (step2/full)
        expected: list[ExpectedRecord] = []
        for table, records in (raw.get("expected") or {}).items():
            for rec in records:
                expected.append(ExpectedRecord(table=table, fields=rec))
        # Parse expected classification (step1/full)
        expected_classification: list[ExpectedClassification] = []
        for item in raw.get("expected_classification") or []:
            expected_classification.append(ExpectedClassification(
                file=item["file"],
                project_id=item["project_id"],
                domains=item.get("domains", []),
                new_project_name=item.get("new_project_name"),
            ))
        return cls(
            name=raw["name"],
            description=raw.get("description", ""),
            mode=mode,
            directory=raw.get("directory", "sample_files"),
            data_types=raw.get("data_types", ["tasks"]),
            file_extensions=raw.get("file_extensions", []),
            models=raw.get("models", []),
            fixture_path=path,
            # Step-1 inputs
            domain_definitions=raw.get("domain_definitions", ""),
            projects_list=raw.get("projects_list", []),
            # Read like every other declared input; leaving it out meant the
            # field could never be set from a fixture file.
            custom_step1_prompt=raw.get("custom_step1_prompt", ""),
            # Step-2 inputs
            existing_context=raw.get("existing_context", ""),
            project_context=raw.get("project_context", ""),
            custom_prompt_section=raw.get("custom_prompt_section", ""),
            # Shared
            seed_records=raw.get("seed_records", {}),
            expected_classification=expected_classification,
            expected=expected,
        )
 def discover_fixtures(fixtures_dir: Path | None = None) -> list[EvalFixture]:
    """Load every agent fixture (``*.yaml``) from the fixtures directory.

    The directory defaults to ``fixtures/`` next to this module.  Files are
    visited in sorted order; those whose top-level ``type`` is ``journey``
    are skipped.  A file that fails to load is logged as an error and does
    not stop the scan.  A missing directory yields an empty list.
    """
    root = Path(__file__).parent / "fixtures" if fixtures_dir is None else fixtures_dir
    loaded: list[EvalFixture] = []
    if not root.is_dir():
        logger.warning("eval: fixtures directory not found: %s", root)
        return loaded
    for candidate in sorted(root.glob("*.yaml")):
        try:
            # First parse only peeks at the type; from_yaml parses again.
            header = yaml.safe_load(candidate.read_text(encoding="utf-8"))
            if header.get("type") == "journey":
                continue  # Skip journey fixtures
            fixture = EvalFixture.from_yaml(candidate)
            loaded.append(fixture)
            logger.info("eval: loaded fixture %s from %s", fixture.name, candidate.name)
        except Exception as exc:
            logger.error("eval: failed to load fixture %s: %s", candidate.name, exc)
    return loaded
 # ── Journey fixtures ─────────────────────────────────────────────────────
@dataclass
class JourneyFixture:
    """Scenario definition for testing the prompt_template builder conversation."""

    name: str
    description: str
    directory: str  # sample-file location, relative to the fixture YAML's folder
    data_types: list[str]
    expected_template_criteria: list[str]  # statements the generated template must satisfy
    user_messages: list[str] = field(default_factory=list)  # scripted replies for automated runs (unused in interactive mode)
    models: list[str] = field(default_factory=list)
    fixture_path: Path = field(default_factory=lambda: Path("."))

    @property
    def fixture_dir(self) -> Path:
        """Sample-file directory, resolved against the folder holding the fixture YAML."""
        yaml_folder = self.fixture_path.parent
        return yaml_folder / self.directory

    @classmethod
    def from_yaml(cls, path: Path) -> "JourneyFixture":
        """Build a JourneyFixture from the YAML document stored at ``path``.

        Only ``name`` is mandatory (a missing key raises ``KeyError``);
        every other field falls back to a default.
        """
        data = yaml.safe_load(path.read_text(encoding="utf-8"))
        kwargs = {
            "name": data["name"],
            "description": data.get("description", ""),
            "directory": data.get("directory", "sample_files"),
            "data_types": data.get("data_types", ["tasks"]),
            "user_messages": data.get("user_messages", []),
            "expected_template_criteria": data.get("expected_template_criteria", []),
            "models": data.get("models", []),
            "fixture_path": path,
        }
        return cls(**kwargs)
 def discover_journey_fixtures(fixtures_dir: Path | None = None) -> list[JourneyFixture]:
    """Load every YAML fixture tagged ``type: journey`` from the fixtures directory.

    When ``fixtures_dir`` is None the ``fixtures`` folder next to this
    module is used.  A missing directory produces a warning and an empty
    list.  Files are visited in sorted order; a file that cannot be parsed
    or converted is logged as an error and skipped.
    """
    root = Path(__file__).parent / "fixtures" if fixtures_dir is None else fixtures_dir
    loaded: list[JourneyFixture] = []
    if not root.is_dir():
        logger.warning("eval: fixtures directory not found: %s", root)
        return loaded
    for candidate in sorted(root.glob("*.yaml")):
        try:
            document = yaml.safe_load(candidate.read_text(encoding="utf-8"))
            # Only documents explicitly tagged as journeys are loaded here.
            if document.get("type") == "journey":
                fixture = JourneyFixture.from_yaml(candidate)
                loaded.append(fixture)
                logger.info("eval: loaded journey fixture %s from %s", fixture.name, candidate.name)
        except Exception as exc:
            logger.error("eval: failed to load journey fixture %s: %s", candidate.name, exc)
    return loaded
--- a/services/batch-agent/eval/fixtures/classify_invoices.yaml
+++ b/services/batch-agent/eval/fixtures/classify_invoices.yaml
@@ -1,40 +0,0 @@
 # Fixture: classify-invoices (step1)
 # Tests _STEP1_SYSTEM_PROMPT — file classification and project matching.
 # Verifies that the LLM correctly matches files to existing projects
 # and identifies the right data domains.
 name: classify-invoices
 mode: step1
 description: >
  Test file classification on Italian freelance invoices and meeting notes.
  Verifies project matching and domain identification.
 directory: sample_files/invoices
 data_types: [tasks, notes, timelines]
 file_extensions: [txt, md]
 # ── Step-1 prompt variables ──────────────────────────────────────
 domain_definitions: |
  - tasks: Action items, deliverables, things to do — anything that someone needs to complete.
  - notes: Meeting summaries, decisions, reference information — permanent knowledge entries.
  - timelines: Project milestones, deadlines, scheduled events — specific dates that mark a point in the progress of a project.
 projects_list:
  - id: "proj-web-redesign"
    name: "Redesign Sito Web Corporate"
    status: "active"
    aiSummary: "Corporate website redesign for Studio Architettura Bianchi"
  - id: "proj-ecommerce"
    name: "E-Commerce FashionStore"
    status: "active"
    aiSummary: "Next.js e-commerce platform for FashionStore srl"
 # ── Expected classification results ─────────────────────────────
 expected_classification:
  - file: "sample_files/invoices/fattura_042.txt"
    project_id: "proj-web-redesign"
    domains: [tasks, notes, timelines]
  - file: "sample_files/invoices/meeting_ecommerce.md"
    project_id: "proj-ecommerce"
    domains: [tasks, notes, timelines]
--- a/services/batch-agent/eval/fixtures/full_invoices.yaml
+++ b/services/batch-agent/eval/fixtures/full_invoices.yaml
@@ -1,108 +0,0 @@
 # Fixture: full-invoices (full)
 # Tests both _STEP1_SYSTEM_PROMPT and _PROCESSING_SYSTEM_PROMPT in sequence
 # via run_local_agent(). Verifies end-to-end classification + extraction.
 name: full-invoices
 mode: full
 description: >
  End-to-end test: classify Italian invoices/meeting notes into the
  correct project, then extract tasks, notes, and timeline events.
 directory: sample_files/invoices
 data_types: [tasks, notes, timelines]
 file_extensions: [txt, md]
 # ── Step-1 prompt variables ──────────────────────────────────────
 domain_definitions: |
  - tasks: Action items, deliverables, things to do — anything that someone needs to complete.
  - notes: Meeting summaries, decisions, reference information — permanent knowledge entries.
  - timelines: Project milestones, deadlines, scheduled events — specific dates that mark a point in the progress of a project.
 projects_list:
  - id: "proj-web-redesign"
    name: "Redesign Sito Web Corporate"
    status: "active"
    aiSummary: "Corporate website redesign for Studio Architettura Bianchi"
  - id: "proj-ecommerce"
    name: "E-Commerce FashionStore"
    status: "active"
    aiSummary: "Next.js e-commerce platform for FashionStore srl"
 # ── Step-2 prompt variables ──────────────────────────────────────
 existing_context: |
  Existing tasks:
    (none)
  Existing notes:
    (none)
  Existing timelines:
    (none)
 project_context: ""
 custom_prompt_section: |
  User instructions:
  Estrai i dati dai file come segue:
  - TASK: ogni azione da fare, deliverable, o item con scadenza.
    Mappa "URGENTE" o "ALTA PRIORITÀ" → priority: high.
    Mappa "media priorità" → priority: medium.
    Mappa "bassa priorità" → priority: low.
    Se un item è marcato come "completato" o [x], impostalo status: done.
    Altrimenti status: todo.
  - NOTE: riassunti di meeting, decisioni prese, note tecniche.
  - TIMELINE: date di scadenza, milestone, meeting futuri.
  Imposta sempre isAiSuggested=1.
 # ── Seed records (pre-existing DB state) ─────────────────────────
 seed_records:
  projects:
    - id: "proj-web-redesign"
      name: "Redesign Sito Web Corporate"
      status: "active"
      aiSummary: "Corporate website redesign for Studio Architettura Bianchi"
    - id: "proj-ecommerce"
      name: "E-Commerce FashionStore"
      status: "active"
      aiSummary: "Next.js e-commerce platform for FashionStore srl"
  tasks: []
  notes: []
  timelines: []
 # ── Expected classification (step 1) ─────────────────────────────
 expected_classification:
  - file: "sample_files/invoices/fattura_042.txt"
    project_id: "proj-web-redesign"
    domains: [tasks, notes, timelines]
  - file: "sample_files/invoices/meeting_ecommerce.md"
    project_id: "proj-ecommerce"
    domains: [tasks, notes, timelines]
 # ── Expected extractions (step 2) ────────────────────────────────
 expected:
  tasks:
    - title: "Sviluppo frontend React"
      priority: "high"
      status: "todo"
    - title: "Integrazione API backend"
      priority: "medium"
      status: "todo"
    - title: "Testing cross-browser e fix bug responsive"
      status: "todo"
    - title: "Preparare wireframe homepage"
      priority: "high"
      status: "todo"
    - title: "Setup progetto Next.js e configurare CI/CD"
      priority: "medium"
      status: "todo"
    - title: "Ricerca plugin Stripe per gestione abbonamenti"
      priority: "low"
      status: "todo"
  notes:
    - title: "Meeting Kickoff Progetto E-Commerce"
  timelines:
    - title: "MVP E-Commerce pronto"
    - title: "Meeting di revisione"
--- a/services/batch-agent/eval/fixtures/journey_invoice_setup.yaml
+++ b/services/batch-agent/eval/fixtures/journey_invoice_setup.yaml
@@ -1,28 +0,0 @@
 # Journey Fixture: journey-invoice-setup
 # Used by `python -m eval interactive` for human-in-the-loop testing
 # of the journey chatbot's prompt-building conversation.
 type: journey
 name: journey-invoice-setup
 description: >
  Interactive test for the journey chatbot — explore a directory of
  Italian invoices and meeting notes, answer the chatbot's questions,
  and verify it produces a well-structured prompt_template for data
  extraction.
 directory: sample_files/invoices
 data_types: [tasks, notes, timelines, projects]
 # Criteria the generated prompt_template must satisfy
 # Each is scored 0-1 by an LLM judge
 expected_template_criteria:
  - "Mentions creating tasks from action items and work descriptions"
  - "Mentions creating notes from meeting summaries"
  - "Mentions extracting timeline events from deadlines and meeting dates"
  - "Mentions creating projects from relevant information"
  - "Sets isAiSuggested=1 on all created records"
  - "Does NOT include projectId assignment logic"
  - "Uses camelCase field names (title, status, priority, dueDate, content)"
 # Models to test (empty = use CLI --models default)
 models: []
--- a/services/batch-agent/eval/fixtures/process_invoices.yaml
+++ b/services/batch-agent/eval/fixtures/process_invoices.yaml
@@ -1,81 +0,0 @@
 # Fixture: process-invoices (step2)
 # Tests _PROCESSING_SYSTEM_PROMPT — data extraction & tool calling.
 # The classification step is skipped; prompt variables are injected directly.
 name: process-invoices
 mode: step2
 description: >
  Test data extraction from Italian freelance invoices.
  Verifies correct record creation via tool calls with the right
  fields, priorities, and status values.
 directory: sample_files/invoices
 data_types: [tasks, notes, timelines]
 file_extensions: [txt, md]
 # ── Step-2 prompt variables ──────────────────────────────────────
 existing_context: |
  Existing tasks:
    (none)
  Existing notes:
    (none)
  Existing timelines:
    (none)
 project_context: >
  Project: Redesign Sito Web Corporate (id: proj-web-redesign).
  Always set projectId to this id on every record you create.
 custom_prompt_section: |
  User instructions:
  Estrai i dati dai file come segue:
  - TASK: ogni azione da fare, deliverable, o item con scadenza.
    Mappa "URGENTE" o "ALTA PRIORITÀ" → priority: high.
    Mappa "media priorità" → priority: medium.
    Mappa "bassa priorità" → priority: low.
    Se un item è marcato come "completato" o [x], impostalo status: done.
    Altrimenti status: todo.
  - NOTE: riassunti di meeting, decisioni prese, note tecniche.
    Il titolo deve essere descrittivo. Il content deve includere tutti i dettagli.
  - TIMELINE: date di scadenza, milestone, meeting futuri.
  Imposta sempre isAiSuggested=1.
 # ── Seed records (pre-existing DB state) ─────────────────────────
 seed_records:
  projects:
    - id: "proj-web-redesign"
      name: "Redesign Sito Web Corporate"
      status: "active"
  tasks: []
  notes: []
  timelines: []
 # ── Expected extractions ─────────────────────────────────────────
 expected:
  tasks:
    - title: "Sviluppo frontend React"
      priority: "high"
      status: "todo"
    - title: "Integrazione API backend"
      priority: "medium"
      status: "todo"
    - title: "Testing cross-browser e fix bug responsive"
      status: "todo"
    - title: "Preparare wireframe homepage"
      priority: "high"
      status: "todo"
    - title: "Setup progetto Next.js e configurare CI/CD"
      priority: "medium"
      status: "todo"
    - title: "Ricerca plugin Stripe per gestione abbonamenti"
      priority: "low"
      status: "todo"
  notes:
    - title: "Meeting Kickoff Progetto E-Commerce"
  timelines:
    - title: "MVP E-Commerce pronto"
    - title: "Meeting di revisione"
--- a/services/batch-agent/eval/fixtures/sample_files/invoices/fattura_042.txt
+++ b/services/batch-agent/eval/fixtures/sample_files/invoices/fattura_042.txt
@@ -1,18 +0,0 @@
 FATTURA N. 2026-0042
 Data: 15 Marzo 2026
 Cliente: Studio Architettura Bianchi
 Progetto: Redesign Sito Web Corporate
 Descrizione lavori:
 - Sviluppo frontend React (40 ore) — URGENTE, completare entro 20 marzo
 - Integrazione API backend (20 ore) — priorità media
 - Design UI/UX mockup homepage (8 ore) — completato
 - Testing cross-browser e fix bug responsive (12 ore) — da iniziare
 Totale: €4.800,00 + IVA
 Note:
 Meeting di revisione previsto per il 18 marzo alle 10:00.
 Il cliente ha richiesto modifiche al layout mobile della sezione contatti.
 Attendere conferma budget aggiuntivo per sezione blog.
--- a/services/batch-agent/eval/fixtures/sample_files/invoices/meeting_ecommerce.md
+++ b/services/batch-agent/eval/fixtures/sample_files/invoices/meeting_ecommerce.md
@@ -1,25 +0,0 @@
 # Meeting Notes - Kickoff Progetto E-Commerce
 **Data:** 10 Marzo 2026
 **Partecipanti:** Marco R., Giulia T., Cliente (FashionStore srl)
 ## Decisioni prese
 1. **Piattaforma**: Next.js + Stripe per i pagamenti
 2. **Timeline**: MVP pronto entro 30 aprile 2026
 3. **Budget**: €12.000 totale, €4.000 anticipo già ricevuto
 ## Action items
 - [ ] Marco: preparare wireframe homepage entro 14 marzo — ALTA PRIORITÀ
 - [ ] Giulia: setup progetto Next.js e configurare CI/CD — media priorità
 - [ ] Marco: ricerca plugin Stripe per gestione abbonamenti — bassa priorità
 - [x] Giulia: inviare contratto firmato al cliente — COMPLETATO
 ## Note aggiuntive
 Il cliente vuole un design minimalista, ispirato a Zara.com.
 Colori primari: nero, bianco, oro.
 Font: Inter per body, Playfair Display per headings.
 Prossimo meeting: 24 marzo 2026 ore 15:00.
--- a/services/batch-agent/eval/interactive.py
+++ b/services/batch-agent/eval/interactive.py
@@ -1,471 +0,0 @@
 """Interactive journey session — human-in-the-loop CLI conversation.
 Flow:
 1. Show the system prompt used by the journey AI.
 2. Start the journey (AI explores files, asks first question).
 3. User types responses in the terminal — AI replies.
 4. User types `/done` to end the conversation.
 5. User writes a comment about the interaction quality.
 6. LLM judge scores the conversation + generated template.
 7. Results are reported to Langfuse.
 Usage::
    python -m eval interactive                        # pick a fixture interactively
    python -m eval interactive --fixture=journey-invoice-setup
    python -m eval interactive --model=gpt-4o
    python -m eval interactive --judge-model=github_copilot/gpt-4o-mini
 """
 from __future__ import annotations
 import asyncio
 import json
 import logging
 import sys
 import time
 import uuid
 from dataclasses import dataclass, field
 from typing import Any
 from langchain_core.messages import HumanMessage, SystemMessage
 from eval.config import JourneyFixture, discover_journey_fixtures
 from eval.mock_executor import MockExecutor
 from eval import langfuse_eval
 logger = logging.getLogger(__name__)
 # ── Special commands ─────────────────────────────────────────────────────
 # Slash commands recognised at the interactive prompt.
 _CMD_DONE = "/done"
 _CMD_QUIT = "/quit"
 _CMD_TEMPLATE = "/template"
 _CMD_HELP = "/help"
 # Help text listing each command with a one-line description; the command
 # names are interpolated from the constants above.
 _HELP_TEXT = f"""\
  {_CMD_DONE}       — End the conversation and proceed to evaluation
  {_CMD_QUIT}       — Abort without evaluation
  {_CMD_TEMPLATE}   — Show the generated template (if any)
  {_CMD_HELP}       — Show this help"""
 # ── Terminal colours (ANSI) ──────────────────────────────────────────────
 # ANSI SGR escape sequences used to style terminal output.
 _C_RESET = "\033[0m"  # reset all attributes
 _C_BOLD = "\033[1m"
 _C_DIM = "\033[2m"
 _C_CYAN = "\033[36m"
 _C_GREEN = "\033[32m"
 _C_YELLOW = "\033[33m"
 _C_MAGENTA = "\033[35m"
 _C_RED = "\033[31m"
 _C_BLUE = "\033[34m"
 def _print_header(text: str) -> None:
    """Print ``text`` as a bold cyan banner framed by two 80-character rules."""
    banner_rows = (
        f"\n{_C_BOLD}{_C_CYAN}{'═' * 80}",
        f"  {text}",
        f"{'═' * 80}{_C_RESET}\n",
    )
    for row in banner_rows:
        print(row)
 def _print_ai(text: str) -> None:
    """Print an assistant reply behind a bold green ``AI:`` label, padded by blank lines."""
    rendered = f"\n{_C_GREEN}{_C_BOLD}AI:{_C_RESET} {text}\n"
    print(rendered)
 def _print_system(text: str) -> None:
    """Print a status message in dim text."""
    rendered = f"{_C_DIM}{text}{_C_RESET}"
    print(rendered)
 def _print_score(label: str, score: float) -> None:
    """Print one criterion score with a coloured PASS / PARTIAL / FAIL tag.

    Scores of at least 0.7 are PASS, at least 0.4 are PARTIAL, and
    anything else is FAIL.
    """
    bands = (
        (0.7, _C_GREEN, "PASS"),
        (0.4, _C_YELLOW, "PARTIAL"),
    )
    for floor, color, tag in bands:
        if score >= floor:
            break
    else:
        # No band reached: the score is below every threshold.
        color, tag = _C_RED, "FAIL"
    print(f"  {color}{tag:>7}{_C_RESET} ({score:.1f}) {label}")
 # ── Result type ──────────────────────────────────────────────────────────
@dataclass
 class InteractiveResult:
    fixture_name: str
    model: str
    judge_model: str
    prompt_template: str | None
    conversation: list[dict[str, str]]
    user_comment: str
    done: bool
    criteria_scores: dict[str, float]
    overall_score: float
    judge_reasoning: str
    elapsed_seconds: float
    def summary(self) -> dict[str, Any]:
        return {
            "fixture": self.fixture_name,
            "model": self.model,
            "judge_model": self.judge_model,
            "done": self.done,
            "turns": len([c for c in self.conversation if c["role"] == "user"]),
            "overall_score": round(self.overall_score, 3),
            "user_comment": self.user_comment,
            "criteria_scores": {k: round(v, 3) for k, v in self.criteria_scores.items()},
            "elapsed_s": round(self.elapsed_seconds, 1),
        }
 # ── LLM judge ────────────────────────────────────────────────────────────
 # System prompt sent to the judge model by _judge_interactive().  It asks
 # for a JSON-only reply with "criteria_scores", "overall_quality" and
 # "reasoning"; the parser in _judge_interactive() reads exactly those keys
 # (and also accepts "scores" as an alternative to "criteria_scores").
 _INTERACTIVE_JUDGE_SYSTEM = """\
 You are an evaluation judge for AI-generated prompt templates produced during
 an interactive conversation between a human and a journey chatbot.
 The chatbot explored a directory and through multi-turn conversation with the
 user produced a prompt_template — an instruction set for a data-extraction agent.
 You have access to:
 - The full conversation transcript
 - The generated prompt_template (if any)
 - The user's own comment about the interaction
 - A list of quality criteria
 Score each criterion from 0 to 1:
  - 1.0: Fully satisfied
  - 0.5: Partially satisfied
  - 0.0: Not satisfied
 Also provide an overall_quality score (0-1) evaluating the conversation flow,
 how well the AI understood the user, and the template quality.
 Respond with ONLY a JSON object:
 {
  "criteria_scores": {"criterion_1": 0.8, ...},
  "overall_quality": 0.85,
  "reasoning": "Brief explanation covering both conversation quality and template accuracy"
 }
 """
 async def _judge_interactive(
    conversation: list[dict[str, str]],
    prompt_template: str | None,
    user_comment: str,
    criteria: list[str],
    *,
    judge_model: str = "gpt-4o-mini",
 ) -> tuple[dict[str, float], float, str]:
    """Score an interactive session with an LLM judge.

    Parameters
    ----------
    conversation :
        Transcript entries with ``role`` and ``content`` keys; any role
        other than ``"user"`` is rendered as ``AI``.
    prompt_template :
        The template produced by the journey, or None if none was generated.
    user_comment :
        Free-text comment written by the human tester.
    criteria :
        Criteria the judge must score, in order.
    judge_model :
        Model name passed to ``get_llm``.

    Returns
    -------
    (criteria_scores, overall_quality, reasoning).  If the judge call or
    the parsing of its reply fails, every criterion scores 0.0, the overall
    quality is 0.0 and the reasoning carries the error message.
    """
    from shared.llm import get_llm
    llm = get_llm(model=judge_model, temperature=0)
    conv_text = "\n".join(
        f"{'USER' if t['role'] == 'user' else 'AI'}: {t['content']}"
        for t in conversation
    )
    criteria_text = "\n".join(f"  {i+1}. {c}" for i, c in enumerate(criteria))
    user_content = (
        f"## Conversation transcript\n```\n{conv_text}\n```\n\n"
        f"## Generated prompt_template\n```\n{prompt_template or '(none — conversation did not complete)'}\n```\n\n"
        f"## User's comment\n{user_comment}\n\n"
        f"## Criteria to evaluate\n{criteria_text}"
    )
    try:
        response = await llm.ainvoke([
            SystemMessage(content=_INTERACTIVE_JUDGE_SYSTEM),
            HumanMessage(content=user_content),
        ])
        raw = response.content.strip()
        # Strip a Markdown code fence (optionally tagged ``json``) around the reply.
        if raw.startswith("```"):
            raw = raw.split("```")[1]
            if raw.startswith("json"):
                raw = raw[4:]
        parsed = json.loads(raw.strip())
        scores_raw = parsed.get("criteria_scores", parsed.get("scores", {}))
        criteria_scores: dict[str, float] = {}
        for i, criterion in enumerate(criteria):
            # The judge may key its scores by "criterion_N", by the criterion
            # text (full or truncated to 50 chars), or by the bare index.
            for key in (f"criterion_{i+1}", criterion, criterion[:50], str(i + 1)):
                if key in scores_raw:
                    score = float(scores_raw[key])
                    break
            else:
                # No key matched: fall back to the i-th value by position.
                # This runs only when nothing matched, so a genuinely
                # matched 0.0 is never overwritten by an unrelated value.
                if i < len(scores_raw):
                    score = float(list(scores_raw.values())[i])
                else:
                    score = 0.0
            criteria_scores[criterion] = score
        overall = float(parsed.get("overall_quality", 0.0))
        reasoning = str(parsed.get("reasoning", ""))
        return criteria_scores, overall, reasoning
    except Exception as exc:
        # Best-effort: a failing judge must not abort the session report.
        logger.warning("interactive judge failed: %s", exc)
        return {c: 0.0 for c in criteria}, 0.0, f"Judge error: {exc}"
 # ── Interactive session ──────────────────────────────────────────────────
 async def run_interactive(
    fixture: JourneyFixture,
    *,
    model: str = "gpt-4o",
    judge_model: str = "gpt-4o-mini",
    data_dir: Path | None = None,
 ) -> InteractiveResult:
    """Run an interactive journey session in the terminal.

    Shows the journey system prompt, starts the journey, relays the
    user's typed messages to the journey handlers until the journey
    completes or the user types ``/done``, then collects a free-text
    comment, asks the LLM judge for scores, prints them and reports the
    session to Langfuse.

    Typing ``/quit`` (or sending EOF / Ctrl-C at the prompt) returns
    immediately with empty scores; no comment is collected, the judge is
    not called and nothing is reported.  An exception raised while the
    conversation is running is logged and printed, after which the
    session continues with the comment and judge stages.

    Parameters
    ----------
    fixture :
        Journey fixture supplying the directory, data types and the
        criteria the judge scores.
    model :
        Value temporarily assigned to ``settings.LLM_MODEL`` for the
        duration of the conversation.
    judge_model :
        Model name handed to the LLM judge.
    data_dir :
        If set, overrides the fixture's sample-file directory.  The LLM
        will explore this folder instead of the default
        ``fixtures/sample_files/…``.  Useful for private test data that
        shouldn't be committed to git.

    Returns
    -------
    InteractiveResult describing the session.
    """
    # NOTE(review): `Path` is used in the signature but does not appear in
    # this module's visible import list; it only works because annotations
    # are lazy under `from __future__ import annotations` — confirm/import.
    from shared.config import settings
    from shared.ws_context import set_current_user, clear_current_user
    from app.journey import (
        handle_journey_start,
        handle_journey_message,
        _build_system_prompt,
    )
    # When --data-dir is given, the MockExecutor's root becomes
    # data_dir's parent and the journey directory is data_dir's name.
    # This way the LLM sees a meaningful directory name (not ".") and
    # MockExecutor resolves paths correctly.
    # Otherwise, use the fixture's YAML parent and its relative path.
    if data_dir:
        mock_root = data_dir.parent
        journey_directory = data_dir.name
    else:
        mock_root = fixture.fixture_path.parent
        journey_directory = fixture.directory
    mock = MockExecutor(
        fixture_dir=mock_root,
        seed_records={},
    )
    # Override the globally configured model; restored in the `finally` below.
    original_model = settings.LLM_MODEL
    settings.LLM_MODEL = model
    eval_user_id = f"interactive-{uuid.uuid4().hex[:8]}"
    # ── Show system prompt ───────────────────────────────────────
    system_prompt = _build_system_prompt(journey_directory, fixture.data_types)
    _print_header("SYSTEM PROMPT")
    print(f"{_C_DIM}{system_prompt}{_C_RESET}")
    _print_header(f"INTERACTIVE JOURNEY  |  fixture: {fixture.name}  |  model: {model}")
    print(f"  Data dir: {mock_root}")
    print(f"  Type your responses. Commands: {_CMD_DONE}, {_CMD_QUIT}, {_CMD_TEMPLATE}, {_CMD_HELP}")
    print(f"  Judge model: {judge_model}")
    print(f"  Criteria: {len(fixture.expected_template_criteria)}")
    print()
    conversation: list[dict[str, str]] = []
    prompt_template: str | None = None
    done = False
    start_time = time.time()
    try:
        set_current_user(eval_user_id)
        with mock.patch():
            # ── Start ────────────────────────────────────────────
            _print_system("Starting journey... (AI is exploring your files)")
            start_frame: dict[str, Any] = {
                "agent_type": "local",
                "directory": journey_directory,
                "data_types": fixture.data_types,
                "session_id": f"interactive-{uuid.uuid4().hex[:8]}",
            }
            reply = await handle_journey_start(eval_user_id, start_frame)
            # Use the session id echoed back by the handler for later messages.
            session_id = reply["session_id"]
            conversation.append({"role": "assistant", "content": reply["message"]})
            _print_ai(reply["message"])
            if reply["done"]:
                prompt_template = reply.get("prompt_template")
                done = True
                _print_system("Journey completed on first reply (template generated).")
            # ── Conversation loop ────────────────────────────────
            while not done:
                try:
                    user_input = input(f"{_C_BOLD}{_C_BLUE}YOU:{_C_RESET} ").strip()
                except (EOFError, KeyboardInterrupt):
                    # Treat EOF / Ctrl-C at the prompt as an explicit quit.
                    print()
                    user_input = _CMD_QUIT
                if not user_input:
                    continue
                # Handle commands
                if user_input.lower() == _CMD_QUIT:
                    _print_system("Aborted — no evaluation will be performed.")
                    # NOTE(review): these two cleanup calls are repeated by the
                    # `finally` block, which also runs on this early return.
                    settings.LLM_MODEL = original_model
                    clear_current_user()
                    return InteractiveResult(
                        fixture_name=fixture.name, model=model, judge_model=judge_model,
                        prompt_template=None, conversation=conversation,
                        user_comment="(aborted)", done=False,
                        criteria_scores={}, overall_score=0.0,
                        judge_reasoning="Session aborted by user.",
                        elapsed_seconds=time.time() - start_time,
                    )
                if user_input.lower() == _CMD_HELP:
                    print(_HELP_TEXT)
                    continue
                if user_input.lower() == _CMD_TEMPLATE:
                    if prompt_template:
                        print(f"\n{_C_MAGENTA}{prompt_template}{_C_RESET}\n")
                    else:
                        _print_system("No template generated yet.")
                    continue
                if user_input.lower() == _CMD_DONE:
                    # Leave the loop with `done` still False: the journey
                    # itself did not report completion.
                    _print_system("Ending conversation...")
                    break
                # ── Send message to AI ───────────────────────────
                conversation.append({"role": "user", "content": user_input})
                _print_system("AI is thinking...")
                msg_frame: dict[str, Any] = {
                    "session_id": session_id,
                    "message": user_input,
                }
                reply = await handle_journey_message(eval_user_id, msg_frame)
                conversation.append({"role": "assistant", "content": reply["message"]})
                _print_ai(reply["message"])
                if reply["done"]:
                    prompt_template = reply.get("prompt_template")
                    done = True
                    _print_system("Journey completed — template generated!")
    except Exception as exc:
        # Deliberately non-fatal: the partial conversation is still judged below.
        logger.error("interactive journey failed: %s", exc)
        _print_system(f"Error: {exc}")
    finally:
        # Always restore the global model setting and clear the user context.
        settings.LLM_MODEL = original_model
        clear_current_user()
    elapsed = time.time() - start_time
    # A "turn" is one message sent by the user.
    turns = len([c for c in conversation if c["role"] == "user"])
    # ── Show template if generated ───────────────────────────────
    if prompt_template:
        _print_header("GENERATED TEMPLATE")
        print(f"{_C_MAGENTA}{prompt_template}{_C_RESET}\n")
    else:
        _print_system("No template was generated during this session.")
    # ── User comment ─────────────────────────────────────────────
    _print_header("YOUR EVALUATION")
    print("  Write your comment about this interaction (press Enter twice to finish):")
    print()
    comment_lines: list[str] = []
    try:
        while True:
            line = input()
            # Two consecutive empty lines end the comment.
            if line == "" and comment_lines and comment_lines[-1] == "":
                comment_lines.pop()  # remove trailing empty
                break
            comment_lines.append(line)
    except (EOFError, KeyboardInterrupt):
        # Keep whatever was typed so far.
        pass
    user_comment = "\n".join(comment_lines).strip() or "(no comment)"
    # ── Judge ────────────────────────────────────────────────────
    _print_header("LLM JUDGE EVALUATION")
    _print_system(f"Scoring with {judge_model}...")
    criteria_scores, overall_quality, judge_reasoning = await _judge_interactive(
        conversation=conversation,
        prompt_template=prompt_template,
        user_comment=user_comment,
        criteria=fixture.expected_template_criteria,
        judge_model=judge_model,
    )
    # ── Display scores ───────────────────────────────────────────
    print()
    for criterion, score in criteria_scores.items():
        _print_score(criterion, score)
    # `overall` is the mean of the per-criterion scores; `overall_quality`
    # is the separate holistic score returned by the judge.
    overall = (
        sum(criteria_scores.values()) / len(criteria_scores)
        if criteria_scores
        else 0.0
    )
    print(f"\n  {_C_BOLD}Criteria avg:      {overall:.2f}{_C_RESET}")
    print(f"  {_C_BOLD}Overall quality:   {overall_quality:.2f}{_C_RESET}")
    print(f"  {_C_BOLD}Turns:             {turns}{_C_RESET}")
    print(f"  {_C_BOLD}Time:              {elapsed:.1f}s{_C_RESET}")
    print(f"\n  {_C_DIM}Judge: {judge_reasoning}{_C_RESET}")
    print(f"  {_C_DIM}Your comment: {user_comment}{_C_RESET}\n")
    result = InteractiveResult(
        fixture_name=fixture.name,
        model=model,
        judge_model=judge_model,
        prompt_template=prompt_template,
        conversation=conversation,
        user_comment=user_comment,
        done=done,
        criteria_scores=criteria_scores,
        overall_score=overall_quality,
        judge_reasoning=judge_reasoning,
        elapsed_seconds=elapsed,
    )
    # ── Report to Langfuse ───────────────────────────────────────
    trace_id = langfuse_eval.log_eval_trace(
        fixture_name=fixture.name,
        model=model,
        prompt_variant="interactive",
        prompt_template=prompt_template or "(not generated)",
        actual_mutations=[{
            # Only the first 30 transcript entries are attached to the trace.
            "conversation": conversation[:30],
            "user_comment": user_comment,
        }],
        scores_summary=result.summary(),
        langfuse_prompt_names=["journey_system"],
    )
    # A falsy trace id is treated as "Langfuse not configured".
    if trace_id:
        from eval.scorer import EvalScores
        # The EvalScores fields are reused for this session: precision and
        # f1 carry the criteria average, recall carries 1.0/0.0 for `done`.
        scores_obj = EvalScores(
            fixture_name=fixture.name,
            model=model,
            prompt_variant="interactive",
            precision=overall,
            recall=float(done),
            f1=overall,
            llm_judge_score=overall_quality,
            llm_judge_reasoning=judge_reasoning,
        )
        langfuse_eval.post_eval_scores(scores_obj, trace_id=trace_id)
        _print_system(f"Results reported to Langfuse (trace: {trace_id})")
    else:
        _print_system("Langfuse not configured — results not reported.")
    return result
--- a/services/batch-agent/eval/journey_runner.py
+++ b/services/batch-agent/eval/journey_runner.py
@@ -1,385 +0,0 @@
 """Journey eval runner — tests the prompt_template builder conversation.
 For each (journey_fixture × model) combination:
 1. Build a MockExecutor (for filesystem tools used during journey)
 2. Patch execute_on_client
 3. Override LLM_MODEL
 4. Call handle_journey_start to kick off the conversation
 5. Feed simulated user_messages via handle_journey_message
 6. Collect the generated prompt_template
 7. Score it against expected_template_criteria (via LLM judge)
 8. Report to Langfuse
 """
 from __future__ import annotations
 import asyncio
 import copy
 import json
 import logging
 import time
 import uuid
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Any
 from langchain_core.messages import HumanMessage, SystemMessage
 from eval.config import JourneyFixture
 from eval.mock_executor import MockExecutor
 from eval import langfuse_eval
 logger = logging.getLogger(__name__)
 # ── Result type ──────────────────────────────────────────────────────────
@dataclass
class JourneyEvalResult:
    """Outcome of a single (journey fixture, model) evaluation run."""

    fixture_name: str
    model: str
    # Generated template text; None if the journey failed to produce one.
    prompt_template: str | None
    conversation_turns: int
    # True when the journey reached completion.
    done: bool
    # Maps each criterion to its 0-1 score.
    criteria_scores: dict[str, float]
    # Average of the criterion scores.
    overall_score: float
    judge_reasoning: str
    elapsed_seconds: float

    def summary(self) -> dict[str, Any]:
        """Return a compact dict view of the result with rounded numbers."""
        rounded_criteria: dict[str, float] = {}
        for criterion, value in self.criteria_scores.items():
            rounded_criteria[criterion] = round(value, 3)

        report: dict[str, Any] = {}
        report["fixture"] = self.fixture_name
        report["model"] = self.model
        report["done"] = self.done
        report["turns"] = self.conversation_turns
        report["overall_score"] = round(self.overall_score, 3)
        report["criteria_scores"] = rounded_criteria
        report["elapsed_s"] = round(self.elapsed_seconds, 1)
        return report
 # ── LLM judge for template quality ──────────────────────────────────────
 # System prompt for the judge LLM. It asks for a 0-1 score per criterion and
 # a JSON-only reply carrying a "scores" object and a "reasoning" string.
 _JOURNEY_JUDGE_SYSTEM = """\
 You are an evaluation judge for AI-generated prompt templates.
 A journey chatbot explored a user's directory structure and through
 conversation produced a prompt_template — an instruction set for a
 data-extraction agent.
 Your task: evaluate the generated template against a list of criteria.
 Score each criterion from 0 to 1:
  - 1.0: Fully satisfied, clearly present in the template
  - 0.5: Partially satisfied or ambiguously addressed
  - 0.0: Not satisfied, missing from the template
 Respond with ONLY a JSON object:
 {
  "scores": {"criterion_1": 0.8, "criterion_2": 1.0, ...},
  "reasoning": "Brief explanation"
 }
 """
 async def _judge_template(
    prompt_template: str,
    criteria: list[str],
    *,
    judge_model: str = "gpt-4o-mini",
 ) -> tuple[dict[str, float], str]:
    """Score a generated prompt_template against criteria with an LLM judge.

    Args:
        prompt_template: Template text produced by the journey.
        criteria: Criteria to evaluate; each one becomes a key of the result.
        judge_model: Model name handed to ``get_llm``.

    Returns:
        ``(criteria_scores, reasoning)``. Every criterion receives a score.
        If the judge call or the parsing of its reply fails, all scores are
        0.0 and the reasoning carries the error text.
    """
    from shared.llm import get_llm

    llm = get_llm(model=judge_model, temperature=0)
    criteria_text = "\n".join(f"  {i+1}. {c}" for i, c in enumerate(criteria))
    user_content = (
        f"## Generated prompt_template\n```\n{prompt_template}\n```\n\n"
        f"## Criteria to evaluate\n{criteria_text}"
    )
    try:
        response = await llm.ainvoke([
            SystemMessage(content=_JOURNEY_JUDGE_SYSTEM),
            HumanMessage(content=user_content),
        ])
        raw = response.content.strip()
        # Strip a Markdown code fence (optionally tagged "json") around the reply.
        if raw.startswith("```"):
            raw = raw.split("```")[1]
            if raw.startswith("json"):
                raw = raw[4:]
        parsed = json.loads(raw.strip())
        scores_raw = parsed.get("scores", {})
        # Values in reply order, used only when no key matches a criterion.
        positional_scores = list(scores_raw.values())

        criteria_scores: dict[str, float] = {}
        for i, criterion in enumerate(criteria):
            # Accept several key spellings the judge may use for this criterion.
            key_candidates = (
                f"criterion_{i+1}",
                criterion,
                criterion[:50],
                str(i + 1),
            )
            for key in key_candidates:
                if key in scores_raw:
                    score = float(scores_raw[key])
                    break
            else:
                # No key matched: fall back to the value at the same position.
                # A keyed match always wins, so a genuine 0.0 is never
                # replaced by an unrelated positional value.
                score = float(positional_scores[i]) if i < len(positional_scores) else 0.0
            criteria_scores[criterion] = score

        reasoning = str(parsed.get("reasoning", ""))
        return criteria_scores, reasoning
    except Exception as exc:
        # Best-effort judge: any failure degrades to all-zero scores.
        logger.warning("journey_eval: LLM judge failed: %s", exc)
        return {c: 0.0 for c in criteria}, f"Judge error: {exc}"
 # ── Journey runner ───────────────────────────────────────────────────────
 async def run_single_journey_eval(
    fixture: JourneyFixture,
    model: str,
    *,
    judge_model: str = "gpt-4o-mini",
    data_dir: Path | None = None,
 ) -> JourneyEvalResult:
    """Execute one journey eval: start, feed user messages, score the template.

    Args:
        fixture: Journey fixture supplying the directory, data types, scripted
            user messages and the expected template criteria.
        model: Value temporarily written to ``settings.LLM_MODEL`` for the run.
        judge_model: Model name forwarded to ``_judge_template``.
        data_dir: Optional directory to explore instead of the fixture's own;
            its parent becomes the MockExecutor root and its name the journey
            directory.

    Returns:
        A ``JourneyEvalResult``. Pipeline exceptions are logged rather than
        raised, so a failed journey yields a result without a template and
        with zero scores.
    """
    from shared.config import settings
    # When data_dir is given, use its parent as MockExecutor root
    # and its name as the journey directory so the LLM sees a
    # meaningful path (not ".").
    if data_dir:
        mock_root = data_dir.parent
        journey_directory = data_dir.name
    else:
        mock_root = fixture.fixture_path.parent
        journey_directory = fixture.directory
    mock = MockExecutor(
        fixture_dir=mock_root,
        seed_records={},
    )
    # Global override of the model; restored in the ``finally`` below.
    original_model = settings.LLM_MODEL
    settings.LLM_MODEL = model
    eval_user_id = f"eval-journey-{uuid.uuid4().hex[:8]}"
    logger.info(
        "journey_eval: starting %s | model=%s",
        fixture.name, model,
    )
    start_time = time.time()
    prompt_template: str | None = None
    conversation: list[dict[str, str]] = []
    done = False
    try:
        # NOTE(review): ``clear_current_user`` and ``_sessions`` are imported
        # here but not used inside this ``try`` body.
        from shared.ws_context import set_current_user, clear_current_user
        from app.journey import handle_journey_start, handle_journey_message, _sessions
        set_current_user(eval_user_id)
        with mock.patch():
            # ── Start the journey ────────────────────────────────
            start_frame: dict[str, Any] = {
                "agent_type": "local",
                "directory": journey_directory,
                "data_types": fixture.data_types,
                "session_id": f"eval-{uuid.uuid4().hex[:8]}",
            }
            reply = await handle_journey_start(eval_user_id, start_frame)
            # Later frames use the session id taken from the start reply.
            session_id = reply["session_id"]
            conversation.append({"role": "assistant", "content": reply["message"]})
            logger.info(
                "journey_eval: start reply (%d chars), done=%s",
                len(reply["message"]), reply["done"],
            )
            if reply["done"]:
                prompt_template = reply.get("prompt_template")
                done = True
            else:
                # ── Send user messages ───────────────────────────
                for i, user_msg in enumerate(fixture.user_messages):
                    # Stop feeding scripted messages once the journey is done.
                    if done:
                        break
                    conversation.append({"role": "user", "content": user_msg})
                    msg_frame: dict[str, Any] = {
                        "session_id": session_id,
                        "message": user_msg,
                    }
                    reply = await handle_journey_message(eval_user_id, msg_frame)
                    conversation.append({"role": "assistant", "content": reply["message"]})
                    logger.info(
                        "journey_eval: turn %d reply (%d chars), done=%s",
                        i + 1, len(reply["message"]), reply["done"],
                    )
                    if reply["done"]:
                        prompt_template = reply.get("prompt_template")
                        done = True
                # If not done after all user messages, send a final nudge
                if not done:
                    nudge = "Please generate the final prompt_template now. I'm satisfied with the configuration."
                    conversation.append({"role": "user", "content": nudge})
                    nudge_frame: dict[str, Any] = {
                        "session_id": session_id,
                        "message": nudge,
                    }
                    reply = await handle_journey_message(eval_user_id, nudge_frame)
                    conversation.append({"role": "assistant", "content": reply["message"]})
                    if reply["done"]:
                        prompt_template = reply.get("prompt_template")
                        done = True
    except Exception as exc:
        # Failures are logged, not raised; scoring below treats the missing
        # template as a zero-score run.
        logger.error("journey_eval: pipeline failed for %s/%s: %s", fixture.name, model, exc)
    finally:
        # Always undo the global model override and clear the user context.
        settings.LLM_MODEL = original_model
        from shared.ws_context import clear_current_user
        clear_current_user()
    elapsed = time.time() - start_time
    # A "turn" is one user message, including the final nudge if it was sent.
    turns = len([c for c in conversation if c["role"] == "user"])
    logger.info(
        "journey_eval: completed in %.1fs — %d turns, done=%s, template=%s",
        elapsed, turns, done, "yes" if prompt_template else "no",
    )
    # ── Score the template ───────────────────────────────────────
    criteria_scores: dict[str, float] = {}
    judge_reasoning = ""
    if prompt_template and fixture.expected_template_criteria:
        criteria_scores, judge_reasoning = await _judge_template(
            prompt_template,
            fixture.expected_template_criteria,
            judge_model=judge_model,
        )
    elif not prompt_template:
        # No template: every criterion scores zero without calling the judge.
        criteria_scores = {c: 0.0 for c in fixture.expected_template_criteria}
        judge_reasoning = "No prompt_template was generated — journey did not complete."
    # Mean of the criterion scores; 0.0 when there are no criteria.
    overall = (
        sum(criteria_scores.values()) / len(criteria_scores)
        if criteria_scores
        else 0.0
    )
    result = JourneyEvalResult(
        fixture_name=fixture.name,
        model=model,
        prompt_template=prompt_template,
        conversation_turns=turns,
        done=done,
        criteria_scores=criteria_scores,
        overall_score=overall,
        judge_reasoning=judge_reasoning,
        elapsed_seconds=elapsed,
    )
    # ── Report to Langfuse ───────────────────────────────────────
    # Only the first 20 conversation entries are attached to the trace.
    trace_id = langfuse_eval.log_eval_trace(
        fixture_name=fixture.name,
        model=model,
        prompt_variant="journey",
        prompt_template=prompt_template or "(not generated)",
        actual_mutations=[{"conversation": conversation[:20]}],
        scores_summary=result.summary(),
        langfuse_prompt_names=["journey_system"],
    )
    if trace_id:
        from eval.scorer import EvalScores
        # The journey eval reuses the EvalScores fields: precision and f1
        # carry the overall score, recall carries completion (0.0 or 1.0).
        scores_obj = EvalScores(
            fixture_name=fixture.name,
            model=model,
            prompt_variant="journey",
            precision=overall,
            recall=float(done),
            f1=overall,
            llm_judge_score=overall,
            llm_judge_reasoning=judge_reasoning,
        )
        langfuse_eval.post_eval_scores(scores_obj, trace_id=trace_id)
    return result
 async def run_journey_fixture_eval(
    fixture: JourneyFixture,
    models: list[str],
    *,
    judge_model: str = "gpt-4o-mini",
    data_dir: Path | None = None,
 ) -> list[JourneyEvalResult]:
    """Evaluate one journey fixture against every model in *models*.

    The fixture is synced to its Langfuse dataset first. Models are then
    evaluated one at a time, in the given order, and the results are returned
    in that same order.
    """
    langfuse_eval.sync_journey_fixture_to_dataset(fixture)
    return [
        await run_single_journey_eval(
            fixture,
            model_name,
            judge_model=judge_model,
            data_dir=data_dir,
        )
        for model_name in models
    ]
 def print_journey_results(results: list[JourneyEvalResult]) -> None:
    """Print a summary table, then a per-result criteria breakdown, to stdout."""
    if not results:
        print("\nNo journey eval results.")
        return

    heavy_rule = "=" * 95
    light_rule = "-" * 95

    # Summary table: one row per result.
    print("\n" + heavy_rule)
    print(f"{'Fixture':<25} {'Model':<25} {'Done':>5} {'Turns':>6} {'Score':>7} {'Time':>7}")
    print(light_rule)
    for res in results:
        if res.done:
            done_str = "yes"
        else:
            done_str = "NO"
        left = f"{res.fixture_name:<25} {res.model:<25} {done_str:>5} "
        right = f"{res.conversation_turns:>6} {res.overall_score:>7.2f} {res.elapsed_seconds:>6.1f}s"
        print(left + right)
    print(heavy_rule)

    # Detail section: criteria scores, judge reasoning and a template preview.
    for res in results:
        if res.criteria_scores:
            print(f"\n[{res.model}] Criteria scores:")
            for criterion, score in res.criteria_scores.items():
                if score >= 0.7:
                    indicator = "PASS"
                elif score >= 0.4:
                    indicator = "PARTIAL"
                else:
                    indicator = "FAIL"
                print(f"  {indicator:>7} ({score:.1f}) {criterion}")
        if res.judge_reasoning:
            print(f"  Judge: {res.judge_reasoning}")
        if res.prompt_template:
            # First 200 characters, flattened onto a single line.
            preview = res.prompt_template[:200].replace("\n", " ")
            print(f"  Template preview: {preview}...")
    print()
--- a/services/batch-agent/eval/langfuse_eval.py
+++ b/services/batch-agent/eval/langfuse_eval.py
@@ -1,327 +0,0 @@
 """Langfuse evaluation integration — datasets, runs, and scoring.
 Uses the Langfuse Python SDK v4 (OpenTelemetry-based) to:
 1. **Sync fixtures → Langfuse datasets**: Each YAML fixture becomes a dataset,
   each prompt variant + expected pair becomes a dataset item.
 2. **Track eval runs**: Each (fixture × model × prompt_variant) execution
   is recorded as a trace with linked scores.
 3. **Post scores**: precision, recall, F1, field_accuracy, llm_judge are
   posted as numeric scores on the trace.
 """
 from __future__ import annotations
 import logging
 import os
 from typing import Any
 from shared.config import settings
 from eval.config import EvalFixture
 from eval.scorer import EvalScores
 logger = logging.getLogger(__name__)
 def _get_langfuse():
    """Return a Langfuse client (SDK v4), or None if it cannot be provided.

    None is returned when either API key is missing from settings, or when
    creating the client raises (the failure is logged as a warning).
    """
    secret_key = settings.LANGFUSE_SECRET_KEY
    public_key = settings.LANGFUSE_PUBLIC_KEY
    if not (secret_key and public_key):
        return None
    try:
        # Export credentials through the environment; values that are already
        # set there are left untouched.
        credentials = (
            ("LANGFUSE_SECRET_KEY", secret_key),
            ("LANGFUSE_PUBLIC_KEY", public_key),
        )
        for env_name, env_value in credentials:
            os.environ.setdefault(env_name, env_value)
        host = settings.LANGFUSE_HOST
        if host:
            os.environ.setdefault("LANGFUSE_HOST", host)
        from langfuse import get_client
        return get_client()
    except Exception as exc:
        logger.warning("langfuse_eval: failed to create client: %s", exc)
        return None
 def sync_fixture_to_dataset(fixture: EvalFixture) -> str | None:
    """Create (or reuse) a Langfuse dataset for *fixture* and write its item.

    One dataset item is written per fixture, with id ``"<name>--<mode>"``:
    - input: {directory, data_types, mode, seed_records}
    - expected_output: ``classifications`` (modes step1/full) and/or the
      expected record fields grouped by table (modes step2/full)

    Returns the dataset name, or None if Langfuse is unavailable.
    """
    lf = _get_langfuse()
    if lf is None:
        logger.info("langfuse_eval: Langfuse not configured — skipping dataset sync")
        return None
    dataset_name = f"batch-eval-{fixture.name}"
    try:
        lf.create_dataset(
            name=dataset_name,
            description=fixture.description,
            metadata={
                "data_types": ",".join(fixture.data_types),
                "file_extensions": ",".join(fixture.file_extensions) if fixture.file_extensions else "",
            },
        )
    except Exception as exc:
        # Best-effort: the dataset may already exist. Keep going, but leave a
        # trace so other failures (auth, network) can still be diagnosed.
        logger.debug(
            "langfuse_eval: create_dataset('%s') failed (may already exist): %s",
            dataset_name, exc,
        )
    # Build expected_output appropriate to the fixture's mode
    expected_output: dict[str, Any] = {}
    if fixture.mode in ("step1", "full") and fixture.expected_classification:
        expected_output["classifications"] = [
            {"file": ec.file, "project_id": ec.project_id, "domains": ec.domains}
            for ec in fixture.expected_classification
        ]
    if fixture.mode in ("step2", "full") and fixture.expected:
        # Group expected record fields under their table name.
        for rec in fixture.expected:
            expected_output.setdefault(rec.table, []).append(rec.fields)
    item_id = f"{fixture.name}--{fixture.mode}"
    try:
        lf.create_dataset_item(
            dataset_name=dataset_name,
            id=item_id,
            input={
                "directory": fixture.directory,
                "data_types": fixture.data_types,
                "mode": fixture.mode,
                "seed_records": fixture.seed_records,
            },
            expected_output=expected_output,
            metadata={"mode": fixture.mode},
        )
    except Exception as exc:
        logger.warning(
            "langfuse_eval: failed to upsert dataset item %s: %s", item_id, exc
        )
    lf.flush()
    logger.info("langfuse_eval: synced fixture '%s' → dataset '%s'", fixture.name, dataset_name)
    return dataset_name
 def sync_journey_fixture_to_dataset(fixture) -> str | None:
    """Create (or reuse) a Langfuse dataset for a journey fixture.

    Each journey fixture becomes a single dataset item with id
    ``"<name>--journey"``:
    - input: {directory, data_types, user_messages}
    - expected_output: {criteria}

    Returns the dataset name, or None if Langfuse is unavailable.
    """
    lf = _get_langfuse()
    if lf is None:
        logger.info("langfuse_eval: Langfuse not configured — skipping journey dataset sync")
        return None
    dataset_name = f"journey-eval-{fixture.name}"
    try:
        lf.create_dataset(
            name=dataset_name,
            description=fixture.description,
            metadata={"type": "journey", "data_types": ",".join(fixture.data_types)},
        )
    except Exception as exc:
        # Best-effort: the dataset may already exist. Keep going, but leave a
        # trace so other failures (auth, network) can still be diagnosed.
        logger.debug(
            "langfuse_eval: create_dataset('%s') failed (may already exist): %s",
            dataset_name, exc,
        )
    item_id = f"{fixture.name}--journey"
    try:
        lf.create_dataset_item(
            dataset_name=dataset_name,
            id=item_id,
            input={
                "directory": fixture.directory,
                "data_types": fixture.data_types,
                "user_messages": fixture.user_messages,
            },
            expected_output={
                "criteria": fixture.expected_template_criteria,
            },
            metadata={"type": "journey"},
        )
    except Exception as exc:
        logger.warning("langfuse_eval: failed to upsert journey dataset item %s: %s", item_id, exc)
    lf.flush()
    logger.info("langfuse_eval: synced journey fixture '%s' → dataset '%s'", fixture.name, dataset_name)
    return dataset_name
 def create_eval_run(
    dataset_name: str,
    run_name: str,
    *,
    metadata: dict[str, Any] | None = None,
 ) -> str:
    """Try to register a dataset run in Langfuse and return *run_name*.

    The run is created only if the client exposes ``create_dataset_run``;
    otherwise a debug message is logged. Failures are logged as warnings.
    *run_name* is returned unchanged in every case, including when Langfuse
    is not configured.
    """
    client = _get_langfuse()
    if client is not None:
        try:
            if not hasattr(client, "create_dataset_run"):
                logger.debug("langfuse_eval: create_dataset_run not available in SDK v4")
            else:
                run_metadata = metadata or {}
                client.create_dataset_run(
                    dataset_name=dataset_name,
                    run_name=run_name,
                    metadata=run_metadata,
                )
                client.flush()
        except Exception as exc:
            logger.warning("langfuse_eval: failed to create run %s: %s", run_name, exc)
    return run_name
 def post_eval_scores(
    scores: EvalScores,
    *,
    trace_id: str | None = None,
    dataset_name: str | None = None,
    run_name: str | None = None,
 ) -> None:
    """Send the numeric metrics held by *scores* to Langfuse.

    precision, recall and f1 are always sent. field_accuracy is sent only
    when field-level scores exist, and llm_judge only when a judge score is
    set. Each score is created with the given *trace_id*. *dataset_name* and
    *run_name* are accepted but not used by this function.
    """
    client = _get_langfuse()
    if client is None:
        return

    metrics: dict[str, Any] = {
        "precision": scores.precision,
        "recall": scores.recall,
        "f1": scores.f1,
    }
    # field_accuracy is only meaningful when there are field-level scores
    if scores.field_scores:
        metrics["field_accuracy"] = scores.field_accuracy
    if scores.llm_judge_score is not None:
        metrics["llm_judge"] = scores.llm_judge_score

    label = f"{scores.fixture_name} | {scores.model} | {scores.prompt_variant}"
    for metric_name, metric_value in metrics.items():
        # A failure on one score must not stop the remaining ones.
        try:
            client.create_score(
                name=metric_name,
                value=metric_value,
                trace_id=trace_id,
                data_type="NUMERIC",
                comment=label,
            )
        except Exception as exc:
            logger.warning("langfuse_eval: failed to post score %s: %s", metric_name, exc)
    client.flush()
    logger.info(
        "langfuse_eval: posted %d scores for %s/%s/%s",
        len(metrics), scores.fixture_name, scores.model, scores.prompt_variant,
    )
 def log_eval_trace(
    *,
    fixture_name: str,
    model: str,
    prompt_variant: str,
    prompt_template: str,
    actual_mutations: list[dict],
    scores_summary: dict[str, Any],
    step1_results: list[dict] | None = None,
    dataset_name: str | None = None,
    run_name: str | None = None,
    dataset_item_id: str | None = None,
    langfuse_prompt_names: list[str] | None = None,
 ) -> str | None:
    """Create a Langfuse trace for one eval execution and link it to a dataset run.

    Uses the SDK v4 observation API (traces are created implicitly by root
    spans). ``langfuse_prompt_names`` can contain one or two prompt names to
    link (e.g. ``["batch_file_classifier", "batch_processing"]`` for full
    mode). Each prompt that can be fetched gets its own generation-type
    observation for per-version metrics tracking.

    The trace output holds the scores summary, the step-1 classifications
    (if any) and at most the first 50 mutations. Linking to a dataset run is
    attempted only when ``dataset_name``, ``run_name`` and ``dataset_item_id``
    are all provided.

    Returns the trace_id, or None if Langfuse is unavailable or trace
    creation fails (the failure is logged as a warning).
    """
    lf = _get_langfuse()
    if lf is None:
        return None
    try:
        from langfuse import propagate_attributes

        # Fetch prompt objects for linking; a missing prompt is skipped.
        prompt_objs: list[tuple[str, Any]] = []
        for pname in (langfuse_prompt_names or []):
            try:
                obj = lf.get_prompt(name=pname, cache_ttl_seconds=300)
                prompt_objs.append((pname, obj))
                logger.info("langfuse_eval: linked prompt '%s' (type=%s)", pname, type(obj).__name__)
            except Exception as exc:
                logger.warning("langfuse_eval: prompt '%s' not found — %s", pname, exc)

        # Build trace output dict
        trace_output: dict[str, Any] = {"scores": scores_summary}
        if step1_results:
            trace_output["classifications"] = step1_results
        if actual_mutations:
            trace_output["mutations"] = actual_mutations[:50]

        with propagate_attributes(
            trace_name=f"eval-{fixture_name}",
            metadata={
                "eval": "true",
                "fixture": fixture_name,
                "model": model,
                "prompt_variant": prompt_variant,
            },
            tags=["eval", f"model:{model}", f"variant:{prompt_variant}"],
        ):
            # Root span for the eval run
            span = lf.start_observation(name=f"eval-{fixture_name}")
            try:
                span.update(
                    input={
                        "prompt_template": prompt_template,
                        "model": model,
                        "prompt_variant": prompt_variant,
                    },
                    output=trace_output,
                )
                trace_id = span.trace_id
                # Create a generation-type observation per linked prompt
                for pname, pobj in prompt_objs:
                    gen = lf.start_observation(
                        name=f"prompt-{pname}",
                        prompt=pobj,
                        as_type="generation",
                    )
                    gen.end()
                # Link to dataset run if available
                if dataset_name and run_name and dataset_item_id:
                    try:
                        dataset = lf.get_dataset(dataset_name)
                        for item in dataset.items:
                            if item.id == dataset_item_id:
                                item.link(span, run_name)
                                break
                    except Exception as exc:
                        logger.warning("langfuse_eval: failed to link trace to dataset run: %s", exc)
            finally:
                # End the root span even when a step above raises, so a
                # failed run does not leave it open.
                span.end()
        lf.flush()
        return trace_id
    except Exception as exc:
        logger.warning("langfuse_eval: failed to create eval trace: %s", exc)
        return None
--- a/services/batch-agent/eval/mock_executor.py
+++ b/services/batch-agent/eval/mock_executor.py
@@ -1,258 +0,0 @@
 """Mock executor — intercepts execute_on_client for offline E2E testing.
 Patches ``execute_on_client`` at all usage sites so agent pipeline runs don't
 require a live Electron client or Redis.  Instead:
 - **Filesystem actions** (list_directory, read_file_content, get_file_metadata)
  are served from local fixture files on disk.
 - **Read actions** (select, get) return preseeded records from an in-memory
  store provided by the test fixture.
 - **Write actions** (insert, update, delete) are captured as *mutations* and
  stored for later comparison against expected results.
 """
 from __future__ import annotations
 import json
 import os
 import time
 import uuid
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Any
 from contextlib import contextmanager, asynccontextmanager
 from unittest.mock import AsyncMock, patch
@dataclass
class Mutation:
    """One write operation captured by the mock executor."""

    # Kind of write: "insert", "update" or "delete".
    action: str
    # Name of the table the write targeted.
    table: str
    # Payload recorded for the write.
    data: dict[str, Any]
    # Creation time from time.time(), filled in automatically.
    timestamp: float = field(default_factory=time.time)
 # ── Fake DB helpers (used to bypass async_session in full mode) ───────
 class _FakeRow:
    """Mimics an AgentRunLog row returned by SQLAlchemy.

    Scalar fields keep class-level defaults. ``errors`` is created per
    instance so that appending to one row's list never leaks into another
    row. Attributes can be assigned freely, as on a plain object.
    """

    id = 0
    status = "running"
    items_processed = 0
    items_created = 0
    completed_at = None

    def __init__(self) -> None:
        # A class-level ``[]`` would be one list shared by every instance.
        self.errors: list[str] = []
 class _FakeResult:
    """Stand-in for a SQLAlchemy ``Result`` exposing ``scalar_one_or_none``."""

    def __init__(self, row: _FakeRow) -> None:
        # The single row handed back by every lookup.
        self._row = row

    def scalar_one_or_none(self) -> _FakeRow:
        """Return the row given at construction."""
        wrapped_row = self._row
        return wrapped_row
@dataclass
class MockExecutor:
    """In-memory executor that replaces the Redis-based tool round-trip.

    Parameters
    ----------
    fixture_dir : Path
        Directory containing sample files for filesystem tool calls.
    seed_records : dict[str, list[dict]]
        Pre-existing records per table, e.g. ``{"tasks": [...], "projects": [...]}``.
        The executor returns these for ``select`` / ``get`` actions and auto-updates
        them on ``insert`` / ``update`` / ``delete`` so subsequent selects reflect changes.
    """

    fixture_dir: Path
    seed_records: dict[str, list[dict]] = field(default_factory=dict)
    # Every write seen so far, in call order.
    mutations: list[Mutation] = field(default_factory=list)
    # Last id handed out by ``_insert``; incremented before each use.
    _id_counter: int = field(default=1000, repr=False)

    # ── Public API ───────────────────────────────────────────────────
    def reset(self) -> None:
        """Clear recorded mutations; ``seed_records`` is not touched."""
        self.mutations.clear()

    def get_mutations(self, *, table: str | None = None, action: str | None = None) -> list[Mutation]:
        """Return recorded mutations, optionally filtered by table and/or action."""
        result = self.mutations
        if table:
            result = [m for m in result if m.table == table]
        if action:
            result = [m for m in result if m.action == action]
        return result

    def created_records(self, table: str) -> list[dict]:
        """Return data dicts of all inserts into *table*."""
        return [m.data for m in self.mutations if m.table == table and m.action == "insert"]

    def updated_records(self, table: str) -> list[dict]:
        """Return data dicts of all updates to *table*."""
        return [m.data for m in self.mutations if m.table == table and m.action == "update"]

    # ── Context manager for patching ──────────────────────────────
    @contextmanager
    def patch(self):
        """Patch execute_on_client and the DB session at all usage sites.

        Yields the ``AsyncMock`` that stands in for ``execute_on_client``.
        Every patch that was started is stopped on exit, including when a
        later patch fails to start, so a bad target cannot leave earlier
        patches active.
        """
        mock_fn = AsyncMock(side_effect=self._handle)
        targets = [
            "shared.ws_context.execute_on_client",
            "app.agent_runner.execute_on_client",
            "app.agents.filesystem_agent.execute_on_client",
        ]
        # Mock async_session so run_local_agent / _finalize_run skip real DB
        fake_row = _FakeRow()
        fake_db = AsyncMock()
        fake_db.commit = AsyncMock()
        fake_db.refresh = AsyncMock()
        fake_db.execute = AsyncMock(return_value=_FakeResult(fake_row))
        fake_db.add = lambda obj: None  # noqa: ARG005

        @asynccontextmanager
        async def _fake_session():
            yield fake_db

        patches = [patch(t, new=mock_fn) for t in targets]
        patches.append(patch("app.agent_runner.async_session", _fake_session))
        # Track what actually started so cleanup only stops those patches.
        started = []
        try:
            for p in patches:
                p.start()
                started.append(p)
            yield mock_fn
        finally:
            for p in reversed(started):
                p.stop()

    # ── Internal dispatch ─────────────────────────────────────────
    async def _handle(
        self,
        action: str,
        table: str | None = None,
        data: dict[str, Any] | None = None,
        filters: dict[str, Any] | None = None,
        vector: list[float] | None = None,
        limit: int | None = None,
    ) -> dict[str, Any]:
        """Route one ``execute_on_client`` call to the matching handler.

        ``vector`` and ``limit`` are accepted but ignored. Unknown actions
        return ``{"error": ...}`` instead of raising.
        """
        # Filesystem
        if action == "list_directory":
            return self._list_directory(data or {})
        if action == "read_file_content":
            return self._read_file(data or {})
        if action == "get_file_metadata":
            return self._get_file_metadata(data or {})
        # CRUD
        if action == "select":
            return self._select(table or "", filters)
        if action == "get":
            return self._get(table or "", data or {})
        if action == "insert":
            return self._insert(table or "", data or {})
        if action == "update":
            return self._update(table or "", data or {})
        if action == "delete":
            return self._delete(table or "", data or {})
        # Vector (no-op for eval)
        if action in ("vector_upsert", "vector_search"):
            return {"rows": []}
        return {"error": f"Unknown action: {action}"}

    # ── Filesystem handlers ───────────────────────────────────────
    def _list_directory(self, data: dict) -> dict:
        """List the children of ``data["path"]`` (relative to fixture_dir), sorted."""
        rel_path = data.get("path", "")
        # Leading slashes are dropped so the path always resolves under fixture_dir.
        abs_path = self.fixture_dir / rel_path.lstrip("/\\")
        if not abs_path.is_dir():
            return {"entries": []}
        entries: list[dict] = []
        for child in sorted(abs_path.iterdir()):
            entry_type = "directory" if child.is_dir() else "file"
            # Return paths relative to fixture_dir but with the original prefix
            entry_path = rel_path.rstrip("/\\") + "/" + child.name
            entries.append({
                "name": child.name,
                "path": entry_path,
                "type": entry_type,
            })
        return {"entries": entries}

    def _read_file(self, data: dict) -> dict:
        """Return the UTF-8 text of a fixture file, or empty content plus an error."""
        rel_path = data.get("path", "")
        abs_path = self.fixture_dir / rel_path.lstrip("/\\")
        if not abs_path.is_file():
            return {"content": "", "error": f"File not found: {rel_path}"}
        # Undecodable bytes are replaced rather than raising.
        return {"content": abs_path.read_text(encoding="utf-8", errors="replace")}

    def _get_file_metadata(self, data: dict) -> dict:
        """Return size, timestamps (in milliseconds) and type for a fixture path."""
        rel_path = data.get("path", "")
        abs_path = self.fixture_dir / rel_path.lstrip("/\\")
        if not abs_path.exists():
            return {"error": f"Not found: {rel_path}"}
        stat = abs_path.stat()
        return {
            "path": rel_path,
            "size": stat.st_size,
            "modifiedAt": int(stat.st_mtime * 1000),
            "createdAt": int(stat.st_ctime * 1000),
            "isDirectory": abs_path.is_dir(),
        }

    # ── CRUD handlers ─────────────────────────────────────────────
    def _select(self, table: str, filters: dict | None) -> dict:
        """Return rows of *table* that equal every non-None filter value."""
        rows = list(self.seed_records.get(table, []))
        if filters:
            rows = [
                r for r in rows
                if all(r.get(k) == v for k, v in filters.items() if v is not None)
            ]
        return {"rows": rows}

    def _get(self, table: str, data: dict) -> dict:
        """Return the row whose ``id`` equals ``data["id"]``, or ``{"row": None}``."""
        record_id = data.get("id", "")
        rows = self.seed_records.get(table, [])
        for r in rows:
            if r.get("id") == record_id:
                return {"row": r}
        return {"row": None}

    def _insert(self, table: str, data: dict) -> dict:
        """Store *data* under a freshly generated string id and record the insert."""
        self._id_counter += 1
        # The generated id overrides any "id" supplied in data.
        record = {**data, "id": str(self._id_counter)}
        # Add to seed so subsequent selects can find it
        self.seed_records.setdefault(table, []).append(record)
        self.mutations.append(Mutation(action="insert", table=table, data=record))
        return {"row": record}

    def _update(self, table: str, data: dict) -> dict:
        """Merge *data* into the row with the same id and record the update.

        None and empty-string values are skipped when merging. If no row
        matches, the update is still recorded with *data* as given.
        """
        record_id = data.get("id", "")
        rows = self.seed_records.get(table, [])
        for r in rows:
            if r.get("id") == record_id:
                r.update({k: v for k, v in data.items() if v is not None and v != ""})
                # Record a snapshot so later edits to the row don't alter it.
                self.mutations.append(Mutation(action="update", table=table, data=dict(r)))
                return {"row": r}
        # Record not found — still log the mutation
        self.mutations.append(Mutation(action="update", table=table, data=data))
        return {"row": data}

    def _delete(self, table: str, data: dict) -> dict:
        """Drop rows with the given id and record the delete; always reports success."""
        record_id = data.get("id", "")
        rows = self.seed_records.get(table, [])
        self.seed_records[table] = [r for r in rows if r.get("id") != record_id]
        self.mutations.append(Mutation(action="delete", table=table, data={"id": record_id}))
        return {"deleted": True}
--- a/services/batch-agent/eval/requirements.txt
+++ b/services/batch-agent/eval/requirements.txt
@@ -1,2 +0,0 @@
 # Extra dependencies for the eval harness (on top of the service requirements.txt)
 pyyaml>=6.0.0
--- a/services/batch-agent/eval/runner.py
+++ b/services/batch-agent/eval/runner.py
@@ -1,545 +0,0 @@
 """Eval runner — orchestrates fixture → mock → agent pipeline → scoring.
 Supports three eval modes:
 - **step1**: Test classification prompt only (``_STEP1_SYSTEM_PROMPT``).
  Calls the LLM with fixture-provided ``domain_definitions`` and
  ``projects_list`` and compares output against ``expected_classification``.
 - **step2**: Test processing prompt only (``_PROCESSING_SYSTEM_PROMPT``).
  Compiles the prompt with fixture-provided ``existing_context``,
  ``project_context``, ``data_types``, and ``custom_prompt_section``,
  then runs the tool-calling loop.  Mutations are scored against
  ``expected`` records.
 - **full**: Run ``run_local_agent()`` end-to-end (both steps).
  Scored on both classification and extraction.
 """
 from __future__ import annotations
 import copy
 import json
 import logging
 import time
 import uuid
 from typing import Any
 from eval.config import EvalFixture, ExpectedClassification
 from eval.mock_executor import MockExecutor
 from eval.scorer import (
    EvalScores,
    FieldScore,
    compute_precision_recall,
    llm_judge_score,
    score_field_match,
 )
 from eval import langfuse_eval
 logger = logging.getLogger(__name__)
 # ── Step 1 runner ─────────────────────────────────────────────────────────
 async def _run_step1(
    fixture: EvalFixture,
    model: str,
    mock: MockExecutor,
 ) -> list[dict[str, Any]]:
    """Classify every non-empty file under ``fixture.directory`` (step 1 only).

    The directory is scanned recursively through the mock executor; each
    file's content is read through the mock and handed to
    ``_classify_file``.  Files whose content is empty or whitespace-only
    are skipped.  Progress is printed to stdout as files are classified.
    *model* is not referenced in this function.

    Returns:
        One dict per classified file:
        ``{file, project_id, domains, new_project_name}``.
    """
    from app.agent_runner import _classify_file
    # Project id → display name, used only for the progress output.
    name_by_id: dict[str, str] = {}
    for project in fixture.projects_list:
        name_by_id[project.get("id", "")] = project.get("name", "")
    paths = await _scan_fixture_files(mock, fixture.directory)
    total = len(paths)
    print(f"\n  Scanning {total} files in {fixture.directory}\n")
    classified: list[dict[str, Any]] = []
    # The counter covers every scanned file, including ones skipped below.
    for position, path in enumerate(paths, start=1):
        read_result = await mock._handle(
            action="read_file_content",
            data={"path": path},
        )
        text: str = read_result.get("content", "")
        if not text.strip():
            continue
        project_id, domains, new_name = await _classify_file(
            file_path=path,
            file_content=text,
            projects=fixture.projects_list,
            config_data_types=fixture.data_types,
            custom_system_prompt=fixture.custom_step1_prompt or None,
        )
        # rsplit returns the whole string when there is no "/" in the path.
        basename = path.rsplit("/", 1)[-1]
        label = name_by_id.get(project_id, new_name or "?")
        print(f"  [{position}/{total}] {basename}  →  {project_id} ({label})  {domains}")
        classified.append({
            "file": path,
            "project_id": project_id,
            "domains": domains,
            "new_project_name": new_name,
        })
    return classified
 async def _scan_fixture_files(mock: MockExecutor, directory: str) -> list[str]:
    """Recursively list all files under *directory* via the mock executor."""
    files: list[str] = []
    async def _walk(path: str) -> None:
        result = await mock._handle(action="list_directory", data={"path": path})
        for entry in result.get("entries", []):
            if entry.get("type") == "directory":
                await _walk(entry["path"])
            elif entry.get("type") == "file":
                files.append(entry["path"])
    await _walk(directory)
    return sorted(files)
 def _score_step1(
    fixture: EvalFixture,
    results: list[dict[str, Any]],
 ) -> tuple[float, float, float, str]:
    """Score step-1 results. Returns (precision, recall, f1, reasoning).
    Files with expected classifications are scored (OK/FAIL).
    Files without expectations are shown as informational (INFO).
    """
    if not fixture.expected_classification:
        return 0.0, 0.0, 0.0, "No expected classifications"
    # Build project name lookup
    proj_names: dict[str, str] = {
        p.get("id", ""): p.get("name", "") for p in fixture.projects_list
    }
    proj_names["new"] = "(new project)"
    def _proj_label(pid: str, new_name: str | None = None) -> str:
        name = proj_names.get(pid, "?")
        if pid == "new" and new_name:
            return f"new → \"{new_name}\""
        return f"{pid} ({name})" if name and name != "?" else pid
    def _short_file(path: str) -> str:
        """Use just the filename for cleaner display."""
        return path.rsplit("/", 1)[-1] if "/" in path else path
    expected_files = {ec.file for ec in fixture.expected_classification}
    total = len(fixture.expected_classification)
    matched = 0
    scored_lines: list[str] = []
    info_lines: list[str] = []
    # Score expected files
    for ec in fixture.expected_classification:
        actual = next((r for r in results if r["file"] == ec.file), None)
        fname = _short_file(ec.file)
        if actual is None:
            scored_lines.append(f"  MISS  {fname}")
            scored_lines.append(f"          expected: {_proj_label(ec.project_id)}")
            continue
        pid_ok = actual["project_id"] == ec.project_id
        domains_ok = set(actual["domains"]) == set(ec.domains) if ec.domains else True
        if pid_ok and domains_ok:
            matched += 1
            scored_lines.append(f"  OK    {fname}")
            scored_lines.append(f"          project: {_proj_label(actual['project_id'])}")
            scored_lines.append(f"          domains: {actual['domains']}")
        else:
            scored_lines.append(f"  FAIL  {fname}")
            if not pid_ok:
                scored_lines.append(f"          project: {_proj_label(actual['project_id'])}  (expected: {_proj_label(ec.project_id)})")
            else:
                scored_lines.append(f"          project: {_proj_label(actual['project_id'])}")
            if not domains_ok:
                scored_lines.append(f"          domains: {actual['domains']}  (expected: {ec.domains})")
            else:
                scored_lines.append(f"          domains: {actual['domains']}")
    # Show unscored files
    for r in results:
        if r["file"] not in expected_files:
            fname = _short_file(r["file"])
            proj = _proj_label(r["project_id"], r.get("new_project_name"))
            info_lines.append(f"  ·     {fname}")
            info_lines.append(f"          project: {proj}  |  domains: {r['domains']}")
    precision = matched / total if total > 0 else 0.0
    recall = precision
    f1 = precision
    parts: list[str] = []
    if scored_lines:
        parts.append(f"Scored ({matched}/{total}):")
        parts.extend(scored_lines)
    if info_lines:
        parts.append(f"\nOther files ({len(info_lines) // 2}):")
        parts.extend(info_lines)
    return precision, recall, f1, "\n".join(parts)
 # ── Step 2 runner ─────────────────────────────────────────────────────────
 async def _run_step2(
    fixture: EvalFixture,
    model: str,
    mock: MockExecutor,
 ) -> None:
    """Run the step-2 processing prompt over the files of the fixture directory.

    ``_PROCESSING_SYSTEM_PROMPT`` is compiled once with the fixture's
    variables; the tool-calling loop then runs once per file.  Only the
    direct entries of ``fixture.directory`` are listed.  An entry is skipped
    when it is not a file, when ``fixture.file_extensions`` is set and does
    not contain the entry's extension (text after the last ``"."``, or
    ``""`` without one), or when its content is empty or whitespace-only.
    Nothing is returned; *model* is not referenced in this function.
    """
    from app.agent_runner import (
        _PROCESSING_SYSTEM_PROMPT,
        _build_processing_tools,
        _run_agent_with_tools,
        _MAX_PROCESSING_STEPS,
    )
    from app import tracing
    # Compile the processing prompt with fixture variables
    prompt_variables = {
        "existing_context": fixture.existing_context,
        "project_context": fixture.project_context,
        "data_types": ", ".join(fixture.data_types),
        "custom_prompt_section": fixture.custom_prompt_section,
    }
    system_prompt = tracing.compile_prompt(
        "batch_processing",
        fallback=_PROCESSING_SYSTEM_PROMPT,
        variables=prompt_variables,
    )
    tools = _build_processing_tools(fixture.data_types)
    # Scan files in the fixture directory
    listing = await mock._handle(
        action="list_directory",
        data={"path": fixture.directory},
    )
    for item in listing.get("entries", []):
        if item.get("type") != "file":
            continue
        # Filter by extension if specified
        allowed = fixture.file_extensions
        if allowed:
            _, dot, suffix = item["name"].rpartition(".")
            if (suffix if dot else "") not in allowed:
                continue
        read_result = await mock._handle(
            action="read_file_content",
            data={"path": item["path"]},
        )
        text: str = read_result.get("content", "")
        if not text.strip():
            continue
        message = (
            f"Process this file and extract relevant information.\n\n"
            f"File: {item['path']}\n\nContent:\n{text}"
        )
        await _run_agent_with_tools(
            system_prompt=system_prompt,
            user_message=message,
            tools=tools,
            max_steps=_MAX_PROCESSING_STEPS,
        )
 # ── Full runner ───────────────────────────────────────────────────────────
 async def _run_full(
    fixture: EvalFixture,
    model: str,
    mock: MockExecutor,
    user_id: str,
 ) -> None:
    """Run ``run_local_agent`` for the fixture with the mock executor patched in.

    A synthetic ``agent_trigger`` payload is built from the fixture and
    passed to ``run_local_agent`` together with *user_id*.  *model* is not
    referenced in this function.
    """
    from app.agent_runner import run_local_agent
    run_context = {
        "agent_id": f"eval-{fixture.name}",
        "run_id": None,
    }
    target_dir = fixture.directory
    trigger_data: dict[str, Any] = {
        "type": "agent_trigger",
        "directory": target_dir,
        "directory_paths": [target_dir],
        "data_types": fixture.data_types,
        "file_extensions": fixture.file_extensions,
        "prompt_template": fixture.custom_prompt_section,
        "device_id": "eval-harness",
        "run_context": run_context,
    }
    with mock.patch():
        await run_local_agent(user_id, trigger_data)
 # ── Scoring helpers ───────────────────────────────────────────────────────
 def _score_mutations(
    fixture: EvalFixture,
    mock: MockExecutor,
 ) -> tuple[list[FieldScore], float, float, float, int, int]:
    """Score the mock's recorded mutations against the fixture's expected records.

    Expected records are grouped by table.  For every table that has either
    expectations or recorded mutations, the created and updated records
    reported by the mock are matched against the expectations with
    ``score_field_match``.  Counts are summed over all tables before
    precision, recall and F1 are computed.

    Returns:
        ``(field_scores, precision, recall, f1, extra, missing)``
    """
    all_field_scores: list[FieldScore] = []
    total_expected = 0
    total_actual = 0
    total_matched = 0
    total_extra = 0
    total_missing = 0
    expected_by_table: dict[str, list[dict]] = {}
    for rec in fixture.expected:
        expected_by_table.setdefault(rec.table, []).append(rec.fields)
    # Sorted so the per-record scores come out in the same order on every
    # run; iterating the raw set follows string-hash order, which can differ
    # from one interpreter process to the next.
    tables = sorted(set(expected_by_table) | {m.table for m in mock.mutations})
    for table in tables:
        expected_records = expected_by_table.get(table, [])
        actual_records = mock.created_records(table) + mock.updated_records(table)
        field_scores, extra, missing = score_field_match(expected_records, actual_records, table)
        all_field_scores.extend(field_scores)
        # An expected record counts as matched when a best match was assigned.
        matched = sum(1 for s in field_scores if s.best_match is not None)
        total_expected += len(expected_records)
        total_actual += len(actual_records)
        total_matched += matched
        total_extra += extra
        total_missing += missing
    precision, recall, f1 = compute_precision_recall(total_expected, total_actual, total_matched)
    return all_field_scores, precision, recall, f1, total_extra, total_missing
 # ── Main entry point ──────────────────────────────────────────────────────
 async def run_single_eval(
    fixture: EvalFixture,
    model: str,
    *,
    use_llm_judge: bool = True,
    judge_model: str = "gpt-4o-mini",
 ) -> EvalScores:
    """Execute one eval run for a fixture + model.  Mode is read from the fixture.

    Steps:
      1. Build a ``MockExecutor`` over a deep copy of the fixture's seed
         records and temporarily point ``settings.LLM_MODEL`` at *model*.
      2. Run the pipeline selected by ``fixture.mode`` (``step1``, ``step2``
         or ``full``).  Any exception is logged with its traceback and does
         not propagate; scoring then uses whatever was captured so far.
      3. Score: classification accuracy for ``step1``; mutation matching for
         the other modes, plus the optional LLM judge when the fixture has
         expected records.
      4. Report the run and its scores through ``langfuse_eval``.

    Args:
        fixture: The eval fixture to run.
        model: Model name written to ``settings.LLM_MODEL`` for the run.
        use_llm_judge: Whether to call the LLM judge (step2/full only).
        judge_model: Model name passed to the judge.

    Returns:
        The ``EvalScores`` for this run.
    """
    from shared.config import settings
    from shared.ws_context import set_current_user, clear_current_user
    # Work on a private copy so the run cannot modify the fixture's seed data.
    seed = copy.deepcopy(fixture.seed_records)
    mock = MockExecutor(
        fixture_dir=fixture.fixture_path.parent,
        seed_records=seed,
    )
    original_model = settings.LLM_MODEL
    settings.LLM_MODEL = model
    eval_user_id = str(uuid.uuid4())
    logger.info(
        "eval: starting %s | mode=%s | model=%s",
        fixture.name, fixture.mode, model,
    )
    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # wall-clock adjustments.
    start_time = time.perf_counter()
    step1_results: list[dict[str, Any]] = []
    step1_reasoning = ""
    try:
        set_current_user(eval_user_id)
        if fixture.mode == "step1":
            with mock.patch():
                step1_results = await _run_step1(fixture, model, mock)
        elif fixture.mode == "step2":
            with mock.patch():
                await _run_step2(fixture, model, mock)
        elif fixture.mode == "full":
            with mock.patch():
                # Step 1 — classification (independent from run_local_agent)
                if fixture.expected_classification:
                    step1_results = await _run_step1(fixture, model, mock)
            # Step 2 — full pipeline (run_local_agent handles both steps)
            await _run_full(fixture, model, mock, eval_user_id)
        else:
            logger.warning(
                "eval: unknown mode %r for %s, no pipeline was run",
                fixture.mode, fixture.name,
            )
    except Exception as exc:
        # Best-effort: keep going so partial results are still scored, but
        # keep the traceback so the failure can be diagnosed.
        logger.error(
            "eval: pipeline failed for %s/%s: %s", fixture.name, model, exc,
            exc_info=True,
        )
    finally:
        settings.LLM_MODEL = original_model
        clear_current_user()
    elapsed = time.perf_counter() - start_time
    logger.info("eval: completed in %.1fs — %d mutations", elapsed, len(mock.mutations))
    # ── Score ─────────────────────────────────────────────────────
    # Set only for full mode with expected classifications; reused below
    # when posting to Langfuse so step 1 is scored once.
    classification_scores = None
    if fixture.mode == "step1":
        s1_precision, s1_recall, s1_f1, step1_reasoning = _score_step1(fixture, step1_results)
        scores = EvalScores(
            fixture_name=fixture.name,
            model=model,
            prompt_variant=fixture.mode,
            precision=s1_precision,
            recall=s1_recall,
            f1=s1_f1,
            llm_judge_reasoning=step1_reasoning,
        )
    else:
        # step2 or full — score mutations
        field_scores, precision, recall, f1, extra, missing = _score_mutations(fixture, mock)
        scores = EvalScores(
            fixture_name=fixture.name,
            model=model,
            prompt_variant=fixture.mode,
            field_scores=field_scores,
            precision=precision,
            recall=recall,
            f1=f1,
            extra_records=extra,
            missing_records=missing,
        )
        # Add step1 classification scores for full mode
        if fixture.mode == "full" and fixture.expected_classification:
            classification_scores = _score_step1(fixture, step1_results)
            step1_reasoning = classification_scores[3]
            scores.llm_judge_reasoning = f"Step1 classification:\n{step1_reasoning}"
        # Optional LLM judge for extraction quality
        if use_llm_judge and fixture.expected:
            all_expected = [r.fields for r in fixture.expected]
            all_actual = [m.data for m in mock.mutations if m.action in ("insert", "update")]
            judge_score, reasoning = await llm_judge_score(
                all_expected, all_actual, judge_model=judge_model,
            )
            scores.llm_judge_score = judge_score
            # Keep the classification report (if any) ahead of the judge's text.
            if step1_reasoning:
                scores.llm_judge_reasoning += f"\n\nLLM judge:\n{reasoning}"
            else:
                scores.llm_judge_reasoning = reasoning
    # ── Report to Langfuse ────────────────────────────────────────
    prompt_names = {
        "step1": ["batch_file_classifier"],
        "step2": ["batch_processing"],
        "full": ["batch_file_classifier", "batch_processing"],
    }.get(fixture.mode, ["batch_processing"])
    trace_id = langfuse_eval.log_eval_trace(
        fixture_name=fixture.name,
        model=model,
        prompt_variant=fixture.mode,
        prompt_template=fixture.custom_prompt_section or "(default)",
        actual_mutations=[{"action": m.action, "table": m.table, "data": m.data} for m in mock.mutations],
        scores_summary=scores.summary(),
        step1_results=step1_results or None,
        langfuse_prompt_names=prompt_names,
    )
    if trace_id:
        langfuse_eval.post_eval_scores(scores, trace_id=trace_id)
        # For full mode, post classification scores separately
        if classification_scores is not None:
            s1_p, s1_r, s1_f1, _ = classification_scores
            try:
                from langfuse import get_client
                lf = get_client()
            except Exception as exc:
                # Reporting is best-effort; a missing or broken client must
                # not fail the eval, but it should be visible in the log.
                logger.warning(
                    "eval: Langfuse client unavailable, classification scores not posted: %s",
                    exc,
                )
                lf = None
            if lf:
                for name, value in [
                    ("classification_precision", s1_p),
                    ("classification_recall", s1_r),
                    ("classification_f1", s1_f1),
                ]:
                    try:
                        lf.create_score(
                            name=name,
                            value=value,
                            trace_id=trace_id,
                            data_type="NUMERIC",
                            comment=f"{fixture.name} | {model} | full",
                        )
                    except Exception as exc:
                        logger.warning("eval: failed to post %s to Langfuse: %s", name, exc)
    return scores
 async def run_fixture_eval(
    fixture: EvalFixture,
    models: list[str],
    *,
    use_llm_judge: bool = True,
    judge_model: str = "gpt-4o-mini",
 ) -> list[EvalScores]:
    """Evaluate *fixture* once per model and return the scores in model order.

    The fixture is first synced to the Langfuse dataset, then
    ``run_single_eval`` is awaited for each model in turn.
    """
    langfuse_eval.sync_fixture_to_dataset(fixture)
    # Each call is awaited inside the comprehension, so the models still run
    # one after another rather than concurrently.
    return [
        await run_single_eval(
            fixture, candidate,
            use_llm_judge=use_llm_judge,
            judge_model=judge_model,
        )
        for candidate in models
    ]
 def print_results(results: list[EvalScores]) -> None:
    """Print the eval results to stdout: a summary table, then the reasoning.

    One table row is printed per result.  Field accuracy shows ``--`` when a
    result has no field scores, and the LLM column shows ``--`` when there
    is no judge score.  After the table, each result with non-empty
    ``llm_judge_reasoning`` gets its own titled section.
    """
    if not results:
        print("\nNo eval results.")
        return
    width = 90
    heavy_rule = "=" * width
    light_rule = "─" * width
    placeholder = "  --"
    print("\n" + heavy_rule)
    print(f"{'Fixture':<25} {'Mode':<6} {'Model':<25} {'P':>6} {'R':>6} {'F1':>6} {'FA':>6} {'LLM':>6}")
    print("-" * width)
    for row in results:
        if row.llm_judge_score is not None:
            llm_str = f"{row.llm_judge_score:.2f}"
        else:
            llm_str = placeholder
        if row.field_scores:
            fa_str = f"{row.field_accuracy:.2f}"
        else:
            fa_str = placeholder
        print(
            f"{row.fixture_name:<25} {row.prompt_variant:<6} {row.model:<25} "
            f"{row.precision:>6.2f} {row.recall:>6.2f} {row.f1:>6.2f} "
            f"{fa_str:>6} {llm_str:>6}"
        )
    print(heavy_rule)
    for row in results:
        if not row.llm_judge_reasoning:
            continue
        print(f"\n{light_rule}")
        print(f"  {row.fixture_name}  |  {row.model}  |  {row.prompt_variant}")
        print(f"{light_rule}")
        print(row.llm_judge_reasoning)
    print()
--- a/services/batch-agent/eval/scorer.py
+++ b/services/batch-agent/eval/scorer.py
@@ -1,268 +0,0 @@
 """Scoring functions for batch agent evaluation.
 Two scoring strategies:
 1. **Field match** (``score_field_match``) — deterministic check: for each
   expected record, find the best-matching actual record and compare the
   expected fields.  Yields per-record field scores plus extra/missing
   counts; ``compute_precision_recall`` turns the totals into precision,
   recall, and F1.
 2. **LLM judge** (``llm_judge_score``) — uses a secondary LLM to semantically
   evaluate whether the actual extractions satisfy the expected intent, even
   if wording differs.  Returns a 0-1 score + reasoning.
 """
 from __future__ import annotations
 import json
 import logging
 from dataclasses import dataclass, field
 from difflib import SequenceMatcher
 from typing import Any
 from langchain_core.messages import HumanMessage, SystemMessage
 logger = logging.getLogger(__name__)
 # ── Result types ─────────────────────────────────────────────────────────
@dataclass
 class FieldScore:
    """Score for a single expected record against its best match."""
    expected: dict[str, Any]
    best_match: dict[str, Any] | None
    matched_fields: dict[str, bool]
    similarity: float  # 0-1 overall similarity
    @property
    def field_accuracy(self) -> float:
        if not self.matched_fields:
            return 0.0
        return sum(self.matched_fields.values()) / len(self.matched_fields)
@dataclass
 class EvalScores:
    """Aggregated scores for one eval run."""
    fixture_name: str
    model: str
    prompt_variant: str
    field_scores: list[FieldScore] = field(default_factory=list)
    precision: float = 0.0
    recall: float = 0.0
    f1: float = 0.0
    llm_judge_score: float | None = None
    llm_judge_reasoning: str = ""
    extra_records: int = 0  # records created but not expected
    missing_records: int = 0  # expected but not found
    @property
    def field_accuracy(self) -> float:
        if not self.field_scores:
            return 0.0
        return sum(s.field_accuracy for s in self.field_scores) / len(self.field_scores)
    def summary(self) -> dict[str, Any]:
        return {
            "fixture": self.fixture_name,
            "model": self.model,
            "prompt_variant": self.prompt_variant,
            "precision": round(self.precision, 3),
            "recall": round(self.recall, 3),
            "f1": round(self.f1, 3),
            "field_accuracy": round(self.field_accuracy, 3),
            "llm_judge_score": round(self.llm_judge_score, 3) if self.llm_judge_score is not None else None,
            "extra_records": self.extra_records,
            "missing_records": self.missing_records,
        }
 # ── Field Match Scorer ───────────────────────────────────────────────────
 def _normalize(value: Any) -> str:
    """Normalize a value for comparison."""
    if value is None:
        return ""
    return str(value).strip().lower()
 def _text_similarity(a: str, b: str) -> float:
    """Fuzzy text similarity using SequenceMatcher."""
    if not a and not b:
        return 1.0
    if not a or not b:
        return 0.0
    return SequenceMatcher(None, a.lower(), b.lower()).ratio()
 def _find_best_match(
    expected: dict[str, Any],
    actuals: list[dict[str, Any]],
 ) -> tuple[dict[str, Any] | None, float]:
    """Pick the actual record whose title/name is most similar to *expected*'s.

    The comparison key of a record is its ``"title"``, falling back to
    ``"name"``, then to ``""``.  The first record with the strictly highest
    similarity wins.

    Returns:
        ``(record, similarity)``; ``(None, 0.0)`` when *actuals* is empty or
        no record has a similarity above 0.
    """
    if not actuals:
        return None, 0.0
    def _match_key(record: dict[str, Any]) -> str:
        # Primary matching key: title or name
        return _normalize(record.get("title", record.get("name", "")))
    wanted = _match_key(expected)
    winner, top = None, 0.0
    for candidate in actuals:
        score = _text_similarity(wanted, _match_key(candidate))
        if score > top:
            winner, top = candidate, score
    return winner, top
 def _compare_fields(
    expected: dict[str, Any],
    actual: dict[str, Any],
 ) -> dict[str, bool]:
    """Check every field of *expected* against the same field of *actual*.

    String expectations are compared fuzzily on normalized text and pass at
    a similarity of 0.7 or more; every other type must be equal.  A field
    missing from *actual* is compared as ``None``.

    Returns:
        ``{field_name: matched}`` for each key of *expected*.
    """
    verdicts: dict[str, bool] = {}
    for name, want in expected.items():
        got = actual.get(name)
        if isinstance(want, str):
            # Fuzzy match for strings (threshold: 0.7)
            closeness = _text_similarity(_normalize(want), _normalize(got))
            verdicts[name] = closeness >= 0.7
        else:
            # Exact match for non-string types
            verdicts[name] = got == want
    return verdicts
 def score_field_match(
    expected_records: list[dict[str, Any]],
    actual_records: list[dict[str, Any]],
    table: str,
 ) -> tuple[list[FieldScore], int, int]:
    """Greedily pair expected records with actual records for one table.

    Expected records are processed in order.  Each takes the still-unclaimed
    actual record with the strictly highest title/name similarity, provided
    that similarity is at least 0.5; otherwise it is recorded as unmatched.
    A claimed actual record is not offered to later expectations.  *table*
    is not referenced in this function.

    Returns:
        ``(field_scores, extra_count, missing_count)`` where ``extra_count``
        is the number of unclaimed actual records and ``missing_count`` the
        number of unmatched expected records.
    """
    scores: list[FieldScore] = []
    claimed: set[int] = set()
    for wanted in expected_records:
        best_idx, best_row, best_sim = None, None, 0.0
        # Find best match among unmatched actuals
        for idx, row in enumerate(actual_records):
            if idx in claimed:
                continue
            _, sim = _find_best_match(wanted, [row])
            if sim > best_sim:
                best_idx, best_row, best_sim = idx, row, sim
        if best_row is not None and best_sim >= 0.5:
            claimed.add(best_idx)
            scores.append(FieldScore(
                expected=wanted, best_match=best_row,
                matched_fields=_compare_fields(wanted, best_row),
                similarity=best_sim,
            ))
        else:
            # Covers both "no candidates left" and "nothing similar enough".
            scores.append(FieldScore(
                expected=wanted, best_match=None, matched_fields={}, similarity=0.0,
            ))
    extra_count = len(actual_records) - len(claimed)
    missing_count = sum(1 for entry in scores if entry.best_match is None)
    return scores, extra_count, missing_count
 def compute_precision_recall(
    expected_count: int,
    actual_count: int,
    matched_count: int,
 ) -> tuple[float, float, float]:
    """Return ``(precision, recall, f1)`` for the given record counts.

    Precision is matched / actual and recall is matched / expected; either
    is 0.0 when its denominator is not positive.  F1 is their harmonic
    mean, or 0.0 when both are zero.
    """
    def _ratio(part: int, whole: int) -> float:
        return part / whole if whole > 0 else 0.0
    precision = _ratio(matched_count, actual_count)
    recall = _ratio(matched_count, expected_count)
    combined = precision + recall
    f1 = 2 * precision * recall / combined if combined > 0 else 0.0
    return precision, recall, f1
 # ── LLM Judge Scorer ─────────────────────────────────────────────────────
 # System prompt sent to the judge model by llm_judge_score().  It asks for a
 # 0-1 quality score and a short reasoning, returned as a bare JSON object,
 # which is the shape llm_judge_score() parses.
 _JUDGE_SYSTEM_PROMPT = """\
 You are an evaluation judge for a data extraction system.
 Your task is to compare the EXPECTED extractions against the ACTUAL extractions
 produced by an AI agent, and assess quality on a 0-1 scale.
 Scoring criteria:
 - 1.0: All expected records found with correct fields, no significant extras
 - 0.8: Most expected records found, minor field differences or extras
 - 0.6: Core extractions present but some missing or incorrect
 - 0.4: Partial match — several expected records missing or wrong
 - 0.2: Poor quality — most expected records missing or incorrect
 - 0.0: Complete failure — no meaningful overlap
 Consider semantic equivalence: "Send invoice" and "Email the invoice" are matches.
 Ignore field ordering and formatting differences.
 Respond with ONLY a JSON object:
 {"score": 0.85, "reasoning": "Brief explanation of the score"}
 """
 async def llm_judge_score(
    expected: list[dict[str, Any]],
    actual: list[dict[str, Any]],
    *,
    judge_model: str = "gpt-4o-mini",
 ) -> tuple[float, str]:
    """Ask a judge LLM to rate the actual extractions against the expected ones.

    Both record lists are sent as pretty-printed JSON (values that are not
    JSON-serializable are stringified).  The reply must be a JSON object,
    optionally wrapped in a Markdown code fence, with ``score`` and
    ``reasoning`` keys; a missing key defaults to ``0.0`` / ``""``.

    Returns:
        ``(score, reasoning)``.  If the call or the parsing fails, the
        failure is logged as a warning and ``(0.0, "Judge error: ...")`` is
        returned instead of raising.
    """
    from shared.llm import get_llm
    llm = get_llm(model=judge_model, temperature=0)
    expected_json = json.dumps(expected, indent=2, default=str)
    actual_json = json.dumps(actual, indent=2, default=str)
    user_content = (
        f"## Expected extractions\n```json\n{expected_json}\n```\n\n"
        f"## Actual extractions\n```json\n{actual_json}\n```"
    )
    try:
        reply = await llm.ainvoke([
            SystemMessage(content=_JUDGE_SYSTEM_PROMPT),
            HumanMessage(content=user_content),
        ])
        text = reply.content.strip()
        if text.startswith("```"):
            # Keep only what sits between the first pair of fences, then
            # drop a leading "json" language tag if there is one.
            text = text.split("```")[1]
            if text.startswith("json"):
                text = text[4:]
        verdict = json.loads(text.strip())
        score = float(verdict.get("score", 0.0))
        reasoning = str(verdict.get("reasoning", ""))
        return score, reasoning
    except Exception as exc:
        logger.warning("eval: LLM judge failed: %s", exc)
        return 0.0, f"Judge error: {exc}"
--- a/services/batch-agent/requirements.txt
+++ b/services/batch-agent/requirements.txt
@@ -1,21 +0,0 @@
 fastapi>=0.115.0
 uvicorn[standard]>=0.34.0
 gunicorn>=22.0.0
 pydantic>=2.10.0
 pydantic-settings>=2.7.0
 sqlalchemy>=2.0.0
 asyncpg>=0.30.0
 redis>=5.0.0
 cryptography>=42.0.0
 python-dotenv>=1.0.0
 langchain-core>=0.3.0
 langchain-openai>=0.3.0
 langchain-litellm>=0.3.0
 litellm>=1.50.0
 openai>=1.50.0
 httpx>=0.27.0
 langfuse>=3.0.0
 croniter>=2.0.0
 google-api-python-client>=2.130.0
 google-auth>=2.30.0
 msal>=1.28.0
--- a/services/billing/Dockerfile
+++ b/services/billing/Dockerfile
@@ -1,36 +0,0 @@
 # ── builder ──────────────────────────────────────────────────────────────────
 # Installs the service's Python dependencies under /install; the runtime stage
 # copies that tree instead of running pip itself.
 # NOTE(review): the COPY sources start at services/… and shared/, so the build
 # context is presumably the repository root — confirm against the build command.
 FROM python:3.12-slim AS builder
 WORKDIR /build
 COPY services/billing/requirements.txt ./requirements.txt
 RUN pip install --upgrade pip && \
    pip install --no-cache-dir --prefix=/install -r requirements.txt
 # ── runtime ──────────────────────────────────────────────────────────────────
 FROM python:3.12-slim AS runtime
 # Dedicated system user/group; the service runs as this user (see USER below).
 RUN addgroup --system appgroup && adduser --system --ingroup appgroup appuser
 WORKDIR /app
 # Bring in the packages installed by the builder stage.
 COPY --from=builder /install /usr/local
 # Shared module
 COPY shared/ shared/
 # Service source
 COPY services/billing/app/ app/
 RUN chown -R appuser:appgroup /app
 USER appuser
 EXPOSE 8000
 # Billing is lightweight — single worker is fine
 # gunicorn serves app.main:app with the uvicorn worker class on port 8000,
 # using one worker and a 30-second worker timeout.
 CMD ["gunicorn", "app.main:app", \
     "-k", "uvicorn.workers.UvicornWorker", \
     "--bind", "0.0.0.0:8000", \
     "--workers", "1", \
     "--timeout", "30"]
--- a/services/billing/README.md
+++ b/services/billing/README.md
@@ -1,15 +0,0 @@
 # Billing Service
 Owns: Stripe integration, tier management, subscription CRUD.
 ## Tables owned (write)
 - `subscriptions`
 ## Endpoints
 - `POST /billing/checkout`
 - `POST /billing/webhook` (Stripe, no JWT auth)
 - `GET /billing/subscription`
 - `DELETE /billing/subscription`
 - `GET /billing/tier/{user_id}` (internal, service-to-service)
 ## Redis channels
 - Publish: `tier:changed:{user_id}` on tier change
--- a/services/billing/app/main.py
+++ b/services/billing/app/main.py
@@ -1,53 +0,0 @@
 """Billing Service — FastAPI application.
 Owns: Stripe checkout/webhook, subscription management, tier feature matrix,
 quota enforcement.
 Downstream services query this service (or read the user's tier from
 the X-User-Tier header injected by Traefik) for billing decisions.
 The webhook endpoint is exposed WITHOUT ForwardAuth so Stripe can reach it.
 """
 from __future__ import annotations
 import logging
 import sys
 from contextlib import asynccontextmanager
 from pathlib import Path
 from typing import AsyncGenerator
 # Ensure the repo root is on sys.path so "shared" is importable in local dev.
 # parents[3] is three directory levels above the directory containing this
 # file.  The path is prepended only when it is not already present.
 _repo_root = str(Path(__file__).resolve().parents[3])
 if _repo_root not in sys.path:
    sys.path.insert(0, _repo_root)
 from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
 from app.routes import router
 logger = logging.getLogger(__name__)
@asynccontextmanager
 async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
    """Log a message when the service starts and another when it stops.

    Passed to ``FastAPI(lifespan=...)``.  The statement before ``yield``
    runs on entering the context and the one after it on leaving.  The
    *app* argument is not used.
    """
    logger.info("billing: service started")
    yield
    logger.info("billing: service stopped")
 # Application object; `lifespan` supplies the start/stop logging hooks.
 app = FastAPI(title="Adiuva Billing Service", lifespan=lifespan)
 # CORS: every origin and every request header is accepted; the allowed
 # methods are limited to GET, POST and DELETE.
 # NOTE(review): allow_origins=["*"] is wide open — confirm this is intended
 # outside local development.
 app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["GET", "POST", "DELETE"],
    allow_headers=["*"],
 )
 # Register the billing routes imported from app.routes.
 app.include_router(router)
@app.get("/health")
 async def health() -> dict[str, str]:
    """Return a fixed payload naming the service; no dependencies are checked."""
    return {"status": "ok", "service": "billing"}
--- a/services/billing/app/routes.py
+++ b/services/billing/app/routes.py
@@ -1,134 +0,0 @@
 """Billing routes: Stripe checkout, webhook, subscription, tier query.
 Adapted for the Billing microservice:
  - Authenticated routes use Traefik-injected headers (X-User-Id, X-User-Tier)
  - Webhook route has NO auth (Stripe signature verification only)
  - Added /tier/{user_id} for internal service-to-service tier lookups
  - Added /features/{tier} for feature matrix queries
 """
 from __future__ import annotations
 from typing import Any
 from fastapi import APIRouter, Header, HTTPException, Request, status
 from pydantic import BaseModel
 from shared.db import async_session
 from shared.schemas import BillingTier
 from app.stripe_service import stripe_service
 from app.tier_manager import tier_manager, FEATURES, RATE_LIMITS
 router = APIRouter(prefix="/billing", tags=["billing"])
 # ── Request bodies ─────────────────────────────────────────────────────
 class _CheckoutRequest(BaseModel):
    """Request body of ``POST /billing/checkout``."""
    # Tier the checkout session is created for.
    tier: BillingTier
 # ── Checkout ───────────────────────────────────────────────────────────
@router.post("/checkout")
 async def create_checkout(
    body: _CheckoutRequest,
    x_user_id: str = Header(..., alias="X-User-Id"),
 ) -> dict[str, str]:
    """Create a Stripe checkout session for a tier upgrade."""
    url = stripe_service.create_checkout_session(x_user_id, body.tier)
    return {"checkout_url": url}
 # ── Webhook (NO auth — Stripe signature only) ─────────────────────────
@router.post("/webhook")
 async def stripe_webhook(
    request: Request,
    stripe_signature: str = Header(default="", alias="Stripe-Signature"),
 ) -> dict[str, bool]:
    """Handle Stripe webhook events.
    This endpoint is exposed without ForwardAuth in Traefik config
    so Stripe can reach it directly.
    """
    payload = await request.body()
    async with async_session() as db:
        await stripe_service.handle_webhook(payload, stripe_signature, db)
    return {"ok": True}
 # ── Subscription CRUD ─────────────────────────────────────────────────
@router.get("/subscription")
 async def get_subscription(
    x_user_id: str = Header(..., alias="X-User-Id"),
    x_user_tier: str = Header("free", alias="X-User-Tier"),
 ) -> dict[str, Any]:
    """Return the current subscription info for the authenticated user."""
    async with async_session() as db:
        sub = await stripe_service.get_subscription(x_user_id, db)
    if sub is None:
        return {
            "tier": x_user_tier,
            "status": "free",
            "stripe_subscription_id": None,
            "current_period_end": None,
        }
    return sub
@router.delete("/subscription")
 async def cancel_subscription(
    x_user_id: str = Header(..., alias="X-User-Id"),
 ) -> dict[str, bool]:
    """Cancel the active subscription."""
    async with async_session() as db:
        await stripe_service.cancel_subscription(x_user_id, db)
    return {"ok": True}
 # ── Tier query (internal, service-to-service) ─────────────────────────
@router.get("/tier/{user_id}")
 async def get_user_tier(user_id: str) -> dict[str, str]:
    """Return the billing tier for a given user_id.
    Used by other services for tier lookups. Protected by Traefik
    ForwardAuth — only internal services should call this.
    """
    async with async_session() as db:
        tier = await tier_manager.get_tier(user_id, db)
    return {"user_id": user_id, "tier": tier}
 # ── Feature matrix (public, cacheable) ────────────────────────────────
@router.get("/features/{tier}")
 async def get_tier_features(tier: str) -> dict[str, Any]:
    """Return the feature matrix for a tier."""
    if tier not in FEATURES:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Unknown tier: {tier}",
        )
    return {
        "tier": tier,
        "features": FEATURES[tier],
        "rate_limit_rpm": RATE_LIMITS.get(tier, RATE_LIMITS["free"]),
    }
@router.get("/features")
 async def get_all_features() -> dict[str, Any]:
    """Return the full feature matrix for all tiers."""
    return {
        "tiers": {
            tier: {
                "features": features,
                "rate_limit_rpm": RATE_LIMITS.get(tier, RATE_LIMITS["free"]),
            }
            for tier, features in FEATURES.items()
        },
    }
--- a/services/billing/app/tier_manager.py
+++ b/services/billing/app/tier_manager.py
@@ -1,178 +0,0 @@
 """Tier manager: feature matrix and quota enforcement.
 Single source of truth for what each billing tier allows.
 Other services can query the /tier/{user_id} endpoint or rely on the
 X-User-Tier header injected by Traefik.
 """
 from __future__ import annotations
 from typing import Any
 from fastapi import HTTPException, status
 from sqlalchemy import select
 from sqlalchemy.ext.asyncio import AsyncSession
 from shared.config import settings
 from shared.models import Subscription
 from shared.schemas import BillingTier
 # Feature matrix per tier.  -1 means unlimited; 0 means disabled.
 # Boolean entries are plain on/off switches. The "cloud_storage_gb" and
 # "backup_gb" entries are read by TierManager's quota methods, which multiply
 # them by 1024 ** 3 to obtain a byte limit.
 FEATURES: dict[str, dict[str, Any]] = {
    "free": {
        "agents": 3,
        "batch_active": 2,
        "batch_runs_per_day": 5,
        "cloud_storage_gb": 0,
        "backup_gb": 0,
        "providers": 1,
        "batch_builder": False,
        "plugin_marketplace": False,
        "sso": False,
    },
    "pro": {
        "agents": -1,
        "batch_active": 10,
        "batch_runs_per_day": 50,
        "cloud_storage_gb": 5,
        "backup_gb": 5,
        "providers": -1,
        "batch_builder": False,
        "plugin_marketplace": False,
        "sso": False,
    },
    "power": {
        "agents": -1,
        "batch_active": -1,
        "batch_runs_per_day": -1,
        "cloud_storage_gb": 25,
        "backup_gb": 25,
        "providers": -1,
        "batch_builder": True,
        "plugin_marketplace": True,
        "sso": False,
    },
    "team": {
        "agents": -1,
        "batch_active": -1,
        "batch_runs_per_day": -1,
        "cloud_storage_gb": -1,
        "backup_gb": -1,
        "providers": -1,
        "batch_builder": True,
        "plugin_marketplace": True,
        "sso": True,
    },
 }
 # Requests-per-minute limit per tier.
 # Keys mirror the tiers in FEATURES; lookups elsewhere in this module fall
 # back to the "free" value for a tier that has no entry here.
 RATE_LIMITS: dict[str, int] = {
    "free": 20,
    "pro": 60,
    "power": 120,
    "team": 200,
 }
 class TierManager:
    """Centralises tier feature-gating, rate-limit lookups, and quota checks.
    All feature and quota lookups go through get_features(), so a tier that is
    not a key of FEATURES is treated like "free" by every method instead of
    raising KeyError in the quota methods only.
    """
    async def get_tier(self, user_id: str, db: AsyncSession) -> BillingTier:
        """Return the current billing tier for user_id from the DB.
        When no subscription row is found, or the stored tier is not a key of
        FEATURES, the result is "power" if settings.ENV == "dev" and "free"
        otherwise.
        """
        result = await db.execute(
            select(Subscription.tier).where(Subscription.user_id == user_id)
        )
        tier: str | None = result.scalar_one_or_none()
        if tier is None or tier not in FEATURES:
            return "power" if settings.ENV == "dev" else "free"
        return tier  # type: ignore[return-value]
    def get_features(self, tier: BillingTier) -> dict[str, Any]:
        """Return the full feature dict for a tier ("free" if tier is unknown)."""
        return FEATURES.get(tier, FEATURES["free"])
    def check_feature(self, tier: BillingTier, feature: str) -> bool:
        """Return True if tier has feature enabled.
        Missing features count as disabled, booleans are returned as-is, and
        numeric limits are enabled unless they are 0 (-1 means unlimited).
        """
        value = self.get_features(tier).get(feature)
        if value is None:
            return False
        if isinstance(value, bool):
            return value
        return value != 0
    def require_feature(self, tier: BillingTier, feature: str, tier_name: str = "") -> None:
        """Raise HTTP 403 if tier does not have feature.
        tier_name, when given, is only used to word the error detail.
        """
        if self.check_feature(tier, feature):
            return
        detail = (
            f"Feature '{feature}' requires {tier_name} tier or above."
            if tier_name
            else f"Feature '{feature}' is not available on your current tier."
        )
        raise HTTPException(status_code=status.HTTP_403_FORBIDDEN, detail=detail)
    def get_rate_limit(self, tier: BillingTier) -> int:
        """Return the requests-per-minute limit for tier ("free" limit if unknown)."""
        return RATE_LIMITS.get(tier, RATE_LIMITS["free"])
    @staticmethod
    def _fits_quota(limit_gb: int, current_bytes: int, additional_bytes: int) -> bool:
        """Return True if current + additional bytes stay within limit_gb GiB.
        A limit of -1 means unlimited. Callers handle the "disabled" value 0
        themselves before calling this helper.
        """
        if limit_gb == -1:
            return True
        return current_bytes + additional_bytes <= limit_gb * 1024 ** 3
    def _enforce_gb_quota(
        self,
        tier: BillingTier,
        feature: str,
        current_bytes: int,
        additional_bytes: int,
        unavailable_detail: str,
        exceeded_detail: str,
    ) -> None:
        """Raise HTTP 402 if the GiB quota stored under feature is disabled or exceeded."""
        limit_gb: int = self.get_features(tier)[feature]
        if limit_gb == 0:
            raise HTTPException(
                status_code=status.HTTP_402_PAYMENT_REQUIRED,
                detail=unavailable_detail,
            )
        if not self._fits_quota(limit_gb, current_bytes, additional_bytes):
            raise HTTPException(
                status_code=status.HTTP_402_PAYMENT_REQUIRED,
                detail=exceeded_detail,
            )
    def enforce_quota(
        self,
        tier: BillingTier,
        current_bytes: int = 0,
        additional_bytes: int = 0,
    ) -> None:
        """Raise HTTP 402 if the user would exceed their cloud storage quota."""
        self._enforce_gb_quota(
            tier,
            "cloud_storage_gb",
            current_bytes,
            additional_bytes,
            unavailable_detail=f"Cloud storage is not available on the '{tier}' tier",
            exceeded_detail=f"Storage quota exceeded for tier '{tier}'",
        )
    def enforce_backup_quota(
        self,
        tier: BillingTier,
        current_bytes: int = 0,
        additional_bytes: int = 0,
    ) -> None:
        """Raise HTTP 402 if the user would exceed their backup quota."""
        self._enforce_gb_quota(
            tier,
            "backup_gb",
            current_bytes,
            additional_bytes,
            unavailable_detail=f"Backup is not available on the '{tier}' tier",
            exceeded_detail=f"Backup quota exceeded for tier '{tier}'",
        )
    def check_quota(
        self,
        tier: BillingTier,
        current_bytes: int = 0,
        additional_bytes: int = 0,
    ) -> bool:
        """Return True if the user can store additional_bytes more data."""
        limit_gb: int = self.get_features(tier)["cloud_storage_gb"]
        # 0 means cloud storage is disabled for the tier, regardless of usage.
        if limit_gb == 0:
            return False
        return self._fits_quota(limit_gb, current_bytes, additional_bytes)
 # Module-level singleton
 tier_manager = TierManager()
--- a/services/billing/requirements.txt
+++ b/services/billing/requirements.txt
@@ -1,9 +0,0 @@
 fastapi>=0.115.0
 uvicorn[standard]>=0.34.0
 gunicorn>=22.0.0
 pydantic>=2.10.0
 pydantic-settings>=2.7.0
 sqlalchemy>=2.0.0
 asyncpg>=0.30.0
 python-dotenv>=1.0.0
 stripe>=8.0.0
--- a/services/chat/README.md
+++ b/services/chat/README.md
@@ -1,21 +0,0 @@
 # Chat Service
 Owns: deep_agent (home + floating chat), memory middleware, domain agents
 (task, note, project, timeline), LLM orchestration.
 ## Tables owned
 - `memory_core`
 - `memory_associative`
 - `memory_episodic`
 - `memory_proactive`
 ## Tables read (cross-service)
 - `users` (for encryption_key — memory decryption)
 ## Endpoints
 - `POST /chat` (REST fallback)
 ## Redis channels
 - Subscribe: `chat:request:{user_id}`
 - Publish: `ws:out:{user_id}` (stream frames + tool calls)
 - BRPOP: `tool:result:{call_id}` (30s timeout)
--- a/services/chat/app/llm.py
+++ b/services/chat/app/llm.py
@@ -1,77 +0,0 @@
 """LLM factory — centralised model instantiation via LiteLLM.
 Adapted from app/core/llm.py for the Chat Service.
 Uses shared.config.settings instead of app.config.settings.
 """
 from __future__ import annotations
 import os
 import warnings
 from openai import AsyncOpenAI
 import litellm
 from langchain_openai import ChatOpenAI
 from langchain_litellm import ChatLiteLLM
 from shared.config import settings
 # Module-wide LiteLLM switch; presumably makes LiteLLM drop request parameters
 # a provider does not support instead of failing — confirm in the LiteLLM docs.
 litellm.drop_params = True
 # Silence one specific pydantic serialisation UserWarning (matched by message).
 warnings.filterwarnings(
    "ignore",
    message=r"PydanticSerializationUnexpectedValue\(Expected `ResponseAPIUsage`",
    category=UserWarning,
 )
 def _api_key_for_model(model: str) -> str | None:
    """Pick the API key from settings that matches the model's provider prefix.
    Prefixes are tested in order; a model without a known prefix uses the
    OpenAI key. Empty setting values are returned as None, and
    ``github_copilot/`` models never get a key.
    """
    # (accepted prefixes, settings attribute) — None means "no key at all".
    prefix_table: tuple[tuple[tuple[str, ...], str | None], ...] = (
        (("anthropic/",), "ANTHROPIC_API_KEY"),
        (("gemini/", "google/"), "GOOGLE_API_KEY"),
        (("cerebras/",), "CEREBRAS_API_KEY"),
        (("github/",), "GITHUB_TOKEN"),
        (("github_copilot/",), None),
    )
    for prefixes, setting_name in prefix_table:
        if not model.startswith(prefixes):
            continue
        if setting_name is None:
            return None
        return getattr(settings, setting_name) or None
    return settings.OPENAI_API_KEY or None
 def get_llm(
    *,
    model: str | None = None,
    temperature: float = 0,
    callbacks: list | None = None,
 ) -> ChatOpenAI | ChatLiteLLM:
    """Build a chat model for ``model`` (default: settings.LLM_MODEL).
    Model names containing "/" are served through ChatLiteLLM; all others
    through ChatOpenAI with the key chosen by _api_key_for_model.
    NOTE(review): _api_key_for_model is only reached for names without "/",
    so its provider-prefix branches are not exercised from here.
    """
    chosen_model = model if model else settings.LLM_MODEL
    # Export the GitHub settings to the environment without overriding values
    # that are already set there.
    for env_name, env_value in (
        ("GITHUB_COPILOT_TOKEN_DIR", settings.GITHUB_COPILOT_TOKEN_DIR),
        ("GITHUB_TOKEN", settings.GITHUB_TOKEN),
    ):
        if env_value:
            os.environ.setdefault(env_name, env_value)
    if "/" not in chosen_model:
        return ChatOpenAI(
            model=chosen_model,
            temperature=temperature,
            api_key=_api_key_for_model(chosen_model),
            callbacks=callbacks,
        )
    return ChatLiteLLM(model=chosen_model, temperature=temperature, callbacks=callbacks)
 async def embed(text: str) -> list[float]:
    """Return the embedding of ``text`` using settings.LLM_EMBED_MODEL.
    Model names containing "/" (which includes every ``github_copilot/`` name)
    go through LiteLLM; plain names use the OpenAI client directly.
    """
    embed_model = settings.LLM_EMBED_MODEL
    if "/" not in embed_model:
        openai_client = AsyncOpenAI(api_key=settings.OPENAI_API_KEY)
        result = await openai_client.embeddings.create(model=embed_model, input=text)
        return result.data[0].embedding
    # LiteLLM is given a one-element batch; its first item is indexed by key.
    result = await litellm.aembedding(model=embed_model, input=[text])
    return result.data[0]["embedding"]
--- a/services/chat/app/main.py
+++ b/services/chat/app/main.py
@@ -1,87 +0,0 @@
 """Chat Service — LLM orchestration, domain agents, memory.
 Consumes chat requests from Redis, executes deep_agent (home/floating),
 streams responses back via Redis pub/sub to WS Gateway.
 Owns: memory_core, memory_associative, memory_episodic, memory_proactive tables.
 """
 import sys
 from contextlib import asynccontextmanager
 import logging
 from pathlib import Path
 # Ensure the repo root is on sys.path so "shared" is importable in local dev.
 _repo_root = str(Path(__file__).resolve().parents[3])
 if _repo_root not in sys.path:
    sys.path.insert(0, _repo_root)
 from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
 from shared.config import settings
 # Root logging at INFO with a timestamp / level / logger-name prefix.
 logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(name)s: %(message)s",
 )
 # Raise the threshold of the two SQLAlchemy loggers to WARNING.
 logging.getLogger("sqlalchemy.engine").setLevel(logging.WARNING)
 logging.getLogger("sqlalchemy.pool").setLevel(logging.WARNING)
@asynccontextmanager
 async def lifespan(app: FastAPI):
    # Initialise Langfuse tracing (no-op if keys are missing)
    from app.tracing import init_langfuse
    init_langfuse()
    # Start Redis consumer in background
    from app.redis_consumer import start_consumer
    consumer_task = start_consumer()
    yield
    consumer_task.cancel()
    from app.tracing import shutdown as shutdown_langfuse
    shutdown_langfuse()
    from shared.db import engine
    await engine.dispose()
    from shared.redis import redis_client
    await redis_client.aclose()
 def create_app() -> FastAPI:
    """Assemble the Chat Service application: CORS, routes and health check."""
    # Interactive docs are only served when ENV is "dev"; ReDoc is always off.
    docs_path = "/docs" if settings.ENV == "dev" else None
    service = FastAPI(
        title="Adiuva Chat Service",
        version="0.1.0",
        docs_url=docs_path,
        redoc_url=None,
        lifespan=lifespan,
    )
    service.add_middleware(
        CORSMiddleware,
        allow_origins=settings.CORS_ORIGINS,
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )
    from app.routes import router
    service.include_router(router, prefix="/api/v1")
    @service.get("/api/v1/health", tags=["health"])
    async def health() -> dict:
        # The version is read from the application object on every call.
        return {"status": "ok", "service": "chat", "version": service.version}
    return service
 app = create_app()
--- a/services/chat/app/redis_consumer.py
+++ b/services/chat/app/redis_consumer.py
@@ -1,209 +0,0 @@
 """Redis consumer — listens for chat requests and dispatches to deep_agent.
 Subscribes to a Redis pattern channel chat:request:* so it receives
 requests for ALL users. Each request is processed in a separate asyncio task.
 """
 from __future__ import annotations
 import asyncio
 import json
 import logging
 from uuid import uuid4
 from shared.db import async_session
 from shared.redis import redis_client, ws_out_channel
 from app.deep_agent import run_floating_stream, run_home_stream
 from app.memory_middleware import MemoryMiddleware
 from app.output_formatter import StreamFormatter
 from shared.ws_context import clear_current_user, set_current_user
 from app import tracing
 logger = logging.getLogger(__name__)
 def start_consumer() -> asyncio.Task:
    """Start the Redis consumer as a background asyncio task.
    Returns the task wrapping ``_consumer_loop()`` so the caller can cancel it.
    asyncio.create_task needs a running event loop, so call this from async code.
    """
    return asyncio.create_task(_consumer_loop())
 async def _consumer_loop() -> None:
    """Subscribe to chat:request:* and dispatch incoming frames.
    Every decoded frame is handled in its own task. A payload that is not a
    JSON object is logged and skipped instead of terminating the consumer.
    Cancellation is logged and absorbed; the subscription is always released.
    """
    pubsub = redis_client.pubsub()
    await pubsub.psubscribe("chat:request:*")
    logger.info("redis_consumer: subscribed to chat:request:*")
    # The event loop keeps only weak references to tasks, so hold strong ones
    # here until each dispatch task has finished.
    in_flight: set[asyncio.Task] = set()
    def _on_done(task: asyncio.Task) -> None:
        # Release the reference and surface failures that would otherwise go unseen.
        in_flight.discard(task)
        if not task.cancelled() and task.exception() is not None:
            logger.error("redis_consumer: dispatch task failed: %s", task.exception())
    try:
        while True:
            message = await pubsub.get_message(
                ignore_subscribe_messages=True, timeout=1.0
            )
            if message is None or message["type"] != "pmessage":
                await asyncio.sleep(0.01)
                continue
            try:
                frame = json.loads(message["data"])
            except (TypeError, ValueError) as exc:
                # json.JSONDecodeError and UnicodeDecodeError are ValueErrors.
                logger.warning("redis_consumer: dropping undecodable frame: %s", exc)
                continue
            if not isinstance(frame, dict):
                logger.warning(
                    "redis_consumer: dropping non-object frame of type %s",
                    type(frame).__name__,
                )
                continue
            task = asyncio.create_task(_dispatch(frame))
            in_flight.add(task)
            task.add_done_callback(_on_done)
    except asyncio.CancelledError:
        logger.info("redis_consumer: shutting down")
    finally:
        await pubsub.punsubscribe()
        await pubsub.aclose()
 async def _dispatch(frame: dict) -> None:
    """Hand a chat request frame to the handler registered for its ``type``.
    Frames without a user_id are logged and dropped; unknown types are only
    logged at debug level.
    """
    kind = frame.get("type")
    user_id = frame.get("user_id")
    if not user_id:
        logger.warning("redis_consumer: frame missing user_id: %s", frame.get("type"))
        return
    # Compared with == (not a dict lookup) so any JSON value is tolerated.
    routes = (
        ("home_request", _handle_home_request),
        ("floating_request", _handle_floating_request),
    )
    for known_kind, handler in routes:
        if kind == known_kind:
            await handler(user_id, frame)
            return
    logger.debug("redis_consumer: unknown frame type %r", kind)
 async def _publish_frame(user_id: str, frame_data: str) -> None:
    """Publish a frame to ws:out:{user_id} for the WS Gateway to forward."""
    # ws_out_channel builds the channel name for this user.
    await redis_client.publish(ws_out_channel(user_id), frame_data)
 async def _handle_home_request(user_id: str, frame: dict) -> None:
    """Process a home_request — enrich with memory, run deep_agent, stream results.
    Steps: open a trace span, fetch memory context, stream the agent's frames
    to the user's ws:out channel, then store the exchange as an episode.
    A failure while streaming is logged and absorbed; the episode is still
    stored with whatever text was collected up to that point.
    """
    # Missing or falsy ids are replaced by freshly generated UUIDs.
    request_id = frame.get("request_id") or str(uuid4())
    message: str = frame.get("message", "")
    session_id: str = frame.get("session_id") or str(uuid4())
    logger.info(
        "redis_consumer: home_request user=%s req=%s msg=%s",
        user_id, request_id, message[:200],
    )
    # Collects the `chunk` attribute of every streamed frame that has one.
    response_chunks: list[str] = []
    with tracing.trace_span(
        name="home_request",
        user_id=user_id,
        session_id=session_id,
        trace_id=request_id,
        input=message,
        metadata={"message_preview": message[:200]},
        tags=["home"],
    ) as span:
        langfuse_handler = tracing.get_langfuse_callback()
        # Enrich with memory context
        # (the DB session is closed again before streaming starts)
        async with async_session() as db:
            memory = MemoryMiddleware(db)
            memory_context = await memory.enrich_context(
                user_id, message,
                trace_id=request_id, session_id=session_id,
            )
        # Keys from memory_context come last and therefore win on a name clash.
        context: dict = {
            "conversation_history": frame.get("conversation_history", []),
            "_debug": {"request_id": request_id, "session_id": session_id, "user_id": user_id},
            **memory_context,
        }
        # The current-user marker is set for the duration of the stream and
        # always cleared again in the `finally` below.
        set_current_user(user_id)
        try:
            event_stream = run_home_stream(user_id, message, context, langfuse_handler=langfuse_handler)
            formatter = StreamFormatter(request_id=request_id)
            async for ws_frame in formatter.format(event_stream):
                await _publish_frame(user_id, ws_frame.model_dump_json())
                if hasattr(ws_frame, "chunk"):
                    response_chunks.append(ws_frame.chunk)
        except Exception as exc:
            # Logged without traceback and not re-raised.
            logger.error("redis_consumer: home_request failed user=%s req=%s: %s", user_id, request_id, exc)
        finally:
            clear_current_user()
        # Link prompt and attach output preview
        tracing.link_prompt_to_trace(span, "home_system")
        response_text = "".join(response_chunks)
        # At most the first 500 characters are recorded; None when nothing was streamed.
        span.update(output=response_text[:500] if response_text else None)
    tracing.flush()
    # Store episode
    # (uses a new DB session, opened after the span has been closed and flushed)
    async with async_session() as db:
        memory = MemoryMiddleware(db)
        await memory.store_episode(
            user_id, session_id, message, "".join(response_chunks),
            trace_id=request_id,
        )
 async def _handle_floating_request(user_id: str, frame: dict) -> None:
    """Process a floating_request — enrich with memory, run deep_agent, stream results.
    Same flow as the home handler, except that the frame's ``scope`` is passed
    to the agent (instead of a conversation history) and recorded in the trace
    metadata. A failure while streaming is logged and absorbed; the episode is
    still stored with whatever text was collected up to that point.
    """
    # Missing or falsy ids are replaced by freshly generated UUIDs.
    request_id = frame.get("request_id") or str(uuid4())
    message: str = frame.get("message", "")
    session_id: str = frame.get("session_id") or str(uuid4())
    scope: dict = frame.get("scope", {})
    logger.info(
        "redis_consumer: floating_request user=%s req=%s scope=%s msg=%s",
        user_id, request_id, json.dumps(scope)[:200], message[:200],
    )
    # Collects the `chunk` attribute of every streamed frame that has one.
    response_chunks: list[str] = []
    with tracing.trace_span(
        name="floating_request",
        user_id=user_id,
        session_id=session_id,
        trace_id=request_id,
        input=message,
        metadata={"message_preview": message[:200], "scope": scope},
        tags=["floating"],
    ) as span:
        langfuse_handler = tracing.get_langfuse_callback()
        # Enrich with memory context
        # (the DB session is closed again before streaming starts)
        async with async_session() as db:
            memory = MemoryMiddleware(db)
            memory_context = await memory.enrich_context(
                user_id, message,
                trace_id=request_id, session_id=session_id,
            )
        # Keys from memory_context come last and therefore win on a name clash.
        context: dict = {
            "scope": scope,
            "_debug": {"request_id": request_id, "session_id": session_id, "user_id": user_id},
            **memory_context,
        }
        # The current-user marker is set for the duration of the stream and
        # always cleared again in the `finally` below.
        set_current_user(user_id)
        try:
            event_stream = run_floating_stream(user_id, message, context, langfuse_handler=langfuse_handler)
            formatter = StreamFormatter(request_id=request_id)
            async for ws_frame in formatter.format(event_stream):
                await _publish_frame(user_id, ws_frame.model_dump_json())
                if hasattr(ws_frame, "chunk"):
                    response_chunks.append(ws_frame.chunk)
        except Exception as exc:
            # Logged without traceback and not re-raised.
            logger.error("redis_consumer: floating_request failed user=%s req=%s: %s", user_id, request_id, exc)
        finally:
            clear_current_user()
        # Link prompt and attach output preview
        tracing.link_prompt_to_trace(span, "floating_system")
        response_text = "".join(response_chunks)
        # At most the first 500 characters are recorded; None when nothing was streamed.
        span.update(output=response_text[:500] if response_text else None)
    tracing.flush()
    # Store episode
    # (uses a new DB session, opened after the span has been closed and flushed)
    async with async_session() as db:
        memory = MemoryMiddleware(db)
        await memory.store_episode(
            user_id, session_id, message, "".join(response_chunks),
            trace_id=request_id,
        )
--- a/services/chat/app/routes.py
+++ b/services/chat/app/routes.py
@@ -1,37 +0,0 @@
 """Chat REST route — POST /chat fallback when WS is unavailable."""
 from __future__ import annotations
 from fastapi import APIRouter, Request
 from fastapi.responses import JSONResponse
 from shared.schemas import ChatRequest
 from app.deep_agent import run_home
 from shared.ws_context import clear_current_user, set_current_user
 router = APIRouter(prefix="/chat", tags=["chat"])
@router.post("")
 async def chat(body: ChatRequest, request: Request) -> JSONResponse:
    """REST fallback for home chat.
    In the microservices setup, Traefik ForwardAuth has already validated
    the JWT and injected X-User-Id / X-User-Email / X-User-Tier headers.
    """
    user_id = request.headers.get("X-User-Id", "")
    if not user_id:
        return JSONResponse(status_code=401, content={"detail": "Missing X-User-Id header"})
    set_current_user(user_id)
    try:
        response = await run_home(
            user_id=user_id,
            message=body.message,
            context=body.context.model_dump(),
        )
    finally:
        clear_current_user()
    return JSONResponse(content={"response": response})
--- a/services/chat/app/tracing.py
+++ b/services/chat/app/tracing.py
@@ -1,304 +0,0 @@
 """Langfuse tracing & prompt management for the Chat Service (v4 SDK).
 Provides:
 - ``init_langfuse()`` — initialise the singleton client at startup
 - ``trace_span()`` — context manager that creates a trace + span
 - ``get_langfuse_callback()`` — LangChain callback handler (auto-inherits trace)
 - ``get_prompt()`` — fetch a managed prompt from Langfuse by name
 - ``flush()`` / ``shutdown()`` — lifecycle management
 All functions gracefully degrade to no-ops when Langfuse is not configured,
 so the service works identically with or without observability keys.
 Requires ``langfuse >= 3.0.0`` (v4 / "Fast Preview" SDK).
 """
 from __future__ import annotations
 import logging
 from contextlib import contextmanager
 from typing import Any
 from shared.config import settings
 logger = logging.getLogger(__name__)
 # ── State ────────────────────────────────────────────────────────────────
 _initialised: bool = False
 _disabled: bool = False
 def _is_configured() -> bool:
    """Return True when both the Langfuse secret key and public key are set (truthy)."""
    return bool(settings.LANGFUSE_SECRET_KEY and settings.LANGFUSE_PUBLIC_KEY)
 def init_langfuse() -> None:
    """Initialise the Langfuse singleton. Call once at startup.
    Repeated calls are no-ops once the module is marked initialised or
    disabled. Missing keys or a failing constructor mark tracing as disabled.
    """
    global _initialised, _disabled
    already_decided = _initialised or _disabled
    if already_decided:
        return
    if not _is_configured():
        _disabled = True
        logger.info("tracing: Langfuse keys not set — tracing disabled")
        return
    try:
        from langfuse import Langfuse
        # The instance is not kept here; _get_client() obtains the client via
        # langfuse.get_client() later on.
        Langfuse(
            secret_key=settings.LANGFUSE_SECRET_KEY,
            public_key=settings.LANGFUSE_PUBLIC_KEY,
            host=settings.LANGFUSE_HOST,
        )
    except Exception as exc:
        _disabled = True
        logger.warning("tracing: failed to initialise Langfuse: %s", exc)
    else:
        _initialised = True
        logger.info("tracing: Langfuse client initialised (host=%s)", settings.LANGFUSE_HOST)
 def _get_client() -> Any | None:
    """Return the singleton Langfuse client, or *None* if disabled.
    Triggers init_langfuse() on first use when the module state is still
    undecided. Any error while obtaining the client also yields None.
    """
    if not (_initialised or _disabled):
        init_langfuse()
    if _disabled:
        return None
    try:
        from langfuse import get_client
        client = get_client()
    except Exception:
        return None
    return client
 # ── Null span (no-op when Langfuse is disabled) ─────────────────────────
 class _NullSpan:
    """Drop-in replacement when Langfuse is disabled."""
    # Each method accepts arbitrary keyword arguments and does nothing, so
    # callers can use the object like a real span without checking for it.
    def update(self, **_: Any) -> None: ...
    def set_trace_io(self, **_: Any) -> None: ...
    def score_trace(self, **_: Any) -> None: ...
 # ── Trace context manager ───────────────────────────────────────────────
@contextmanager
 def trace_span(
    *,
    name: str,
    user_id: str,
    session_id: str | None = None,
    trace_id: str | None = None,
    input: Any = None,
    metadata: dict[str, Any] | None = None,
    tags: list[str] | None = None,
 ):
    """Context manager that creates a Langfuse trace/span.
    Yields the span object (or a ``_NullSpan`` if Langfuse is disabled).
    A ``CallbackHandler`` created inside this block auto-inherits the trace
    context, so there is no need to pass trace IDs manually.
    """
    lf = _get_client()
    if lf is None:
        yield _NullSpan()
        return
    try:
        from langfuse import Langfuse, propagate_attributes
        trace_ctx: dict[str, str] = {}
        if trace_id is not None:
            trace_ctx["trace_id"] = Langfuse.create_trace_id(seed=trace_id)
        with lf.start_as_current_observation(
            as_type="span",
            name=name,
            input=input,
            metadata=metadata or {},
            **({"trace_context": trace_ctx} if trace_ctx else {}),
        ) as span:
            with propagate_attributes(
                user_id=user_id,
                session_id=session_id,
                tags=tags or [],
            ):
                yield span
    except Exception as exc:
        logger.warning("tracing: trace_span(%s) failed: %s", name, exc)
        yield _NullSpan()
 # ── LangChain callback handler ──────────────────────────────────────────
 def get_langfuse_callback() -> Any | None:
    """Return a LangChain ``CallbackHandler`` that auto-inherits the current trace.
    Must be called inside a ``trace_span()`` block for proper linking.
    Returns *None* when Langfuse is disabled, including the case where it has
    not been initialised yet and turns out not to be configured.
    """
    # Use the same availability check as the other helpers in this module, so
    # no handler is built while no client is available.
    if _get_client() is None:
        return None
    try:
        from langfuse.langchain import CallbackHandler
        return CallbackHandler()
    except Exception as exc:
        logger.warning("tracing: get_langfuse_callback failed: %s", exc)
        return None
 # ── Prompt management ────────────────────────────────────────────────────
 def get_prompt(
    name: str,
    *,
    version: int | None = None,
    label: str | None = None,
    fallback: str | None = None,
    cache_ttl_seconds: int = 300,
 ) -> str | None:
    """Look up a managed Langfuse prompt by name and return its raw text.
    No variable compilation is performed. *fallback* is returned when Langfuse
    is disabled or the lookup fails for any reason.
    """
    client = _get_client()
    if client is None:
        return fallback
    # version / label are only sent when the caller supplied them.
    lookup: dict[str, Any] = {"name": name, "cache_ttl_seconds": cache_ttl_seconds}
    for key, value in (("version", version), ("label", label)):
        if value is not None:
            lookup[key] = value
    try:
        return client.get_prompt(**lookup).prompt
    except Exception as exc:
        logger.warning("tracing: get_prompt(%s) failed: %s", name, exc)
        return fallback
 def compile_prompt(
    name: str,
    *,
    fallback: str,
    variables: dict[str, str],
    version: int | None = None,
    label: str | None = None,
    cache_ttl_seconds: int = 300,
 ) -> str:
    """Return the named Langfuse prompt with ``variables`` filled in.
    A prompt found in Langfuse is rendered with the SDK's
    ``.compile(**variables)`` (``{{key}}`` placeholders). When Langfuse is
    disabled or the lookup/compile step fails, the result is
    ``fallback.format(**variables)`` instead (Python ``{key}`` placeholders).
    In short:
      - Langfuse prompts use ``{{variable}}`` syntax.
      - Hardcoded fallback strings use Python ``{variable}`` syntax.
    """
    client = _get_client()
    if client is None:
        return fallback.format(**variables)
    # version / label are only sent when the caller supplied them.
    lookup: dict[str, Any] = {"name": name, "cache_ttl_seconds": cache_ttl_seconds}
    for key, value in (("version", version), ("label", label)):
        if value is not None:
            lookup[key] = value
    try:
        managed = client.get_prompt(**lookup)
        return managed.compile(**variables)
    except Exception as exc:
        logger.warning("tracing: compile_prompt(%s) failed, using fallback: %s", name, exc)
        return fallback.format(**variables)
 def link_prompt_to_trace(
    span: Any,
    prompt_name: str,
    *,
    version: int | None = None,
    label: str | None = None,
 ) -> None:
    """Record the prompt's name and resolved version in the span's metadata.
    Does nothing when Langfuse is disabled or ``span`` is a ``_NullSpan``;
    lookup or update failures are logged and ignored.
    """
    client = _get_client()
    if client is None or isinstance(span, _NullSpan):
        return
    # version / label are only sent when the caller supplied them.
    lookup: dict[str, Any] = {"name": prompt_name}
    for key, value in (("version", version), ("label", label)):
        if value is not None:
            lookup[key] = value
    try:
        managed = client.get_prompt(**lookup)
        span.update(metadata={"prompt": {"name": prompt_name, "version": managed.version}})
    except Exception as exc:
        logger.warning("tracing: link_prompt_to_trace(%s) failed: %s", prompt_name, exc)
 # ── Scoring helper ───────────────────────────────────────────────────────
 def score_trace(
    trace_id: str,
    name: str,
    value: float,
    *,
    comment: str | None = None,
 ) -> None:
    """Attach a named numeric score to a trace (e.g. user feedback, latency, quality).
    A no-op when Langfuse is disabled; failures are logged, never raised.
    """
    client = _get_client()
    if client is not None:
        try:
            client.create_score(trace_id=trace_id, name=name, value=value, comment=comment)
        except Exception as exc:
            logger.warning("tracing: score_trace failed: %s", exc)
 # ── Shutdown ─────────────────────────────────────────────────────────────
 def flush() -> None:
    """Send any pending Langfuse events; a no-op when tracing is disabled."""
    client = _get_client()
    if client is None:
        return
    try:
        client.flush()
    except Exception as exc:
        # Flushing is best-effort: report the problem and carry on.
        logger.warning("tracing: flush failed: %s", exc)
 def shutdown() -> None:
    """Flush and close the Langfuse client, then reset the module state.
    If tracing was never initialised there is nothing to close; the state
    flags are still reset so that a later ``init_langfuse()`` starts fresh.
    Errors from the client are logged, never raised.
    """
    global _initialised, _disabled
    # Only fetch the client when one was actually created. _get_client() would
    # otherwise initialise Langfuse lazily just to shut it down again.
    lf = _get_client() if _initialised else None
    if lf is not None:
        try:
            lf.flush()
            lf.shutdown()
        except Exception as exc:
            logger.warning("tracing: shutdown failed: %s", exc)
    _initialised = False
    _disabled = False
--- a/services/chat/requirements.txt
+++ b/services/chat/requirements.txt
@@ -1,17 +0,0 @@
 fastapi>=0.115.0
 uvicorn[standard]>=0.34.0
 gunicorn>=22.0.0
 pydantic>=2.10.0
 pydantic-settings>=2.7.0
 sqlalchemy>=2.0.0
 asyncpg>=0.30.0
 redis>=5.0.0
 cryptography>=42.0.0
 python-dotenv>=1.0.0
 langchain-core>=0.3.0
 langchain-openai>=0.3.0
 langchain-litellm>=0.3.0
 litellm>=1.50.0
 openai>=1.50.0
 httpx>=0.27.0
 langfuse>=3.0.0
--- a/services/ws-gateway/Dockerfile
+++ b/services/ws-gateway/Dockerfile
@@ -1,36 +0,0 @@
 # ── builder ──────────────────────────────────────────────────────────────────
 # Installs the Python dependencies under /install so that only that prefix
 # (and not pip's build leftovers) is copied into the runtime image.
 FROM python:3.12-slim AS builder
 WORKDIR /build
 # COPY sources are relative to the build context; these paths assume the
 # image is built with the repository root as context — TODO confirm.
 COPY services/ws-gateway/requirements.txt ./requirements.txt
 RUN pip install --upgrade pip && \
    pip install --no-cache-dir --prefix=/install -r requirements.txt
 # ── runtime ──────────────────────────────────────────────────────────────────
 FROM python:3.12-slim AS runtime
 # Dedicated unprivileged system user/group; the service runs as this user.
 RUN addgroup --system appgroup && adduser --system --ingroup appgroup appuser
 WORKDIR /app
 # Bring in the packages installed by the builder stage.
 COPY --from=builder /install /usr/local
 # Shared module
 COPY shared/ shared/
 # Service source
 COPY services/ws-gateway/app/ app/
 RUN chown -R appuser:appgroup /app
 USER appuser
 EXPOSE 8000
 # Single worker — each instance handles many WS connections via asyncio
 # "--timeout 0" disables gunicorn's worker timeout (see the gunicorn
 # `timeout` setting); presumably so long-lived WebSocket connections are
 # not killed — verify this is intended.
 CMD ["gunicorn", "app.main:app", \
     "-k", "uvicorn.workers.UvicornWorker", \
     "--bind", "0.0.0.0:8000", \
     "--workers", "1", \
     "--timeout", "0"]
--- a/services/ws-gateway/README.md
+++ b/services/ws-gateway/README.md
@@ -1,17 +0,0 @@
 # WS Gateway
 Stateless WebSocket proxy. Accepts Electron connections, authenticates JWT,
 routes frames to Chat/Batch services via Redis pub/sub.
 ## No business logic
 This service does NOT know what tasks, notes, or agents are.
 It only routes JSON frames between Electron and downstream services.
 ## Scaling
 Sticky sessions on `user_id` (Traefik consistent hashing).
 ## Redis channels used
 - Subscribe: `ws:out:{user_id}` (frames to send to client)
 - Publish: `chat:request:{user_id}`, `batch:request:{user_id}`
 - LPUSH: `tool:result:{call_id}` (from client tool_result frames)
 - HSET/HDEL: `ws:devices:{user_id}` (device registry)
--- a/services/ws-gateway/app/handler.py
+++ b/services/ws-gateway/app/handler.py
@@ -1,173 +0,0 @@
 """WebSocket handler — device connection lifecycle.
 Accepts Electron WS connections, authenticates JWT, registers device in Redis,
 and runs three concurrent loops:
  1. Message loop: receive frames from Electron, route to Redis
  2. Outbound loop: subscribe to Redis ws:out:{user_id}, forward to Electron
  3. Heartbeat loop: ping every 30s
 No business logic lives here — the handler is a JSON frame router.
 """
 from __future__ import annotations
 import asyncio
 import json
 import logging
 from uuid import uuid4
 from fastapi import APIRouter, WebSocket, WebSocketDisconnect
 from jose import JWTError, jwt
 from shared.config import settings
 from shared.schemas import WsFrameType
 from app.redis_bridge import (
    publish_batch_request,
    publish_chat_request,
    push_tool_result,
    register_device,
    set_gateway_id,
    subscribe_outbound,
    unregister_device,
 )
 # Module-level logger, named after this module.
 logger = logging.getLogger(__name__)
 # All routes declared in this module are mounted under the "/ws" prefix.
 router = APIRouter(prefix="/ws", tags=["ws-gateway"])
 # Delay between two heartbeat "ping" frames sent to a connected client.
 _HEARTBEAT_INTERVAL = 30  # seconds
 # Set a unique gateway instance ID on module load
 # (import-time side effect: a fresh random UUID is generated per import).
 set_gateway_id(str(uuid4()))
@router.websocket("/device")
 async def device_ws(websocket: WebSocket) -> None:
    """Persistent WebSocket endpoint for Electron device connections.

    Lifecycle:
      1. Verify the RS256 JWT passed as ``?token=``; close with 1008 on failure.
      2. Accept the socket and wait up to 15s for a ``device_hello`` frame.
      3. Register the device and announce ``device_online`` downstream.
      4. Subscribe to the user's outbound Redis channel.
      5. Run the inbound, outbound and heartbeat loops until the first one
         stops, then cancel the others and release every resource.

    Errors raised after registration are logged, never propagated; the
    device is always unregistered on the way out.
    """
    # ── 1. Authenticate via ?token= query parameter ──────────────────
    token = websocket.query_params.get("token", "")
    try:
        payload = jwt.decode(
            token,
            settings.JWT_PUBLIC_KEY,
            algorithms=["RS256"],
        )
        user_id: str | None = payload.get("sub")
        if not user_id:
            raise JWTError("missing sub")
    except JWTError:
        await websocket.close(code=1008)
        return
    await websocket.accept()
    # ── 2. Await device_hello frame ──────────────────────────────────
    try:
        raw = await asyncio.wait_for(websocket.receive_text(), timeout=15.0)
    except (asyncio.TimeoutError, WebSocketDisconnect):
        await websocket.close(code=1008)
        return
    try:
        hello = json.loads(raw)
        # Client input is untrusted: valid JSON that is not an object (e.g. a
        # list or a string) must be rejected here instead of crashing on .get().
        if not isinstance(hello, dict) or hello.get("type") != WsFrameType.device_hello:
            raise ValueError("expected device_hello as first frame")
        device_id: str = hello["device_id"]
        agent_ids: list[str] = hello.get("agent_ids", [])
    except (KeyError, ValueError, json.JSONDecodeError) as exc:
        logger.warning("handler: invalid device_hello user=%s: %s", user_id, exc)
        await websocket.close(code=1008)
        return
    # ── 3. Register device in Redis ──────────────────────────────────
    await register_device(user_id, device_id)
    logger.info("handler: connected user=%s device=%s agents=%s", user_id, device_id, agent_ids)
    pubsub = None
    tasks: list[asyncio.Task] = []
    try:
        # Notify downstream services that device is online (for agent trigger)
        await publish_batch_request(user_id, {
            "type": "device_online",
            "user_id": user_id,
            "device_id": device_id,
            "agent_ids": agent_ids,
        })
        # ── 4. Subscribe to outbound Redis channel ───────────────────
        pubsub = await subscribe_outbound(user_id)
        # ── 5. Run concurrent loops ──────────────────────────────────
        tasks = [
            asyncio.create_task(_inbound_loop(websocket, user_id)),
            asyncio.create_task(_outbound_loop(websocket, pubsub)),
            asyncio.create_task(_heartbeat_loop(websocket)),
        ]
        # The connection is over as soon as ANY loop stops (client closed,
        # send failed, ...). gather() would keep waiting on the remaining
        # loops and would not cancel them, leaking tasks per connection.
        done, _ = await asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED)
        for finished in done:
            if finished.cancelled():
                continue
            failure = finished.exception()
            # A plain disconnect is the normal way for a connection to end.
            if failure is not None and not isinstance(failure, WebSocketDisconnect):
                logger.warning("handler: unhandled exception user=%s: %s", user_id, failure)
    except Exception as exc:
        logger.warning("handler: unhandled exception user=%s: %s", user_id, exc)
    finally:
        # Stop whatever is still running and wait for it, so no task touches
        # the pubsub or the socket after they are released below.
        for task in tasks:
            task.cancel()
        if tasks:
            await asyncio.gather(*tasks, return_exceptions=True)
        if pubsub is not None:
            await pubsub.unsubscribe()
            await pubsub.aclose()
        await unregister_device(user_id)
        logger.info("handler: disconnected user=%s device=%s", user_id, device_id)
 # ── Inbound: Electron → Redis ────────────────────────────────────────
 async def _inbound_loop(websocket: WebSocket, user_id: str) -> None:
    """Receive frames from Electron and route to the appropriate Redis channel.

    Routing by frame ``type``:
      - ``tool_result``                     → ``push_tool_result`` (keyed by frame ``id``)
      - ``home_request`` / ``floating_request``  → ``publish_chat_request``
      - ``journey_start`` / ``journey_message``  → ``publish_batch_request``
      - ``"pong"``                          → ignored (heartbeat ack)

    Frames that are not valid JSON, or that are valid JSON but not an object,
    are logged and skipped so one malformed frame cannot end the connection.
    The loop ends when ``websocket.iter_text()`` is exhausted.
    """
    async for raw in websocket.iter_text():
        try:
            frame = json.loads(raw)
        except json.JSONDecodeError:
            logger.warning("handler: invalid JSON from user=%s", user_id)
            continue
        # json.loads also accepts lists, strings, numbers, ...; only objects
        # have a "type" and can carry the injected user_id.
        if not isinstance(frame, dict):
            logger.warning("handler: non-object JSON frame from user=%s", user_id)
            continue
        frame_type = frame.get("type")
        # Inject user_id so downstream services know who sent it
        # (overwrites any user_id the client may have put in the frame).
        frame["user_id"] = user_id
        if frame_type == WsFrameType.tool_result:
            call_id = frame.get("id")
            if call_id:
                await push_tool_result(call_id, frame)
            else:
                logger.warning("handler: tool_result missing id user=%s", user_id)
        elif frame_type in (WsFrameType.home_request, WsFrameType.floating_request):
            await publish_chat_request(user_id, frame)
        elif frame_type in (WsFrameType.journey_start, WsFrameType.journey_message):
            await publish_batch_request(user_id, frame)
        elif frame_type == "pong":
            pass  # heartbeat ack
        else:
            logger.debug("handler: unknown frame type %r user=%s", frame_type, user_id)
 # ── Outbound: Redis → Electron ───────────────────────────────────────
 async def _outbound_loop(websocket: WebSocket, pubsub) -> None:
    """Forward every message received on ``pubsub`` to the WebSocket client.

    Runs until it is cancelled or until polling/sending raises.
    """
    while True:
        incoming = await pubsub.get_message(ignore_subscribe_messages=True, timeout=1.0)
        if incoming is None or incoming["type"] != "message":
            # Nothing to forward: pause briefly before polling again.
            await asyncio.sleep(0.01)
            continue
        await websocket.send_text(incoming["data"])
 # ── Heartbeat ────────────────────────────────────────────────────────
 async def _heartbeat_loop(websocket: WebSocket) -> None:
    """Emit a ``{"type": "ping"}`` frame every ``_HEARTBEAT_INTERVAL`` seconds.

    The first ping goes out one full interval after the loop starts; the loop
    never returns on its own.
    """
    # The frame never changes, so serialise it once up front.
    ping_frame = json.dumps({"type": "ping"})
    while True:
        await asyncio.sleep(_HEARTBEAT_INTERVAL)
        await websocket.send_text(ping_frame)
--- a/services/ws-gateway/app/main.py
+++ b/services/ws-gateway/app/main.py
@@ -1,56 +0,0 @@
 """WS Gateway — stateless WebSocket proxy.
 Accepts Electron device connections, authenticates JWT (RS256 public key),
 and routes frames between Electron and downstream services via Redis pub/sub.
 This service has NO business logic — it only routes JSON frames.
 """
 import sys
 from contextlib import asynccontextmanager
 import logging
 from pathlib import Path
 # Ensure the repo root is on sys.path so "shared" is importable in local dev.
 # For services/ws-gateway/app/main.py, parents[3] walks up
 # app → ws-gateway → services → repository root.
 _repo_root = str(Path(__file__).resolve().parents[3])
 # Insert at the front, and only once, so repeated imports do not grow sys.path.
 if _repo_root not in sys.path:
    sys.path.insert(0, _repo_root)
 from fastapi import FastAPI
 from shared.config import settings
 # Configure root logging at import time: INFO level, with timestamp, level
 # and logger name in front of every message.
 logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(name)s: %(message)s",
 )
@asynccontextmanager
 async def lifespan(app: FastAPI):
    """Application lifespan: no startup work; close the shared Redis client on exit.

    The cleanup sits in ``finally`` so it also runs when an exception or a
    cancellation is thrown into the generator at ``yield``; code placed
    after a bare ``yield`` would be skipped in that case.
    """
    try:
        yield
    finally:
        # Imported here, as in the original, so the client module is only
        # loaded by this file when the application shuts down.
        from shared.redis import redis_client
        await redis_client.aclose()
 def create_app() -> FastAPI:
    """Build the WS Gateway application.

    Mounts the handler router under ``/api/v1`` and adds a health endpoint at
    ``/api/v1/health``. Swagger docs are served at ``/docs`` only when
    ``settings.ENV`` is ``"dev"``; ReDoc is always disabled.
    """
    docs_path = "/docs" if settings.ENV == "dev" else None
    application = FastAPI(
        title="Adiuva WS Gateway",
        version="0.1.0",
        docs_url=docs_path,
        redoc_url=None,
        lifespan=lifespan,
    )
    # Imported inside the factory, after the application object exists.
    from app.handler import router
    application.include_router(router, prefix="/api/v1")
    async def health() -> dict:
        return {"status": "ok", "service": "ws-gateway", "version": application.version}
    # Same registration as decorating ``health`` with the route decorator.
    application.get("/api/v1/health", tags=["health"])(health)
    return application
 # Module-level instance, referenced as "app.main:app" by the server command.
 app = create_app()
--- a/Show More
+++ b/Show More
		`@@ -1 +0,0 @@`
			`"""Batch Agent Service domain agents and filesystem tools."""`
		`@@ -1,2 +0,0 @@`
			`# Extra dependencies for the eval harness (on top of the service requirements.txt)`
			`pyyaml>=6.0.0`