feat: integrate vectordb for note embeddings

- Added `vectordb` as a dependency in `package.json`.
- Implemented the `embedText` function in `src/main/ai/embeddings.ts` to generate text embeddings using either a GitHub Copilot OAuth token or a stored OpenAI token.
- Created `vectordb.ts` for managing the LanceDB connection and embedding notes with an upsert strategy.
- Updated `index.ts` to initialize vector database and migrate existing notes on app ready.
- Modified `router/index.ts` to fire-and-forget embedding calls on note creation and updates.
- Enhanced `progress.txt` with detailed implementation notes and learnings regarding the integration.
This commit is contained in:
Roberto Musso
2026-02-24 21:34:48 +01:00
parent e70982c8b6
commit 2cb2f0e4e8
9 changed files with 750 additions and 27 deletions

73
src/main/ai/embeddings.ts Normal file
View File

@@ -0,0 +1,73 @@
import fs from 'node:fs';
import os from 'node:os';
import path from 'node:path';
import { getToken } from './token';
/** Subset of the Copilot CLI config file that this module reads. */
interface CopilotConfig {
  /** OAuth tokens keyed by "{host}:{login}". */
  copilot_tokens?: Record<string, string>;
}

/**
 * Read the GitHub Copilot OAuth token from the CLI config file.
 * Stored at ~/.copilot/config.json under copilot_tokens["{host}:{login}"].
 *
 * The file is external input, so its shape is validated rather than trusted:
 * only non-empty string values count as tokens, and entries that are empty
 * or of the wrong type are skipped instead of being returned to the caller.
 *
 * @returns The first usable token, or null when the file is missing,
 *          unreadable, malformed, or contains no usable token.
 */
function readCopilotToken(): string | null {
  try {
    const raw = fs.readFileSync(
      path.join(os.homedir(), '.copilot', 'config.json'),
      'utf-8',
    );
    const parsed: unknown = JSON.parse(raw);
    if (typeof parsed !== 'object' || parsed === null) {
      return null;
    }

    // Guard against copilot_tokens being a string, number, null, etc.
    // (Object.values on a string would otherwise yield single characters.)
    const tokens: unknown = (parsed as CopilotConfig).copilot_tokens;
    if (typeof tokens !== 'object' || tokens === null) {
      return null;
    }

    const token = Object.values(tokens).find(
      (value): value is string => typeof value === 'string' && value.trim().length > 0,
    );
    return token ?? null;
  } catch {
    // Missing file, permission error, or invalid JSON: treat as "no Copilot token".
    return null;
  }
}
/**
 * Produce an embedding vector for one piece of text.
 *
 * Credential selection:
 *   1. If readCopilotToken() yields a token, it is used as the API key against
 *      the OpenAI-compatible endpoint at https://api.githubcopilot.com.
 *   2. Otherwise the stored 'openai' token from getToken() is used with the
 *      client's default endpoint.
 *
 * @param text The text to embed.
 * @returns The embedding vector for `text`.
 * @throws If neither credential is available, if the embedding call rejects,
 *         or if the response contains no (or an empty) vector. Callers are
 *         expected to .catch() so a failure never rejects the surrounding
 *         tRPC mutation.
 */
export async function embedText(text: string): Promise<number[]> {
  // Loaded lazily via dynamic import rather than at module load.
  const { OpenAIEmbeddings } = await import('@langchain/openai');
  const model = 'text-embedding-3-small';

  let client;
  const copilotKey = readCopilotToken();
  if (!copilotKey) {
    const openaiKey = await getToken('openai');
    if (!openaiKey) {
      throw new Error(
        '[Embeddings] No credentials available. Authenticate with Copilot CLI or add an OpenAI token in Settings.',
      );
    }
    client = new OpenAIEmbeddings({ apiKey: openaiKey, model });
  } else {
    client = new OpenAIEmbeddings({
      apiKey: copilotKey,
      model,
      configuration: { baseURL: 'https://api.githubcopilot.com' },
    });
  }

  // One input document in, so exactly one vector is expected back.
  const vectors = (await client.embedDocuments([text])) as number[][];
  const first = vectors[0] as number[] | undefined;
  if (first && first.length !== 0) {
    return first;
  }
  throw new Error('[Embeddings] Empty vector returned from embedding API');
}

113
src/main/db/vectordb.ts Normal file
View File

@@ -0,0 +1,113 @@
import * as lancedb from 'vectordb';
import { app } from 'electron';
import path from 'node:path';
import { getDb } from './index';
import { notes } from './schema';
import { embedText } from '../ai/embeddings';
/**
 * Shape of one row in the LanceDB 'notes' table, as written by
 * upsertNoteEmbedding. The table schema is inferred from the first
 * record passed to createTable.
 */
interface NoteRecord {
/** Note identifier; used as the match key when an existing row is deleted before re-adding. */
id: string;
/** Empty string when the note has no project (Arrow string fields don't cleanly handle null) */
projectId: string;
/** The exact text that was embedded (callers in this codebase pass title + blank line + body). */
content: string;
/** Embedding of `content` as returned by embedText. */
vector: number[];
}
/** Module-level LanceDB connection; null until initVectorDb() has connected. Read through getConn(). */
let conn: lancedb.Connection | null = null;
/**
 * Open the LanceDB connection used by every other function in this module.
 *
 * The store lives in a `vectors` directory under Electron's `userData`
 * path. Until this resolves, getConn() throws.
 */
export async function initVectorDb(): Promise<void> {
  const userDataDir = app.getPath('userData');
  const storageDir = path.join(userDataDir, 'vectors');

  conn = await lancedb.connect(storageDir);

  console.log('[VectorDB] Connected at:', storageDir);
}
/**
 * Return the active LanceDB connection.
 *
 * @throws If initVectorDb() has not yet established a connection.
 */
function getConn(): lancedb.Connection {
  if (conn) {
    return conn;
  }
  throw new Error('[VectorDB] Not initialized. Call initVectorDb() first.');
}
/**
 * Embed note content and upsert the record into the LanceDB 'notes' table.
 *
 * Upsert strategy: delete-then-add.
 * table.delete(where) is a no-op when no rows match, so this is safe for
 * both first-time inserts and subsequent updates.
 *
 * On the very first call when the table does not yet exist, createTable
 * infers the Arrow schema from the initial record.
 *
 * @param noteId    Identifier of the note; used as the row key.
 * @param projectId Owning project, or null; stored as '' when null.
 * @param content   Text to embed and store alongside the vector.
 * @throws If the vector DB is not initialized, embedding fails, or a LanceDB
 *         operation fails — callers fire-and-forget via .catch().
 */
export async function upsertNoteEmbedding(
  noteId: string,
  projectId: string | null,
  content: string,
): Promise<void> {
  const c = getConn();
  const vector = await embedText(content);
  const record: NoteRecord = {
    id: noteId,
    projectId: projectId ?? '',
    content,
    vector,
  };

  const tableNames = await c.tableNames();
  if (!tableNames.includes('notes')) {
    // First embedding: createTable infers the Arrow schema from this record.
    // The vector dimension (1536 for text-embedding-3-small) is baked in here.
    await c.createTable('notes', [record]);
    console.log('[VectorDB] Created notes table');
    return;
  }

  const table = await c.openTable<NoteRecord>('notes');
  // Note IDs are expected to be UUID v4, but this function is exported and
  // accepts any string, so escape single quotes (SQL-style doubling) rather
  // than relying on every caller to uphold that invariant.
  const idLiteral = noteId.replace(/'/g, "''");
  await table.delete(`id = '${idLiteral}'`);
  await table.add([record]);
}
/**
 * One-time backfill: when the LanceDB 'notes' table is absent, embed every
 * note currently in SQLite via upsertNoteEmbedding.
 *
 * Notes are processed one at a time. A failure on one note is logged and
 * the loop continues with the next.
 *
 * NOTE: the "already migrated" check is table existence only. Once any note
 * has been embedded (creating the table), later launches skip this function,
 * so notes that failed here are not retried by it.
 */
export async function migrateNotesIfNeeded(): Promise<void> {
  const existingTables = await getConn().tableNames();
  if (existingTables.includes('notes')) {
    console.log('[VectorDB] Notes table exists, skipping migration');
    return;
  }

  const rows = getDb().select().from(notes).all();
  const total = rows.length;
  if (total === 0) {
    console.log('[VectorDB] No existing notes to migrate');
    return;
  }

  console.log(`[VectorDB] Migrating ${total} notes...`);

  let embedded = 0;
  for (const row of rows) {
    try {
      // Embed title and body together so either can match a search.
      const text = `${row.title}\n\n${row.content}`;
      await upsertNoteEmbedding(row.id, row.projectId ?? null, text);
      embedded += 1;
    } catch (err) {
      console.error(`[VectorDB] Failed to embed note ${row.id} during migration:`, err);
    }
  }

  console.log(`[VectorDB] Migration complete: ${embedded}/${total} notes embedded`);
}

View File

@@ -5,6 +5,7 @@ import { initDb } from './db';
import { appRouter } from './router';
import { createIPCHandler } from './ipc';
import { initAI } from './ai/provider';
import { initVectorDb, migrateNotesIfNeeded } from './db/vectordb';
// Import to trigger provider registration before initAI() runs
import './ai/copilot';
@@ -54,6 +55,10 @@ app.on('ready', () => {
createIPCHandler({ router: appRouter, windows: [win] });
// AI init is best-effort — never block window creation
initAI().catch((err) => console.error('[AI] Init failed:', err));
// Vector DB init + migration is best-effort — runs after window is shown
initVectorDb()
.then(() => migrateNotesIfNeeded())
.catch((err) => console.error('[VectorDB] Init or migration failed:', err));
});
// Quit when all windows are closed, except on macOS. There, it's common

View File

@@ -7,6 +7,7 @@ import { clients, projects, tasks, checkpoints, notes, taskComments } from '../d
import { getStore } from '../store';
import { saveTokenAndInit, hasActiveToken } from '../ai/provider';
import { orchestrate } from '../ai/orchestrator';
import { upsertNoteEmbedding } from '../db/vectordb';
import type { TRPCContext } from '../ipc';
const t = initTRPC.context<TRPCContext>().create();
@@ -406,7 +407,7 @@ const notesRouter = router({
create: publicProcedure
.input(z.object({ title: z.string(), content: z.string(), projectId: z.string().optional() }))
.mutation(({ input }) => {
.mutation(async ({ input }) => {
const id = crypto.randomUUID();
const now = Date.now();
getDb().insert(notes).values({
@@ -417,18 +418,37 @@ const notesRouter = router({
createdAt: now,
updatedAt: now,
}).run();
// Fire-and-forget: embed the note. Errors are logged, never thrown.
upsertNoteEmbedding(id, input.projectId ?? null, `${input.title}\n\n${input.content}`)
.catch((err) => console.error('[VectorDB] Failed to embed note on create:', err));
return { id };
}),
update: publicProcedure
.input(z.object({ id: z.string(), title: z.string().optional(), content: z.string().optional() }))
.mutation(({ input }) => {
.mutation(async ({ input }) => {
const set: Partial<{ title: string; content: string; updatedAt: number }> = {};
if (input.title !== undefined) set.title = input.title;
if (input.content !== undefined) set.content = input.content;
// Always update updatedAt
set.updatedAt = Date.now();
getDb().update(notes).set(set).where(eq(notes.id, input.id)).run();
// Re-embed if searchable text fields changed.
// Re-fetch from SQLite so the embedding reflects the full current note
// (the update may have changed only one of title or content).
if (input.title !== undefined || input.content !== undefined) {
const updated = getDb()
.select({ id: notes.id, projectId: notes.projectId, title: notes.title, content: notes.content })
.from(notes)
.where(eq(notes.id, input.id))
.all()[0];
if (updated) {
upsertNoteEmbedding(updated.id, updated.projectId ?? null, `${updated.title}\n\n${updated.content}`)
.catch((err) => console.error('[VectorDB] Failed to embed note on update:', err));
}
}
return null;
}),