feat: KBDB self-hosted 查詢 + embed 模組 + thin-shell 收窄 + search_workflow(code done 待端到端)

按 issue 分段標明(檔 #5/#8 改動交疊處無法乾淨拆檔,故併一個 commit):

#4 thin-shell §3.1 自力救濟階梯 + code-node 規則(純文檔/規則,code-node 零件未實作)
#5 KBDB source filter(json_extract metadata_json 零建表)+ 能力對照;documents 聚合與
   DELETE proxy 部分擱置等頂層 T8
#7 base embed 模組(kbdb/src/embed.ts)+ vectorize 開關(deploy/config/wrangler.toml 註解範本)
   + 語義查詢降級閉環(mode=semantic 未開→LIKE+capability_hint)
#8 部分(workflow-discovery):
   - KBDB /entries/search 加 base 通用 entry_type filter(entry-crud/embed/route/kbdb-proxy 透傳)
   - /webhooks/named 強制 description(空→400,訊息要求操盤 AI 據實寫一句)
   - 部署雙寫 entry_type=workflow embeddable entry(waitUntil 非阻塞,供 search)
   - cypher GET /workflows/search + MCP u6u_search_workflows(優先語意、降級 hint)
   - cypher POST /workflows/backfill-search-entries(無 desc 列出不編造)
   - GET /webhooks/named 補回 description/created_at 欄位(為 list 來源收斂備)

⚠️ tsc 綠 = code done,非完成(mindset §7 禁假綠):
- #7/#8 端到端待 leo21c 部署驗(Vectorize 需官方憑證、CC 跑不了)
- #8 ①-a(MCP deploy 改打 /webhooks/named)未做、MCP deploy 那半仍 404
- #8 端到端(強制填擋空/語義命中/租戶隔離/降級 hint)未驗

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
uncle6me-web
2026-06-27 17:52:52 +08:00
parent 013b55e97e
commit 934b9265d9
16 changed files with 610 additions and 33 deletions
+13 -1
View File
@@ -57,6 +57,7 @@ export interface ListEntriesFilter {
owner_id?: string;
parent_id?: string;
page_name?: string; // exact-match lookup (e.g. skill-/example- idempotency key)
source?: string; // filter by metadata_json.$.source (ingest envelope source.uri). issue #5.1
limit?: number;
offset?: number;
}
@@ -68,6 +69,9 @@ export async function listEntries(db: D1Database, f: ListEntriesFilter = {}): Pr
if (f.owner_id) { conds.push('owner_id = ?'); params.push(f.owner_id); }
if (f.parent_id) { conds.push('parent_id = ?'); params.push(f.parent_id); }
if (f.page_name) { conds.push('page_name = ?'); params.push(f.page_name); }
// source is queryable via SQLite json_extract on the existing metadata_json TEXT column —
// no new column / no migration (表不變鐵律). Per issue #5.1 (頂層化 source 成可查 filter).
if (f.source) { conds.push("json_extract(metadata_json, '$.source') = ?"); params.push(f.source); }
const where = conds.length ? `WHERE ${conds.join(' AND ')}` : '';
const limit = Math.min(f.limit ?? 100, 1000);
const offset = f.offset ?? 0;
@@ -107,10 +111,18 @@ export async function deleteEntry(db: D1Database, id: string): Promise<void> {
}
// D1 LIKE keyword search (base; semantic search is the optional embed module).
export async function searchEntries(db: D1Database, q: string, owner_id?: string, limit = 50): Promise<Entry[]> {
// entry_type: optional base filter (generic — caller passes any type, base stays type-agnostic).
export async function searchEntries(
db: D1Database,
q: string,
owner_id?: string,
entry_type?: string,
limit = 50,
): Promise<Entry[]> {
const conds = ['content LIKE ?'];
const params: unknown[] = [`%${q}%`];
if (owner_id) { conds.push('owner_id = ?'); params.push(owner_id); }
if (entry_type) { conds.push('entry_type = ?'); params.push(entry_type); }
const res = await db
.prepare(`SELECT * FROM entries WHERE ${conds.join(' AND ')} ORDER BY updated_at DESC LIMIT ?`)
.bind(...params, Math.min(limit, 200))
+46
View File
@@ -88,6 +88,52 @@ export async function createRecord(db: D1Database, input: CreateRecordInput): Pr
return { record_id: recordId, template_id: tpl.id, values: input.values };
}
/**
 * Update the slot values of an existing record (mira-dissolve T2.1, issue #6).
 *
 * "Deprecate by flipping a slot value": only `entries.content` of the slot's linked entry
 * changes; no table structure is altered, no column is added and no row is deleted.
 *
 * Per slot in `values`:
 * - slot already linked on the record → UPDATE the linked entry's content.
 * - slot allowed by the template but not yet on the record → create a `value` entry and
 *   link it through a new `entry_values` row.
 * - slot not allowed → the whole call is rejected.
 *
 * All slot names are validated BEFORE the first write, so a rejected call leaves the
 * record untouched (previously slots ahead of the invalid one were already written).
 *
 * @param db       D1 database handle.
 * @param recordId Record whose slots are updated.
 * @param values   Map of slot name → new content.
 * @returns The refreshed record, or `null` when the record has no `entry_values` rows.
 * @throws Error `slot not in template: <slot>` for the first slot that is not allowed.
 */
export async function updateRecord(
  db: D1Database,
  recordId: string,
  values: Record<string, string>,
): Promise<RecordResult | null> {
  // Existing links of this record: slot → entry_id, plus the template they belong to.
  const linkRes = await db
    .prepare(`SELECT slot_name, entry_id, template_id FROM entry_values WHERE record_id = ?`)
    .bind(recordId)
    .all<{ slot_name: string; entry_id: string; template_id: string }>();
  const links = linkRes.results ?? [];
  const firstLink = links[0];
  if (!firstLink) return null; // record does not exist

  const templateId = firstLink.template_id;
  const slotToEntry = new Map(links.map((r) => [r.slot_name, r.entry_id]));

  // Allowed slots come from the template; if the template row is missing, fall back to
  // the slots already present on the record.
  const tpl = await getTemplate(db, templateId);
  const allowed: string[] = tpl ? JSON.parse(tpl.slots_json) : [...slotToEntry.keys()];

  // Validate everything up front so an invalid slot cannot cause a partial update.
  const updates = Object.entries(values);
  const rejected = updates.find(([slot]) => !allowed.includes(slot));
  if (rejected) {
    throw new Error(`slot not in template: ${rejected[0]}`);
  }

  for (const [slot, content] of updates) {
    const entryId = slotToEntry.get(slot);
    if (entryId) {
      // Flip the slot value: update the linked entry's content (table structure untouched).
      await db
        .prepare(`UPDATE entries SET content = ?, updated_at = unixepoch() WHERE id = ?`)
        .bind(content, entryId)
        .run();
    } else {
      // Valid template slot not yet on this record → grow it (create entry + link).
      const entry = await createEntry(db, { content, entry_type: 'value' });
      await db
        .prepare(`INSERT INTO entry_values (id, record_id, template_id, slot_name, entry_id) VALUES (?, ?, ?, ?, ?)`)
        .bind(uid('ev'), recordId, templateId, slot, entry.id)
        .run();
    }
  }
  return getRecord(db, recordId);
}
export async function getRecord(db: D1Database, recordId: string): Promise<RecordResult | null> {
const res = await db
.prepare(
+119
View File
@@ -0,0 +1,119 @@
// KBDB optional embed module (issue #7 / mira-dissolve SDD T2.4).
//
// 鐵律對齊:
// - embedding 屬 **base 的 optional 模組**(非 graph/ingest)。CF 內建(Vectorize+AI),程式薄。
// - **不拆 repo,binding 開/關**:有 env.VECTORIZE + env.AI 才啟用;沒有 → base 維持 LIKE keyword,API 不變。
// - 不動三表結構(只標既有 entries.is_embedded / content_hash bookkeeping 欄;那些 base 從不讀,embed 才寫)。
// - 不對每個 block 地毯式 embed(精耕,非 RAG 一股腦灌):只 embed「被標記為 embeddable」的 entry
//   (wiki 段落 + graph node gloss)。標記方式=寫入時 metadata_json.embed === true(caller 顯式標)。
//
// 為何用 metadata flag 而非 entry_type 白名單:base 不該寫死「哪些 entry_type 該 embed」(那是上游語意,
// 會讓 base 知道 wiki/graph 概念,破壞解耦)。改由 caller(wiki/gloss 寫入端)顯式標 embed:true,
// base 只認這個通用旗標 → base 維持對內容語意無知。
import type { Bindings, Entry } from './types';
// Workers AI embedding model passed to env.AI.run. 768-dim, meant to match a Vectorize index created with dimensions=768.
// NOTE(review): the model name suggests an English-only model ("-en-"); if entry content is mostly Chinese,
// a multilingual embedding model may retrieve better — confirm. Switching models requires an index with matching dimensions.
const EMBED_MODEL = '@cf/baai/bge-base-en-v1.5';
/**
 * Whether the optional embed module is switched on.
 * It counts as enabled only when BOTH the Vectorize and the AI binding are present.
 */
export function embedEnabled(env: Bindings): boolean {
  const { VECTORIZE: index, AI: ai } = env;
  return Boolean(index) && Boolean(ai);
}
/**
 * Turn a piece of text into an embedding vector via the Workers AI binding.
 *
 * @returns The first vector of the model response, or `null` when the trimmed text is
 *          empty, the AI binding is absent, or the response carries no vector.
 */
async function embedText(env: Bindings, text: string): Promise<number[] | null> {
  const trimmed = (text ?? '').trim();
  const ai = env.AI;
  if (trimmed === '' || !ai) return null;

  const output = (await ai.run(EMBED_MODEL, { text: [trimmed] })) as { data: number[][] };
  const firstVector = output?.data?.[0];
  return firstVector ?? null;
}
/**
 * Selective embed-on-write.
 *
 * - Module disabled → no-op.
 * - Only entries flagged as embeddable (see `isEmbeddable`) are embedded; everything else
 *   is skipped.
 * - On success the vector is upserted under the entry id and `entries.is_embedded` is set to 1.
 *
 * Errors are not caught here; the visible callers wrap the call in `waitUntil(...).catch(...)`.
 *
 * @returns `true` when a vector was actually written, otherwise `false`.
 */
export async function embedOnWrite(env: Bindings, entry: Entry): Promise<boolean> {
  const eligible = embedEnabled(env) && isEmbeddable(entry);
  if (!eligible) return false;

  const vector = await embedText(env, entry.content ?? '');
  if (vector === null) return false;

  // Metadata stored next to the vector; semanticSearch filters on these same three keys.
  const metadata = {
    owner_id: entry.owner_id ?? '',
    entry_type: entry.entry_type,
    source: readSource(entry) ?? '',
  };
  await env.VECTORIZE!.upsert([{ id: entry.id, values: vector, metadata }]);

  // Bookkeeping flag on an existing column; the table structure is not changed.
  const markEmbedded = env.DB.prepare('UPDATE entries SET is_embedded = 1 WHERE id = ?');
  await markEmbedded.bind(entry.id).run();
  return true;
}
/** True only when the entry's metadata_json parses to an object whose `embed` is exactly `true`. */
function isEmbeddable(entry: Entry): boolean {
  return parseMeta(entry.metadata_json)?.embed === true;
}
/** Read `source` from the entry's metadata_json; `null` unless it is a string. */
function readSource(entry: Entry): string | null {
  const source = parseMeta(entry.metadata_json)?.source;
  if (typeof source !== 'string') return null;
  return source;
}
/**
 * Parse a metadata_json string.
 *
 * @returns The parsed value when it is a non-null object (arrays included); `null` for
 *          empty/null input, invalid JSON, or any primitive.
 */
function parseMeta(json: string | null): Record<string, unknown> | null {
  if (!json) return null;

  let parsed: unknown;
  try {
    parsed = JSON.parse(json);
  } catch {
    return null;
  }

  if (parsed === null || typeof parsed !== 'object') return null;
  return parsed as Record<string, unknown>;
}
/** One match returned by `semanticSearch`. */
export interface SemanticHit {
  /** Vector id; `embedOnWrite` upserts each vector under its entry id. */
  id: string;
  /** Score of the match as reported by the Vectorize query. */
  score: number;
  /** `owner_id` from the match metadata, when present. */
  owner_id?: string;
  /** `entry_type` from the match metadata, when present. */
  entry_type?: string;
  /** `source` from the match metadata, when present. */
  source?: string;
}
/**
 * Semantic search (mode: 'semantic').
 *
 * - Module disabled → returns `null`, so the caller can fall back to keyword search and
 *   report the missing capability.
 * - Query text that yields no vector (e.g. blank) → returns `[]`.
 * - `owner_id`, `source` and `entry_type` are passed as a Vectorize metadata filter; only
 *   the ones actually provided are included. `entry_type` is a generic filter — any value
 *   the caller passes is forwarded as-is.
 *
 * @param opts.topK Number of matches requested; defaults to 20 and is capped at 100.
 */
export async function semanticSearch(
  env: Bindings,
  q: string,
  opts: { owner_id?: string; source?: string; entry_type?: string; topK?: number } = {},
): Promise<SemanticHit[] | null> {
  if (!embedEnabled(env)) return null;

  const queryVector = await embedText(env, q);
  if (queryVector === null) return [];

  const filter: Record<string, string> = {};
  for (const key of ['owner_id', 'source', 'entry_type'] as const) {
    const wanted = opts[key];
    if (wanted) filter[key] = wanted;
  }
  const hasFilter = Object.keys(filter).length > 0;
  const topK = Math.min(opts.topK ?? 20, 100);

  const result = await env.VECTORIZE!.query(
    queryVector,
    hasFilter ? { topK, returnMetadata: 'indexed', filter } : { topK, returnMetadata: 'indexed' },
  );

  const matches = result.matches ?? [];
  return matches.map(({ id, score, metadata }) => ({
    id,
    score,
    owner_id: metadata?.owner_id as string | undefined,
    entry_type: metadata?.entry_type as string | undefined,
    source: metadata?.source as string | undefined,
  }));
}
+47 -4
View File
@@ -1,4 +1,4 @@
// Entries route — atomic data + tree (project/workflow). Base, no embed/triplet.
// Entries route — atomic data + tree (project/workflow). Base; embed is OPTIONAL (issue #7).
import { Hono } from 'hono';
import type { Bindings } from '../types';
import {
@@ -9,6 +9,7 @@ import {
deleteEntry,
searchEntries,
} from '../actions/entry-crud';
import { embedEnabled, embedOnWrite, semanticSearch } from '../embed';
export const entryRoutes = new Hono<{ Bindings: Bindings }>();
@@ -17,29 +18,63 @@ entryRoutes.post('/', async (c) => {
const body = await c.req.json().catch(() => null);
if (!body || !body.entry_type) return c.json({ success: false, error: 'entry_type required' }, 400);
const entry = await createEntry(c.env.DB, body);
// embed-on-write (#7 / #5 第4點):模組開 + entry 標 embed:true 才做;fire-and-forget,不阻塞回應、失敗不致命。
if (embedEnabled(c.env)) c.executionCtx.waitUntil(embedOnWrite(c.env, entry).catch(() => {}));
return c.json({ success: true, entry });
});
// GET /entries — list with filters (entry_type, owner_id, parent_id, page_name, source)
//   e.g. list workflows under a project: ?parent_id=PROJECT&entry_type=workflow
//   e.g. get one by idempotency key:     ?page_name=skill-rag_with_arcrun
//   e.g. filter by ingest source:        ?source=logseq://vault/foo.md (issue #5.1)
// limit / offset are honoured only when they are non-negative integers; anything else
// (non-numeric, fractional, negative) is ignored so listEntries applies its defaults.
entryRoutes.get('/', async (c) => {
  // Number('abc') is NaN, and a negative value passes straight through the
  // Math.min(limit, 1000) cap in listEntries (SQLite treats a negative LIMIT as "no limit").
  const pagingParam = (name: 'limit' | 'offset'): number | undefined => {
    const raw = c.req.query(name);
    if (!raw) return undefined;
    const parsed = Number(raw);
    return Number.isInteger(parsed) && parsed >= 0 ? parsed : undefined;
  };

  const entries = await listEntries(c.env.DB, {
    entry_type: c.req.query('entry_type') || undefined,
    owner_id: c.req.query('owner_id') || undefined,
    parent_id: c.req.query('parent_id') || undefined,
    page_name: c.req.query('page_name') || undefined,
    source: c.req.query('source') || undefined,
    limit: pagingParam('limit'),
    offset: pagingParam('offset'),
  });
  return c.json({ success: true, entries, count: entries.length });
});
// GET /entries/search?q=...&owner_id=...&source=...&entry_type=...&mode=keyword|semantic
// - mode=keyword (default): D1 LIKE search.
// - mode=semantic: needs the embed module (Vectorize + AI bindings). When the module is
//   off, the request is downgraded to keyword search and the response carries
//   requested_mode + capability_hint so the caller knows semantic search was not used (#7).
// - entry_type: generic filter — any type the caller passes (e.g. workflow) is forwarded.
// NOTE(review): `source` is only forwarded to semanticSearch; searchEntries takes no source
// argument, so keyword mode (including the downgrade path) ignores it — confirm this is intended.
entryRoutes.get('/search', async (c) => {
  const q = c.req.query('q');
  if (!q) return c.json({ success: false, error: 'q required' }, 400);

  const owner_id = c.req.query('owner_id') || undefined;
  const source = c.req.query('source') || undefined;
  const entry_type = c.req.query('entry_type') || undefined;
  const mode = c.req.query('mode') === 'semantic' ? 'semantic' : 'keyword';

  if (mode === 'semantic') {
    const hits = await semanticSearch(c.env, q, { owner_id, source, entry_type });
    if (hits === null) {
      // Module is off: downgrade to keyword search and say so, rather than pretending
      // the results are semantic.
      const entries = await searchEntries(c.env.DB, q, owner_id, entry_type);
      return c.json({
        success: true,
        entries,
        count: entries.length,
        mode: 'keyword',
        requested_mode: 'semantic',
        capability_hint:
          '語義查詢需先開 vectorize(embed 模組)。叫 CC「幫我開語義查詢」即可(設 kbdb_embed:true + redeploy)。本次已降級關鍵字搜尋。',
      });
    }
    // Hydrate vector hits into full entries so the response shape matches keyword mode;
    // hits whose entry no longer exists are dropped.
    const entries = (await Promise.all(hits.map((h) => getEntry(c.env.DB, h.id)))).filter(
      (e): e is NonNullable<typeof e> => e !== null,
    );
    return c.json({ success: true, entries, count: entries.length, mode: 'semantic' });
  }

  const entries = await searchEntries(c.env.DB, q, owner_id, entry_type);
  return c.json({ success: true, entries, count: entries.length, mode: 'keyword' });
});
@@ -55,11 +90,19 @@ entryRoutes.patch('/:id', async (c) => {
const body = await c.req.json().catch(() => ({}));
const entry = await updateEntry(c.env.DB, c.req.param('id'), body);
if (!entry) return c.json({ success: false, error: 'not found' }, 404);
// 內容改了 → 重 embed(保持向量新鮮)。embedOnWrite 內部自會檢查模組開 + entry 是否 embeddable。
if (embedEnabled(c.env) && body.content !== undefined) {
c.executionCtx.waitUntil(embedOnWrite(c.env, entry).catch(() => {}));
}
return c.json({ success: true, entry });
});
// DELETE /entries/:id
// Deletes the row first, then (when the embed module is on) schedules removal of the
// matching vector so no orphan vector is left behind. Vector cleanup is best-effort and
// never fails the request. Doing it after the row delete means a failed row delete no
// longer removes the vector of an entry that still exists.
entryRoutes.delete('/:id', async (c) => {
  const id = c.req.param('id');
  await deleteEntry(c.env.DB, id);

  const index = c.env.VECTORIZE;
  if (index && embedEnabled(c.env)) {
    c.executionCtx.waitUntil(index.deleteByIds([id]).then(() => {}).catch(() => {}));
  }
  return c.json({ success: true });
});
+17 -1
View File
@@ -1,7 +1,7 @@
// Records route — structured records (entry_values composed by a template).
import { Hono } from 'hono';
import type { Bindings } from '../types';
import { createRecord, getRecord, searchByTemplate } from '../actions/record-crud';
import { createRecord, getRecord, searchByTemplate, updateRecord } from '../actions/record-crud';
export const recordRoutes = new Hono<{ Bindings: Bindings }>();
@@ -31,3 +31,19 @@ recordRoutes.get('/:recordId', async (c) => {
if (!rec) return c.json({ success: false, error: 'not found' }, 404);
return c.json({ success: true, record: rec });
});
// PATCH /records/:recordId — body { values: { slot: content } }.
// Updates slot values of an existing record (mira-dissolve T2.1 / issue #6): deprecating
// means flipping a slot value; table structure is left untouched.
// Responses: 400 when `values` is missing or not an object, 404 when updateRecord returns
// null, 400 with the error message when updateRecord throws.
recordRoutes.patch('/:recordId', async (c) => {
  const payload = await c.req.json().catch(() => null);
  const values = payload?.values;
  if (!values || typeof values !== 'object') {
    return c.json({ success: false, error: 'values required' }, 400);
  }

  try {
    const record = await updateRecord(c.env.DB, c.req.param('recordId'), values);
    return record
      ? c.json({ success: true, record })
      : c.json({ success: false, error: 'not found' }, 404);
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    return c.json({ success: false, error: message }, 400);
  }
});
+5
View File
@@ -4,6 +4,11 @@
/** Worker environment bindings shared by the routes and the embed module. */
export type Bindings = {
  // D1 database handle used for all entry/record queries.
  DB: D1Database;
  ENVIRONMENT: string;
  // Optional embed module (issue #7 / SDD T2.4). Present ONLY when the self-host opened
  // semantic search (kbdb_embed:true → deploy injects [[vectorize]] + [ai]). Base never
  // requires them; code checks `if (env.VECTORIZE && env.AI)` before touching embed.
  // Vectorize index binding (vector upsert / query / delete).
  VECTORIZE?: VectorizeIndex;
  // Workers AI binding (used to compute embeddings).
  AI?: Ai;
};
export type EntryType =