feat: KBDB self-hosted 查詢 + embed 模組 + thin-shell 收窄 + search_workflow(code done 待端到端)
按 issue 分段標明(檔 #5/#8 改動交疊處無法乾淨拆檔,故併一個 commit): #4 thin-shell §3.1 自力救濟階梯 + code-node 規則(純文檔/規則,code-node 零件未實作) #5 KBDB source filter(json_extract metadata_json 零建表)+ 能力對照;documents 聚合與 DELETE proxy 部分擱置等頂層 T8 #7 base embed 模組(kbdb/src/embed.ts)+ vectorize 開關(deploy/config/wrangler.toml 註解範本) + 語義查詢降級閉環(mode=semantic 未開→LIKE+capability_hint) #8 部分(workflow-discovery): - KBDB /entries/search 加 base 通用 entry_type filter(entry-crud/embed/route/kbdb-proxy 透傳) - /webhooks/named 強制 description(空→400,訊息要求操盤 AI 據實寫一句) - 部署雙寫 entry_type=workflow embeddable entry(waitUntil 非阻塞,供 search) - cypher GET /workflows/search + MCP u6u_search_workflows(優先語意、降級 hint) - cypher POST /workflows/backfill-search-entries(無 desc 列出不編造) - GET /webhooks/named 補回 description/created_at 欄位(為 list 來源收斂備) ⚠️ tsc 綠 = code done,非完成(mindset §7 禁假綠): - #7/#8 端到端待 leo21c 部署驗(Vectorize 需官方憑證、CC 跑不了) - #8 ①-a(MCP deploy 改打 /webhooks/named)未做、MCP deploy 那半仍 404 - #8 端到端(強制填擋空/語義命中/租戶隔離/降級 hint)未驗 Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -57,6 +57,7 @@ export interface ListEntriesFilter {
|
||||
owner_id?: string;
|
||||
parent_id?: string;
|
||||
page_name?: string; // exact-match lookup (e.g. skill-/example- idempotency key)
|
||||
source?: string; // filter by metadata_json.$.source (ingest envelope source.uri). issue #5.1
|
||||
limit?: number;
|
||||
offset?: number;
|
||||
}
|
||||
@@ -68,6 +69,9 @@ export async function listEntries(db: D1Database, f: ListEntriesFilter = {}): Pr
|
||||
if (f.owner_id) { conds.push('owner_id = ?'); params.push(f.owner_id); }
|
||||
if (f.parent_id) { conds.push('parent_id = ?'); params.push(f.parent_id); }
|
||||
if (f.page_name) { conds.push('page_name = ?'); params.push(f.page_name); }
|
||||
// source is queryable via SQLite json_extract on the existing metadata_json TEXT column —
|
||||
// no new column / no migration (表不變鐵律). Per issue #5.1 (頂層化 source 成可查 filter).
|
||||
if (f.source) { conds.push("json_extract(metadata_json, '$.source') = ?"); params.push(f.source); }
|
||||
const where = conds.length ? `WHERE ${conds.join(' AND ')}` : '';
|
||||
const limit = Math.min(f.limit ?? 100, 1000);
|
||||
const offset = f.offset ?? 0;
|
||||
@@ -107,10 +111,18 @@ export async function deleteEntry(db: D1Database, id: string): Promise<void> {
|
||||
}
|
||||
|
||||
// D1 LIKE keyword search (base; semantic search is the optional embed module).
|
||||
export async function searchEntries(db: D1Database, q: string, owner_id?: string, limit = 50): Promise<Entry[]> {
|
||||
// entry_type: optional base filter (generic — caller passes any type, base stays type-agnostic).
|
||||
export async function searchEntries(
|
||||
db: D1Database,
|
||||
q: string,
|
||||
owner_id?: string,
|
||||
entry_type?: string,
|
||||
limit = 50,
|
||||
): Promise<Entry[]> {
|
||||
const conds = ['content LIKE ?'];
|
||||
const params: unknown[] = [`%${q}%`];
|
||||
if (owner_id) { conds.push('owner_id = ?'); params.push(owner_id); }
|
||||
if (entry_type) { conds.push('entry_type = ?'); params.push(entry_type); }
|
||||
const res = await db
|
||||
.prepare(`SELECT * FROM entries WHERE ${conds.join(' AND ')} ORDER BY updated_at DESC LIMIT ?`)
|
||||
.bind(...params, Math.min(limit, 200))
|
||||
|
||||
@@ -88,6 +88,52 @@ export async function createRecord(db: D1Database, input: CreateRecordInput): Pr
|
||||
return { record_id: recordId, template_id: tpl.id, values: input.values };
|
||||
}
|
||||
|
||||
/**
 * Update the slot values of an existing record (mira-dissolve T2.1, issue #6).
 *
 * "Deprecate by flipping a slot value": only `entries.content` of the slot's entry is
 * changed; no table structure is altered and no rows are deleted.
 *
 * - slot already on the record: UPDATE the linked entry's content.
 * - slot allowed by the template but not yet on the record: create an entry and link it.
 * - slot not allowed: reject.
 *
 * All slot names are validated BEFORE any write, so a request containing one invalid
 * slot no longer leaves the earlier slots of the same request already written.
 *
 * @param db       D1 database handle.
 * @param recordId id of the record whose slots are updated.
 * @param values   map of slot name to new content.
 * @returns the record as read back by `getRecord`, or null if no `entry_values` row
 *          exists for `recordId`.
 * @throws Error `slot not in template: <slot>` for the first slot that is not allowed.
 */
export async function updateRecord(
  db: D1Database,
  recordId: string,
  values: Record<string, string>,
): Promise<RecordResult | null> {
  // Existing slot → entry_id (+ template_id) links for this record.
  const linkRes = await db
    .prepare(`SELECT slot_name, entry_id, template_id FROM entry_values WHERE record_id = ?`)
    .bind(recordId)
    .all<{ slot_name: string; entry_id: string; template_id: string }>();
  const links = linkRes.results ?? [];
  if (links.length === 0) return null; // no links → record does not exist

  const templateId = links[0].template_id;
  const slotToEntry = new Map(links.map((r) => [r.slot_name, r.entry_id]));

  // Allowed slots come from the template; if the template row is missing, fall back to
  // the slots already present on the record.
  // NOTE(review): assumes slots_json is a JSON array of slot-name strings — confirm against createRecord.
  const tpl = await getTemplate(db, templateId);
  const allowed: string[] = tpl ? JSON.parse(tpl.slots_json) : [...slotToEntry.keys()];

  // Validate everything up front so a rejected request performs no writes at all.
  const requested = Object.entries(values);
  const rejected = requested.find(([slot]) => !allowed.includes(slot));
  if (rejected) {
    throw new Error(`slot not in template: ${rejected[0]}`);
  }

  for (const [slot, content] of requested) {
    const entryId = slotToEntry.get(slot);
    if (entryId) {
      // Flip the slot value: update the linked entry's content (table structure untouched).
      await db
        .prepare(`UPDATE entries SET content = ?, updated_at = unixepoch() WHERE id = ?`)
        .bind(content, entryId)
        .run();
    } else {
      // Valid template slot not yet on this record → grow it (create entry + link).
      const entry = await createEntry(db, { content, entry_type: 'value' });
      await db
        .prepare(`INSERT INTO entry_values (id, record_id, template_id, slot_name, entry_id) VALUES (?, ?, ?, ?, ?)`)
        .bind(uid('ev'), recordId, templateId, slot, entry.id)
        .run();
    }
  }
  return getRecord(db, recordId);
}
|
||||
|
||||
export async function getRecord(db: D1Database, recordId: string): Promise<RecordResult | null> {
|
||||
const res = await db
|
||||
.prepare(
|
||||
|
||||
@@ -0,0 +1,119 @@
|
||||
// KBDB optional embed module (issue #7 / mira-dissolve SDD T2.4).
|
||||
//
|
||||
// 鐵律對齊:
|
||||
// - embedding 屬 **base 的 optional 模組**(非 graph/ingest)。CF 內建(Vectorize+AI),程式薄。
|
||||
// - **不拆 repo,binding 開/關**:有 env.VECTORIZE + env.AI 才啟用;沒有 → base 維持 LIKE keyword,API 不變。
|
||||
// - 不動三表結構(只標既有 entries.is_embedded / content_hash bookkeeping 欄;那些 base 從不讀,embed 才寫)。
|
||||
// - 不對每個 block 地毯式 embed(精耕,非 RAG 一股腦灌):只 embed「被標記為 embeddable」的 entry
|
||||
// (wiki 段落 + graph node gloss)。標記方式=寫入時 metadata_json.embed === true(caller 顯式標)。
|
||||
//
|
||||
// 為何用 metadata flag 而非 entry_type 白名單:base 不該寫死「哪些 entry_type 該 embed」(那是上游語意,
|
||||
// 會讓 base 知道 wiki/graph 概念,破壞解耦)。改由 caller(wiki/gloss 寫入端)顯式標 embed:true,
|
||||
// base 只認這個通用旗標 → base 維持對內容語意無知。
|
||||
|
||||
import type { Bindings, Entry } from './types';
|
||||
|
||||
const EMBED_MODEL = '@cf/baai/bge-base-en-v1.5'; // 768-dim; must line up with a Vectorize index created with dimensions=768
|
||||
|
||||
/**
 * Reports whether the optional embed module is switched on.
 * It counts as enabled only when BOTH the `VECTORIZE` and `AI` bindings are present on `env`.
 */
export function embedEnabled(env: Bindings): boolean {
  const hasIndex = Boolean(env.VECTORIZE);
  const hasModel = Boolean(env.AI);
  return hasIndex && hasModel;
}
|
||||
|
||||
/**
 * Turns a piece of text into an embedding vector using the Workers AI model named by EMBED_MODEL.
 *
 * @returns the first vector of the model response, or null when the trimmed text is empty,
 *          the AI binding is absent, or the response carries no vector.
 */
async function embedText(env: Bindings, text: string): Promise<number[] | null> {
  const ai = env.AI;
  const cleaned = (text ?? '').trim();
  if (cleaned.length === 0 || !ai) {
    return null;
  }
  const output = (await ai.run(EMBED_MODEL, { text: [cleaned] })) as { data: number[][] };
  const firstVector = output?.data?.[0];
  return firstVector ?? null;
}
|
||||
|
||||
/**
 * Selective embed-on-write.
 *
 * - Module not enabled → returns false without doing anything.
 * - Entry not flagged embeddable (`metadata_json.embed === true`) → returns false.
 * - No vector could be produced for the content → returns false.
 * - Otherwise upserts the vector under the entry id, with `owner_id`, `entry_type` and
 *   `source` as vector metadata, then sets `entries.is_embedded = 1` for that entry.
 *
 * This function does not catch errors from the AI, Vectorize or D1 calls; callers that
 * want fire-and-forget behaviour must wrap it themselves.
 *
 * NOTE(review): filtering on these metadata fields in Vectorize presumably requires a
 * metadata index per property — confirm the indexes exist on the target index.
 *
 * @returns true only when a vector was actually written.
 */
export async function embedOnWrite(env: Bindings, entry: Entry): Promise<boolean> {
  const shouldEmbed = embedEnabled(env) && isEmbeddable(entry);
  if (!shouldEmbed) return false;

  const vector = await embedText(env, entry.content ?? '');
  if (!vector) return false;

  // Metadata stored next to the vector so searches can filter by tenant, type and source.
  const metadata = {
    owner_id: entry.owner_id ?? '',
    entry_type: entry.entry_type,
    source: readSource(entry) ?? '',
  };
  await env.VECTORIZE!.upsert([{ id: entry.id, values: vector, metadata }]);

  // Bookkeeping on an existing column; the table structure is not changed.
  const markEmbedded = env.DB.prepare('UPDATE entries SET is_embedded = 1 WHERE id = ?').bind(entry.id);
  await markEmbedded.run();
  return true;
}
|
||||
|
||||
/**
 * Whether the entry should be embedded: true only when its parsed `metadata_json`
 * carries `embed` set to exactly `true`.
 */
function isEmbeddable(entry: Entry): boolean {
  const flag = parseMeta(entry.metadata_json)?.embed;
  return flag === true;
}
|
||||
|
||||
/**
 * Reads `source` from the entry's parsed `metadata_json`.
 *
 * @returns the value when it is a string, otherwise null.
 */
function readSource(entry: Entry): string | null {
  const candidate = parseMeta(entry.metadata_json)?.source;
  if (typeof candidate !== 'string') {
    return null;
  }
  return candidate;
}
|
||||
|
||||
/**
 * Parses a `metadata_json` string into a plain key/value object.
 *
 * @param json raw JSON text, or null.
 * @returns the parsed object, or null when the input is empty, is not valid JSON, or
 *          does not decode to a non-array object (primitives, `null` and arrays are
 *          rejected so the result really is a record).
 */
function parseMeta(json: string | null): Record<string, unknown> | null {
  if (!json) return null;

  let parsed: unknown;
  try {
    parsed = JSON.parse(json);
  } catch {
    // Malformed metadata is treated the same as absent metadata.
    return null;
  }

  if (parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)) {
    return null;
  }
  return parsed as Record<string, unknown>;
}
|
||||
|
||||
/** One match returned by `semanticSearch`. */
export interface SemanticHit {
  /** Id of the matched vector (the entry id it was upserted under). */
  id: string;
  /** Similarity score reported by the vector index for this match. */
  score: number;
  /** `owner_id` read from the match metadata, when present. */
  owner_id?: string;
  /** `entry_type` read from the match metadata, when present. */
  entry_type?: string;
  /** `source` read from the match metadata, when present. */
  source?: string;
}
|
||||
|
||||
/**
 * Semantic search (mode: 'semantic').
 *
 * - Embed module not enabled → returns null so the caller can fall back to keyword
 *   search and report the missing capability.
 * - Query text yields no vector (e.g. blank query) → returns an empty list.
 * - Otherwise queries the vector index, optionally filtered by `owner_id`, `source`
 *   and `entry_type`. `entry_type` is a generic filter: any value is passed through.
 *
 * `topK` defaults to 20 and is clamped to an integer in 1..100; non-finite values fall
 * back to the default, so a bad caller value never reaches the index.
 *
 * NOTE(review): Vectorize presumably only filters on (and, with
 * `returnMetadata: 'indexed'`, only returns) properties that have a metadata index —
 * confirm indexes exist for owner_id / source / entry_type on the target index.
 *
 * @param env  worker bindings.
 * @param q    query text to embed.
 * @param opts optional filters and result count.
 * @returns matches with their score and metadata, `[]`, or null when the module is off.
 */
export async function semanticSearch(
  env: Bindings,
  q: string,
  opts: { owner_id?: string; source?: string; entry_type?: string; topK?: number } = {},
): Promise<SemanticHit[] | null> {
  if (!embedEnabled(env)) return null;

  const vec = await embedText(env, q);
  if (!vec) return [];

  // Only non-empty filter values are forwarded.
  const filter: Record<string, string> = {};
  if (opts.owner_id) filter.owner_id = opts.owner_id;
  if (opts.source) filter.source = opts.source;
  if (opts.entry_type) filter.entry_type = opts.entry_type;

  const requestedTopK = opts.topK ?? 20;
  const topK = Number.isFinite(requestedTopK)
    ? Math.min(Math.max(Math.trunc(requestedTopK), 1), 100)
    : 20;

  const res = await env.VECTORIZE!.query(vec, {
    topK,
    returnMetadata: 'indexed',
    ...(Object.keys(filter).length ? { filter } : {}),
  });

  return (res.matches ?? []).map((m) => ({
    id: m.id,
    score: m.score,
    owner_id: m.metadata?.owner_id as string | undefined,
    entry_type: m.metadata?.entry_type as string | undefined,
    source: m.metadata?.source as string | undefined,
  }));
}
|
||||
@@ -1,4 +1,4 @@
|
||||
// Entries route — atomic data + tree (project/workflow). Base, no embed/triplet.
|
||||
// Entries route — atomic data + tree (project/workflow). Base; embed is OPTIONAL (issue #7).
|
||||
import { Hono } from 'hono';
|
||||
import type { Bindings } from '../types';
|
||||
import {
|
||||
@@ -9,6 +9,7 @@ import {
|
||||
deleteEntry,
|
||||
searchEntries,
|
||||
} from '../actions/entry-crud';
|
||||
import { embedEnabled, embedOnWrite, semanticSearch } from '../embed';
|
||||
|
||||
export const entryRoutes = new Hono<{ Bindings: Bindings }>();
|
||||
|
||||
@@ -17,29 +18,63 @@ entryRoutes.post('/', async (c) => {
|
||||
const body = await c.req.json().catch(() => null);
|
||||
if (!body || !body.entry_type) return c.json({ success: false, error: 'entry_type required' }, 400);
|
||||
const entry = await createEntry(c.env.DB, body);
|
||||
// embed-on-write (#7 / #5 第4點):模組開 + entry 標 embed:true 才做;fire-and-forget,不阻塞回應、失敗不致命。
|
||||
if (embedEnabled(c.env)) c.executionCtx.waitUntil(embedOnWrite(c.env, entry).catch(() => {}));
|
||||
return c.json({ success: true, entry });
|
||||
});
|
||||
|
||||
// GET /entries — list with filters (entry_type, owner_id, parent_id, page_name, source)
//   e.g. list workflows under a project: ?parent_id=PROJECT&entry_type=workflow
//   e.g. get one by idempotency key:     ?page_name=skill-rag_with_arcrun
//   e.g. filter by ingest source:        ?source=logseq://vault/foo.md (issue #5.1)
// limit / offset are honoured only when they are non-negative integers; anything else
// (non-numeric, negative, fractional) is ignored so listEntries applies its defaults.
// This matters because listEntries only caps the upper bound of limit, and a negative
// LIMIT in SQLite means "no limit".
entryRoutes.get('/', async (c) => {
  // Empty query values are treated as "not provided".
  const textParam = (name: string): string | undefined => c.req.query(name) || undefined;

  // Optional non-negative integer query parameter; invalid input yields undefined.
  const intParam = (name: string): number | undefined => {
    const raw = c.req.query(name);
    if (!raw) return undefined;
    const parsed = Number(raw);
    return Number.isInteger(parsed) && parsed >= 0 ? parsed : undefined;
  };

  const entries = await listEntries(c.env.DB, {
    entry_type: textParam('entry_type'),
    owner_id: textParam('owner_id'),
    parent_id: textParam('parent_id'),
    page_name: textParam('page_name'),
    source: textParam('source'),
    limit: intParam('limit'),
    offset: intParam('offset'),
  });
  return c.json({ success: true, entries, count: entries.length });
});
|
||||
|
||||
// GET /entries/search?q=...&owner_id=...&source=...&entry_type=...&mode=keyword|semantic
//  - mode=keyword (default): D1 LIKE search; always available.
//  - mode=semantic: needs the embed module (Vectorize + AI bindings). When it is off the
//    request is downgraded to keyword search and the response carries a capability_hint
//    plus requested_mode so the caller knows semantic search did not run (#7).
//  - entry_type: generic filter, any value is passed through (workflow-discovery Q4).
//  - source: honoured in both modes. Semantic mode passes it to the vector query; keyword
//    mode filters the rows returned by searchEntries on metadata_json.source.
//    NOTE(review): the keyword-side filter runs AFTER searchEntries' own LIMIT, so it can
//    return fewer rows than exist; pushing the condition into searchEntries' SQL would be
//    the complete fix.
entryRoutes.get('/search', async (c) => {
  const q = c.req.query('q');
  if (!q) return c.json({ success: false, error: 'q required' }, 400);

  const owner_id = c.req.query('owner_id') || undefined;
  const source = c.req.query('source') || undefined;
  const entry_type = c.req.query('entry_type') || undefined;
  const mode = c.req.query('mode') === 'semantic' ? 'semantic' : 'keyword';

  // Extracts metadata_json.source; malformed or non-object metadata yields undefined.
  const sourceOf = (metadataJson: string | null | undefined): unknown => {
    if (!metadataJson) return undefined;
    try {
      const meta: unknown = JSON.parse(metadataJson);
      return meta !== null && typeof meta === 'object'
        ? (meta as Record<string, unknown>).source
        : undefined;
    } catch {
      return undefined;
    }
  };

  // Keyword search shared by the default path and the semantic fallback.
  const keywordSearch = async () => {
    const found = await searchEntries(c.env.DB, q, owner_id, entry_type);
    return source ? found.filter((e) => sourceOf(e.metadata_json) === source) : found;
  };

  if (mode === 'semantic') {
    const hits = await semanticSearch(c.env, q, { owner_id, source, entry_type });
    if (hits === null) {
      // Module is off: downgrade honestly instead of pretending semantic search ran.
      const entries = await keywordSearch();
      return c.json({
        success: true,
        entries,
        count: entries.length,
        mode: 'keyword',
        requested_mode: 'semantic',
        capability_hint:
          '語義查詢需先開 vectorize(embed 模組)。叫 CC「幫我開語義查詢」即可(設 kbdb_embed:true + redeploy)。本次已降級關鍵字搜尋。',
      });
    }
    // Hydrate vector hits into full entries so the response shape matches keyword mode;
    // hits whose entry no longer exists are dropped.
    const hydrated = await Promise.all(hits.map((h) => getEntry(c.env.DB, h.id)));
    const entries = hydrated.filter((e): e is NonNullable<typeof e> => e !== null);
    return c.json({ success: true, entries, count: entries.length, mode: 'semantic' });
  }

  const entries = await keywordSearch();
  return c.json({ success: true, entries, count: entries.length, mode: 'keyword' });
});
|
||||
|
||||
@@ -55,11 +90,19 @@ entryRoutes.patch('/:id', async (c) => {
|
||||
const body = await c.req.json().catch(() => ({}));
|
||||
const entry = await updateEntry(c.env.DB, c.req.param('id'), body);
|
||||
if (!entry) return c.json({ success: false, error: 'not found' }, 404);
|
||||
// 內容改了 → 重 embed(保持向量新鮮)。embedOnWrite 內部自會檢查模組開 + entry 是否 embeddable。
|
||||
if (embedEnabled(c.env) && body.content !== undefined) {
|
||||
c.executionCtx.waitUntil(embedOnWrite(c.env, entry).catch(() => {}));
|
||||
}
|
||||
return c.json({ success: true, entry });
|
||||
});
|
||||
|
||||
// DELETE /entries/:id
// Deletes the D1 row first and only then schedules removal of the matching vector, so a
// failed row delete can no longer leave a live entry whose vector is already gone.
// The vector delete stays best-effort: it runs in waitUntil and its errors are ignored.
entryRoutes.delete('/:id', async (c) => {
  const id = c.req.param('id');
  await deleteEntry(c.env.DB, id);

  const index = c.env.VECTORIZE;
  if (index && embedEnabled(c.env)) {
    // Module is on → also drop the vector to avoid orphans.
    c.executionCtx.waitUntil(
      index
        .deleteByIds([id])
        .then(() => {})
        .catch(() => {}),
    );
  }
  return c.json({ success: true });
});
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
// Records route — structured records (entry_values composed by a template).
|
||||
import { Hono } from 'hono';
|
||||
import type { Bindings } from '../types';
|
||||
import { createRecord, getRecord, searchByTemplate } from '../actions/record-crud';
|
||||
import { createRecord, getRecord, searchByTemplate, updateRecord } from '../actions/record-crud';
|
||||
|
||||
export const recordRoutes = new Hono<{ Bindings: Bindings }>();
|
||||
|
||||
@@ -31,3 +31,19 @@ recordRoutes.get('/:recordId', async (c) => {
|
||||
if (!rec) return c.json({ success: false, error: 'not found' }, 404);
|
||||
return c.json({ success: true, record: rec });
|
||||
});
|
||||
|
||||
// PATCH /records/:recordId — body { values: { slot: content } }
// Updates slot values of an existing record (mira-dissolve T2.1 / issue #6).
//   400 "values required" when the body is unreadable or has no object `values`.
//   404 "not found" when updateRecord returns null.
//   400 with the thrown message when updateRecord throws.
recordRoutes.patch('/:recordId', async (c) => {
  const payload = await c.req.json().catch(() => null);
  const values = payload ? payload.values : undefined;
  if (!values || typeof values !== 'object') {
    return c.json({ success: false, error: 'values required' }, 400);
  }

  try {
    const updated = await updateRecord(c.env.DB, c.req.param('recordId'), values);
    return updated
      ? c.json({ success: true, record: updated })
      : c.json({ success: false, error: 'not found' }, 404);
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    return c.json({ success: false, error: message }, 400);
  }
});
|
||||
|
||||
@@ -4,6 +4,11 @@
|
||||
export type Bindings = {
  DB: D1Database;
  ENVIRONMENT: string;
  /**
   * Optional embed module (issue #7 / SDD T2.4): Vectorize index binding.
   * Present only when the self-host opened semantic search (kbdb_embed:true → deploy
   * injects [[vectorize]] + [ai]). Base never requires it; code must check that both
   * VECTORIZE and AI are set before touching embed.
   */
  VECTORIZE?: VectorizeIndex;
  /** Optional embed module: Workers AI binding, injected together with VECTORIZE. */
  AI?: Ai;
};
|
||||
|
||||
export type EntryType =
|
||||
|
||||
@@ -14,3 +14,16 @@ database_id = "0c580910-e00b-4f8e-9c57-ac54ea52242f" # 官方 prod D1(arcrun-
|
||||
|
||||
[vars]
|
||||
ENVIRONMENT = "production"
|
||||
|
||||
# ── Optional embed module (issue #7 / SDD T2.4) ────────────────────────────────
|
||||
# Base 預設不開(free-tier 友善)。self-host 開語義查詢時,deploy.ts 偵測 config kbdb_embed:true
|
||||
# → 取消下面兩段註解(注入 active binding)並 `wrangler vectorize create arcrun-kbdb-embed
|
||||
# --dimensions=768 --metric=cosine`(bge-base-en-v1.5 = 768 維)。官方帳號同理由 deploy 注入。
|
||||
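# 沒有這兩個 binding 時,kbdb/src/embed.ts 的 embedEnabled() 回 false → 維持 LIKE keyword、API 不變。
# 注意:embed.ts 的語義查詢以 owner_id / entry_type / source 做 Vectorize metadata filter(且 returnMetadata='indexed'),
# 這三個欄位需各自先建 metadata index,例如:
#   wrangler vectorize create-metadata-index arcrun-kbdb-embed --property-name=owner_id --type=string
# (entry_type / source 同理)。index 必須在寫入向量「之前」建好;建 index 前已 upsert 的向量不會被 filter 納入,需重新 upsert。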
|
||||
#
|
||||
# [[vectorize]]
|
||||
# binding = "VECTORIZE"
|
||||
# index_name = "arcrun-kbdb-embed"
|
||||
#
|
||||
# [ai]
|
||||
# binding = "AI"
|
||||
|
||||
Reference in New Issue
Block a user