feat(ingest): T0.5–T5 純餵食器管線實作(issue #2)

ingest 全管線(採取優先、extract fallback、跨庫織網、POST envelope):
- T0.5 骨架:Hono + zod-openapi,無 D1/Vectorize/AI 綁定(不碰儲存鐵律)
- T1 SourceAdapter:GitHub runtime API 拉 + per-file sha256 content-hash + /refresh 受理端
- T2 採取(路徑 A 優先):harvest template 1.8.0+ 卡(gloss/實體/typed-edge)
- T3 extract(路徑 B fallback):LlmCaller 可選模型 + JSON-fail 升級閘 + 端點對齊硬自檢護欄;第一版不 embed(只打標)
- T4 跨庫織網(主職):匯總多 repo → 偵測跨庫橋/異見,不算 bridge_score(graph 領域)
- T5 輸出:buildEnvelope strict + 顯式禁送欄位自檢;graph-client 純 POST(cherry-pick _kbdb_client.py 改不碰 base);薄 ops CLI(不帶查詢 MCP)

envelope 對齊 full contract(embed/id/aliases/predicate_embed);同步 contract 向量化欄位升格。

gate:vitest 28 passed / tsc clean / wrangler dry-run 乾淨(只 env-var 綁定)。
端到端 ingest→graph:graph receiver 已補對齊 → 待 ingest 部署 + GRAPH_BASE_URL → 待部署驗,未假綠。

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-26 20:40:53 +08:00
parent dffefdcdc2
commit 16ad1cb208
24 changed files with 4003 additions and 28 deletions
+117
View File
@@ -0,0 +1,117 @@
#!/usr/bin/env node
// 薄 ops CLI(T5.2)— 人手動觸發重萃。不帶查詢 MCP(ambient 餵食器沒人「問」它)。
//
// 兩種模式:
// ingest refresh <github:owner/repo@path> 經部署的 Worker /refresh 重萃單一來源
// ingest pull <owner/repo> [root] 本地 dry-run:拉 + 列出會送的 envelope(不 POST)
//
// 設定走 env
// KBDB_INGEST_URL 已部署的 ingest Worker base(refresh 模式用)
// GRAPH_BASE_URL graph 寫入端(pull --post 用;本檔目前尚未實作 --post,pull 只做 dry-run)
// GITHUB_TOKEN 拉私庫用(公庫可空)
//
// 鐵律:CLI 不碰儲存;refresh 經 Worker、pull --post 經 graph 寫入端。觸發=人手動(無排程)。
import process from 'node:process';
// Positional CLI arguments. The first two argv entries are skipped; the sub-command and its
// (up to two) operands are read from index 2 onwards.
const cmd = process.argv[2];
const arg = process.argv[3];
const arg2 = process.argv[4];
/**
 * Computes the SHA-256 digest of a string's UTF-8 encoding via Web Crypto.
 *
 * @param {string} text - Input text.
 * @returns {Promise<string>} The digest as lowercase hexadecimal (two digits per byte).
 */
async function sha256hex(text) {
  const bytes = new TextEncoder().encode(text);
  const hashBuffer = await crypto.subtle.digest('SHA-256', bytes);
  let hex = '';
  for (const byte of new Uint8Array(hashBuffer)) {
    // Zero-pad so every byte contributes exactly two hex digits.
    hex += byte.toString(16).padStart(2, '0');
  }
  return hex;
}
/**
 * Builds the request headers used for every GitHub API call made by this CLI.
 *
 * @returns {Record<string, string>} `Accept` and `User-Agent`, plus a bearer `Authorization`
 *   header when the `GITHUB_TOKEN` environment variable is set to a non-empty value.
 */
function ghHeaders() {
  const token = process.env.GITHUB_TOKEN;
  const baseHeaders = { Accept: 'application/vnd.github+json', 'User-Agent': 'kbdb-ingest-cli' };
  if (!token) return baseHeaders;
  return { ...baseHeaders, Authorization: `Bearer ${token}` };
}
/**
 * Fetches a single file through the GitHub "repository contents" REST endpoint.
 *
 * @param {string} owner - Repository owner.
 * @param {string} repo - Repository name.
 * @param {string} path - Repo-relative file path using "/" separators.
 * @returns {Promise<{text: string, commit: string}>} The decoded file text and the `sha`
 *   field of the response.
 * @throws {Error} When GitHub answers with a non-2xx status.
 */
async function ghGetFile(owner, repo, path) {
  // Encode each path segment on its own: "/" must remain a separator, but characters such as
  // "#", "?" or "%" inside a file name would otherwise be parsed as URL syntax and the request
  // would target a different resource.
  const encodedPath = path
    .split('/')
    .map((segment) => encodeURIComponent(segment))
    .join('/');
  const url = `https://api.github.com/repos/${owner}/${repo}/contents/${encodedPath}`;
  const res = await fetch(url, { headers: ghHeaders() });
  if (!res.ok) throw new Error(`github ${owner}/${repo}@${path}: ${res.status}`);
  const body = await res.json();
  // The API normally returns base64 content; any other encoding is passed through unchanged.
  const text = body.encoding === 'base64' ? Buffer.from(body.content, 'base64').toString('utf-8') : body.content;
  // NOTE(review): for this endpoint `sha` looks like the blob SHA of the file rather than a
  // commit SHA — confirm that is what consumers of `commit` expect.
  return { text, commit: body.sha };
}
/**
 * Lists the Markdown files of a repository using one recursive Git-tree request on `HEAD`.
 *
 * @param {string} owner - Repository owner.
 * @param {string} repo - Repository name.
 * @param {string} [root=''] - Optional file or directory to restrict the listing to; leading
 *   and trailing slashes are ignored.
 * @returns {Promise<string[]>} Repo-relative paths of blobs ending in `.md`, in the order
 *   GitHub returned them.
 * @throws {Error} When GitHub answers with a non-2xx status.
 */
async function ghListMarkdown(owner, repo, root = '') {
  const res = await fetch(`https://api.github.com/repos/${owner}/${repo}/git/trees/HEAD?recursive=1`, { headers: ghHeaders() });
  if (!res.ok) throw new Error(`github list ${owner}/${repo}: ${res.status}`);
  const body = await res.json();
  // The response carries a `truncated` flag when the tree was cut short. The listing would
  // then be silently incomplete, so surface it on stderr.
  if (body.truncated) {
    console.error(`[ingest] 警告:${owner}/${repo} 的 tree 回應被截斷(truncated),清單可能不完整`);
  }
  // The default parameter only covers `undefined`; `?? ''` also tolerates an explicit null.
  const prefix = (root ?? '').replace(/^\/+|\/+$/g, '');
  // Match the root itself (a single file) or anything below it. The "/" boundary keeps
  // "docs" from also matching "docs2/...".
  const isUnderRoot = (p) => prefix === '' || p === prefix || p.startsWith(`${prefix}/`);
  return (body.tree || [])
    .filter((entry) => entry.type === 'blob' && entry.path.endsWith('.md') && isUnderRoot(entry.path))
    .map((entry) => entry.path);
}
// Minimal harvester used by the CLI dry-run. It is meant to mirror src/lib/harvest.ts without
// importing any TypeScript (that file is not visible here — keep the two in sync).
/**
 * Extracts nodes and triplets from one Markdown card.
 *
 * Recognized pieces:
 * - optional `---` front matter containing a `gloss:` line (attached to the title node);
 * - the first `# ` heading (becomes the title node);
 * - a `## 實體` section with bullets of the form `- **Name** (type) — gloss`, where the
 *   parenthesized type and the dash-separated gloss are both optional;
 * - a `## 關聯` section with lines of the form `subject >> predicate >> object`.
 *
 * @param {string} md - Raw Markdown text (LF or CRLF line endings).
 * @returns {{nodes: Array<{name: string, gloss: (string|undefined), embed: true}>,
 *            triplets: Array<{subject: string, predicate: string, object: string, predicate_embed: true}>}}
 */
function harvest(md) {
  // `\r?` lets front matter written with CRLF line endings be recognized too.
  const fm = /^---\r?\n([\s\S]*?)\r?\n---\r?\n?([\s\S]*)$/.exec(md);
  const body = fm ? fm[2] : md;
  // `[ \t]*` rather than `\s*`, so an empty `gloss:` line cannot swallow the following key.
  const gloss = fm ? /^gloss:[ \t]*(.+)$/m.exec(fm[1])?.[1].trim() : undefined;
  const title = /^#\s+(.+)$/m.exec(body)?.[1]?.trim();

  // Returns the text of the `## <h>` section, up to the next `## ` heading or end of input.
  // Under the `m` flag a bare `$` matches at the end of every line, which would cut the lazy
  // capture off after the first line (or yield '' when a blank line follows the heading).
  // `(?![\s\S])` asserts the real end of input instead, and `^##\s` stops at the next heading
  // even when the section is empty.
  const sec = (h) =>
    new RegExp(`^##\\s+${h}[^\\n]*\\n([\\s\\S]*?)(?=^##\\s|(?![\\s\\S]))`, 'm').exec(body)?.[1] || '';

  // Group 1: entity name. Then an optional parenthesized type (ASCII or full-width
  // parentheses), optional free text up to the first dash, and group 2: the gloss after it.
  // The text before the dash is restricted to non-dash characters so it cannot absorb
  // "— gloss" when no type is present.
  const entityLine = /^-\s*\*\*(.+?)\*\*\s*(?:[(\uFF08][^)\uFF09]*[)\uFF09]\s*)?[^—-]*(?:[—-]\s*(.*))?$/;
  const relationLine = /^(.+?)\s*>>\s*(.+?)\s*>>\s*(.+?)$/;
  // Strips wiki-link brackets and bold markers from a subject/object.
  const clean = (s) => s.replace(/\[\[|\]\]|\*\*/g, '').trim();

  const nodes = [];
  if (title) nodes.push({ name: title, gloss, embed: true });
  for (const line of sec('實體').split('\n')) {
    const m = entityLine.exec(line.trim());
    if (m) nodes.push({ name: m[1].trim(), gloss: m[2]?.trim() || undefined, embed: true });
  }

  const triplets = [];
  for (const line of sec('關聯').split('\n')) {
    const m = relationLine.exec(line.replace(/^-\s*/, '').trim());
    if (m) {
      triplets.push({ subject: clean(m[1]), predicate: m[2].trim(), object: clean(m[3]), predicate_embed: true });
    }
  }
  return { nodes, triplets };
}
/**
 * `refresh` mode: asks the deployed ingest Worker to re-extract one source by POSTing
 * `{ uri }` to `<KBDB_INGEST_URL>/refresh`, then prints the JSON reply to stdout.
 *
 * @param {string} uri - Source identifier, e.g. `github:owner/repo@path`.
 * @returns {Promise<void>}
 * @throws {Error} When `KBDB_INGEST_URL` is unset, when the Worker answers with a non-2xx
 *   status, or when the reply is not JSON.
 */
async function doRefresh(uri) {
  const base = process.env.KBDB_INGEST_URL;
  if (!base) throw new Error('KBDB_INGEST_URL 未設(指向已部署的 ingest Worker)');
  // Drop one trailing slash so the joined URL has exactly one separator.
  const res = await fetch(`${base.replace(/\/$/, '')}/refresh`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ uri }),
  });
  // Read as text first: an error page or proxy reply may not be JSON, and parsing it blindly
  // would hide the HTTP status behind a SyntaxError.
  const raw = await res.text();
  let payload;
  let isJson = true;
  try {
    payload = JSON.parse(raw);
  } catch {
    isJson = false;
  }
  // A JSON body is printed even on failure, since the Worker's error detail is useful.
  if (isJson) console.log(JSON.stringify(payload, null, 2));
  // Throw on HTTP failure so the CLI exits non-zero instead of reporting success.
  if (!res.ok) {
    const detail = isJson ? '' : ` ${raw.slice(0, 200)}`;
    throw new Error(`refresh ${uri}: ${res.status}${detail}`);
  }
  if (!isJson) throw new Error(`refresh ${uri}: non-JSON response (${res.status})`);
}
/**
 * `pull` mode: local dry-run. Lists the repository's Markdown files, fetches and harvests each
 * one in turn, and prints the envelopes that would be sent as a JSON array on stdout.
 * Progress lines go to stderr so stdout stays machine-readable. Nothing is POSTed.
 *
 * @param {string} ownerRepo - Repository in `owner/repo` form.
 * @param {string} [root] - Optional sub-path to restrict the listing to.
 * @returns {Promise<void>}
 * @throws {Error} When `ownerRepo` lacks an owner or repo part, or a GitHub request fails.
 */
async function doPull(ownerRepo, root) {
  const parts = ownerRepo.split('/');
  const owner = parts[0];
  const repo = parts[1];
  if (!(owner && repo)) throw new Error('用法:ingest pull <owner/repo> [root]');

  const paths = await ghListMarkdown(owner, repo, root || '');
  console.error(`[ingest] ${owner}/${repo}: ${paths.length} 個 MD`);

  const envelopes = [];
  let tripletTotal = 0;
  for (const path of paths) {
    const file = await ghGetFile(owner, repo, path);
    const harvested = harvest(file.text);
    // Nothing harvested (not a template card): the dry-run skips it, since the CLI does not
    // run the extract fallback.
    if (harvested.triplets.length === 0) continue;
    const contentHash = await sha256hex(file.text);
    envelopes.push({
      source: { uri: `github:${owner}/${repo}@${path}`, content_hash: contentHash, commit: file.commit },
      extractor: { model: 'local-harvest', tier: 'shallow' },
      nodes: harvested.nodes,
      triplets: harvested.triplets,
    });
    tripletTotal += harvested.triplets.length;
  }

  console.error(`[ingest] 採取出 ${envelopes.length} 個 envelope(共 ${tripletTotal} 三元組)`);
  console.log(JSON.stringify(envelopes, null, 2));
}
// Entry point: dispatch on the sub-command. Both modes need their first operand; anything else
// prints usage and exits with status 2. A failure inside a mode is reported on stderr and
// exits with status 1.
try {
  const hasOperand = Boolean(arg);
  if (hasOperand && cmd === 'refresh') {
    await doRefresh(arg);
  } else if (hasOperand && cmd === 'pull') {
    await doPull(arg, arg2);
  } else {
    console.error('用法:\n ingest refresh <github:owner/repo@path>\n ingest pull <owner/repo> [root]');
    process.exit(2);
  }
} catch (e) {
  console.error('[ingest] 錯誤:', e.message);
  process.exit(1);
}