Files
kbdb-ingest-plugin/scripts/ingest-cli.mjs
Leo 16ad1cb208 feat(ingest): T0.5–T5 純餵食器管線實作(issue #2)
ingest 全管線(採取優先、extract fallback、跨庫織網、POST envelope):
- T0.5 骨架:Hono + zod-openapi,無 D1/Vectorize/AI 綁定(不碰儲存鐵律)
- T1 SourceAdapter:GitHub runtime API 拉 + per-file sha256 content-hash + /refresh 受理端
- T2 採取(路徑 A 優先):harvest template 1.8.0+ 卡(gloss/實體/typed-edge)
- T3 extract(路徑 B fallback):LlmCaller 可選模型 + JSON-fail 升級閘 + 端點對齊硬自檢護欄;第一版不 embed(只打標)
- T4 跨庫織網(主職):匯總多 repo → 偵測跨庫橋/異見,不算 bridge_score(graph 領域)
- T5 輸出:buildEnvelope strict + 顯式禁送欄位自檢;graph-client 純 POST(cherry-pick _kbdb_client.py 改不碰 base);薄 ops CLI(不帶查詢 MCP)

envelope 對齊 full contract(embed/id/aliases/predicate_embed);同步 contract 向量化欄位升格。

gate:vitest 28 passed / tsc clean / wrangler dry-run 乾淨(只 env-var 綁定)。
端到端 ingest→graph:graph receiver 已補對齊 → 待 ingest 部署 + GRAPH_BASE_URL → 待部署驗,未假綠。

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-26 20:40:53 +08:00

118 lines
5.1 KiB
JavaScript
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env node
// 薄 ops CLI(T5.2)— 人手動觸發重萃。不帶查詢 MCP(ambient 餵食器沒人「問」它)。
//
// 兩種模式:
// ingest refresh <github:owner/repo@path> 經部署的 Worker /refresh 重萃單一來源
// ingest pull <owner/repo> [root] 本地 dry-run:拉 + 列出會送的 envelope(不 POST)
//
// 設定走 env:
// KBDB_INGEST_URL 已部署的 ingest Worker base(refresh 模式用)
// GRAPH_BASE_URL graph 寫入端(pull --post 用;本檔目前未實作 --post,pull 只做 dry-run)
// GITHUB_TOKEN 拉私庫用(公庫可空)
//
// 鐵律:CLI 不碰儲存;refresh 經 Worker、pull --post 經 graph 寫入端。觸發=人手動(無排程)。
import process from 'node:process';
// Positional CLI arguments: the sub-command, its target, and an optional second argument.
// The first two argv entries (the node binary and the script path) are dropped.
const [cmd, arg, arg2] = process.argv.slice(2);
async function sha256hex(text) {
const data = new TextEncoder().encode(text);
const digest = await crypto.subtle.digest('SHA-256', data);
return [...new Uint8Array(digest)].map((b) => b.toString(16).padStart(2, '0')).join('');
}
/**
 * Build the HTTP headers sent with every GitHub API request made by this CLI.
 *
 * An `Authorization: Bearer …` header is included only when the GITHUB_TOKEN
 * environment variable is set to a non-empty value.
 *
 * @returns {Record<string, string>} A fresh headers object.
 */
function ghHeaders() {
  const token = process.env.GITHUB_TOKEN;
  const baseHeaders = {
    Accept: 'application/vnd.github+json',
    'User-Agent': 'kbdb-ingest-cli',
  };
  if (!token) return baseHeaders;
  return { ...baseHeaders, Authorization: `Bearer ${token}` };
}
/**
 * Fetch one file through the GitHub "repository contents" REST endpoint.
 *
 * @param {string} owner - Repository owner.
 * @param {string} repo - Repository name.
 * @param {string} path - Slash-separated file path inside the repository (unencoded).
 * @returns {Promise<{ text: string, commit: string }>} `text` is the decoded file body
 *   (base64 is decoded as UTF-8; any other encoding is passed through as returned);
 *   `commit` is the `sha` field of the API response.
 *   NOTE(review): for this endpoint `sha` is presumably the file's blob SHA rather than
 *   a commit id — confirm against what the consumer of `commit` expects.
 * @throws {Error} When the HTTP response status is not OK.
 */
async function ghGetFile(owner, repo, path) {
  // Encode each segment on its own: characters such as `#`, `?` or `%` in a file name would
  // otherwise be parsed as fragment/query/escape and request the wrong resource. The `/`
  // separators are kept so the path structure is unchanged.
  const encodedPath = String(path).split('/').map(encodeURIComponent).join('/');
  const url = `https://api.github.com/repos/${owner}/${repo}/contents/${encodedPath}`;
  const res = await fetch(url, { headers: ghHeaders() });
  // The error keeps the raw (unencoded) path so it reads the same as the user's input.
  if (!res.ok) throw new Error(`github ${owner}/${repo}@${path}: ${res.status}`);
  const body = await res.json();
  const text = body.encoding === 'base64' ? Buffer.from(body.content, 'base64').toString('utf-8') : body.content;
  return { text, commit: body.sha };
}
/**
 * List the paths of all `.md` blobs in a repository, optionally restricted to a sub-tree.
 *
 * Uses the recursive Git Trees endpoint on `HEAD`, i.e. one request for the whole repository.
 *
 * @param {string} owner - Repository owner.
 * @param {string} repo - Repository name.
 * @param {string} [root=''] - Optional path prefix; leading/trailing slashes are ignored.
 *   A path is kept when it equals the prefix or lies below `prefix/`.
 * @returns {Promise<string[]>} Matching paths in the order the API returned them.
 * @throws {Error} When the HTTP response status is not OK.
 */
async function ghListMarkdown(owner, repo, root = '') {
  const res = await fetch(`https://api.github.com/repos/${owner}/${repo}/git/trees/HEAD?recursive=1`, { headers: ghHeaders() });
  if (!res.ok) throw new Error(`github list ${owner}/${repo}: ${res.status}`);
  const body = await res.json();
  // The API sets `truncated` when the recursive listing exceeded its size limit. The result is
  // still returned (best effort), but the operator is told it may be incomplete instead of
  // silently missing files.
  if (body.truncated) {
    console.error(`[ingest] 警告:${owner}/${repo} 的 git tree 回應被截斷(truncated),MD 清單可能不完整`);
  }
  const prefix = root.replace(/^\/+|\/+$/g, '');
  const isUnderRoot = (p) => !prefix || p === prefix || p.startsWith(`${prefix}/`);
  return (body.tree || [])
    .filter((entry) => entry.type === 'blob' && entry.path.endsWith('.md'))
    .map((entry) => entry.path)
    .filter(isUnderRoot);
}
// 極簡採取(鏡射 src/lib/harvest.ts;CLI dry-run 用,不引 TS)。
/**
 * Minimal harvest of a Markdown card into graph nodes and triplets.
 *
 * What is read from the document:
 *  - optional front matter delimited by `---` lines; its `gloss:` line becomes the title gloss;
 *  - the first `# ` heading of the body becomes the first node;
 *  - bullet lines in the `## 實體` section starting with a bold name become further nodes;
 *  - `subject >> predicate >> object` lines in the `## 關聯` section become triplets
 *    (a leading `- ` is dropped; wiki-link brackets and bold markers are stripped from
 *    subject and object).
 *
 * @param {string} md - Markdown source of one card.
 * @returns {{ nodes: Array<{ name: string, gloss: (string|undefined), embed: true }>,
 *             triplets: Array<{ subject: string, predicate: string, object: string, predicate_embed: true }> }}
 */
function harvest(md) {
  const fm = /^---\n([\s\S]*?)\n---\n?([\s\S]*)$/.exec(md);
  const body = fm ? fm[2] : String(md);
  const gloss = fm ? /^gloss:\s*(.+)$/m.exec(fm[1])?.[1].trim() : undefined;
  const title = /^#\s+(.+)$/m.exec(body)?.[1]?.trim();

  // Return the text between the `## <heading>` line and the next `## ` heading (or end of input).
  // The section end is located with a second search rather than a `$` lookahead: under the `m`
  // flag `$` matches at the end of every line, which would cut the section after its first line
  // (or make it empty when a blank line follows the heading).
  const sec = (h) => {
    const head = new RegExp(`^##\\s+${h}[^\\n]*\\n`, 'm').exec(body);
    if (!head) return '';
    const rest = body.slice(head.index + head[0].length);
    const next = /^##\s/m.exec(rest);
    return next ? rest.slice(0, next.index) : rest;
  };

  const nodes = [];
  if (title) nodes.push({ name: title, gloss, embed: true });
  for (const line of sec('實體').split('\n')) {
    // Groups: 1 = bold name, 2 = optional text before the dash, 3 = gloss after the dash.
    // Group 2 excludes dash characters so it cannot swallow the separator; otherwise a line
    // with no text between the name and the dash would lose its gloss.
    const m = /^-\s*\*\*(.+?)\*\*\s*([^—-]+?)?\s*(?:[—-]\s*(.*))?$/.exec(line.trim());
    if (m) nodes.push({ name: m[1].trim(), gloss: m[3]?.trim() || undefined, embed: true });
  }

  const triplets = [];
  const clean = (s) => s.replace(/\[\[|\]\]|\*\*/g, '').trim();
  for (const line of sec('關聯').split('\n')) {
    const m = /^(.+?)\s*>>\s*(.+?)\s*>>\s*(.+?)$/.exec(line.replace(/^-\s*/, '').trim());
    if (m) {
      triplets.push({ subject: clean(m[1]), predicate: m[2].trim(), object: clean(m[3]), predicate_embed: true });
    }
  }
  return { nodes, triplets };
}
/**
 * Ask the deployed ingest Worker to re-extract a single source, then print its JSON reply.
 *
 * Sends `POST <KBDB_INGEST_URL>/refresh` with body `{ uri }`.
 *
 * @param {string} uri - Source URI to refresh, e.g. `github:owner/repo@path`.
 * @returns {Promise<void>} Resolves after the pretty-printed reply is written to stdout.
 * @throws {Error} When KBDB_INGEST_URL is unset, when the Worker answers with a non-OK
 *   status (the message carries the status and the start of the response body), or when
 *   an OK response is not valid JSON.
 */
async function doRefresh(uri) {
  const base = process.env.KBDB_INGEST_URL;
  if (!base) throw new Error('KBDB_INGEST_URL 未設(指向已部署的 ingest Worker)');
  // Strip every trailing slash so the joined URL never contains `//refresh`.
  const res = await fetch(`${base.replace(/\/+$/, '')}/refresh`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ uri }),
  });
  // Read the body as text first: an error page (HTML, plain text) must not surface as an
  // opaque JSON parse error, and a failed refresh must not look like success to the caller.
  const raw = await res.text();
  if (!res.ok) {
    const detail = raw ? ` ${raw.slice(0, 500)}` : '';
    throw new Error(`refresh ${uri}: ${res.status}${detail}`);
  }
  let payload;
  try {
    payload = JSON.parse(raw);
  } catch (cause) {
    throw new Error(`refresh ${uri}: 回應不是 JSON(${res.status})`, { cause });
  }
  console.log(JSON.stringify(payload, null, 2));
}
/**
 * Local dry run: list the repository's Markdown files, harvest each one, and print the
 * envelopes that would be sent. Nothing is POSTed from here.
 *
 * Progress lines go to stderr; the envelope array is written to stdout as pretty JSON.
 * Files are fetched one after another, in listing order.
 *
 * @param {string} ownerRepo - Repository in `owner/repo` form.
 * @param {string} [root] - Optional path prefix limiting which files are pulled.
 * @returns {Promise<void>}
 * @throws {Error} When `ownerRepo` lacks an owner or repo part, or when a helper it calls throws.
 */
async function doPull(ownerRepo, root) {
  const [owner, repo] = ownerRepo.split('/');
  if (!owner || !repo) throw new Error('用法:ingest pull <owner/repo> [root]');

  const mdPaths = await ghListMarkdown(owner, repo, root || '');
  console.error(`[ingest] ${owner}/${repo}: ${mdPaths.length} 個 MD`);

  const envelopes = [];
  let tripletTotal = 0;
  for (const filePath of mdPaths) {
    const file = await ghGetFile(owner, repo, filePath);
    const harvested = harvest(file.text);
    // Nothing harvested (not a template card): skip it in this dry run; the CLI does not run extract.
    if (!harvested.triplets.length) continue;

    const contentHash = await sha256hex(file.text);
    envelopes.push({
      source: {
        uri: `github:${owner}/${repo}@${filePath}`,
        content_hash: contentHash,
        commit: file.commit,
      },
      extractor: { model: 'local-harvest', tier: 'shallow' },
      nodes: harvested.nodes,
      triplets: harvested.triplets,
    });
    tripletTotal += harvested.triplets.length;
  }

  console.error(`[ingest] 採取出 ${envelopes.length} 個 envelope(共 ${tripletTotal} 三元組)`);
  console.log(JSON.stringify(envelopes, null, 2));
}
// Entry point: dispatch on the sub-command. Both sub-commands need a target argument;
// anything else prints the usage text and exits with status 2. Any error thrown by a
// sub-command is reported on stderr and the process exits with status 1.
try {
  const hasTarget = Boolean(arg);
  if (hasTarget && cmd === 'refresh') {
    await doRefresh(arg);
  } else if (hasTarget && cmd === 'pull') {
    await doPull(arg, arg2);
  } else {
    console.error('用法:\n ingest refresh <github:owner/repo@path>\n ingest pull <owner/repo> [root]');
    process.exit(2);
  }
} catch (err) {
  console.error('[ingest] 錯誤:', err.message);
  process.exit(1);
}