16ad1cb208
ingest 全管線(採取優先、extract fallback、跨庫織網、POST envelope): - T0.5 骨架:Hono + zod-openapi,無 D1/Vectorize/AI 綁定(不碰儲存鐵律) - T1 SourceAdapter:GitHub runtime API 拉 + per-file sha256 content-hash + /refresh 受理端 - T2 採取(路徑 A 優先):harvest template 1.8.0+ 卡(gloss/實體/typed-edge) - T3 extract(路徑 B fallback):LlmCaller 可選模型 + JSON-fail 升級閘 + 端點對齊硬自檢護欄;第一版不 embed(只打標) - T4 跨庫織網(主職):匯總多 repo → 偵測跨庫橋/異見,不算 bridge_score(graph 領域) - T5 輸出:buildEnvelope strict + 顯式禁送欄位自檢;graph-client 純 POST(cherry-pick _kbdb_client.py 改不碰 base);薄 ops CLI(不帶查詢 MCP) envelope 對齊 full contract(embed/id/aliases/predicate_embed);同步 contract 向量化欄位升格。 gate:vitest 28 passed / tsc clean / wrangler dry-run 乾淨(只 env-var 綁定)。 端到端 ingest→graph:graph receiver 已補對齊 → 待 ingest 部署 + GRAPH_BASE_URL → 待部署驗,未假綠。 Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
118 lines
5.1 KiB
JavaScript
118 lines
5.1 KiB
JavaScript
#!/usr/bin/env node
|
||
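// Thin ops CLI (T5.2): lets a human manually trigger re-extraction. It ships no query MCP
// (an ambient feeder has nobody "asking" it anything).
//
// Two modes:
//   ingest refresh <github:owner/repo@path>   re-extract a single source through the deployed Worker's /refresh
//   ingest pull <owner/repo> [root]           local dry-run: pull, then list the envelopes that would be sent (no POST)
//
// Configuration comes from the environment:
//   KBDB_INGEST_URL   base URL of the deployed ingest Worker (used by refresh mode)
//   GRAPH_BASE_URL    graph write endpoint (for `pull --post`)
//   GITHUB_TOKEN      needed for private repositories (may be unset for public ones)
//
// NOTE(review): `pull --post` and GRAPH_BASE_URL are mentioned here, but no --post handling
// appears in this file; confirm whether that mode is still planned.
//
// Iron rule: the CLI never touches storage; refresh goes through the Worker and pull --post goes
// through the graph write endpoint. Triggering is manual, by a human (no scheduling).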
|
||
|
||
import process from 'node:process';
|
||
|
||
// Positional CLI arguments. argv[0] is the node binary and argv[1] the script path,
// so the subcommand and its (up to two) operands start at index 2.
const [cmd, arg, arg2] = process.argv.slice(2);
|
||
|
||
async function sha256hex(text) {
|
||
const data = new TextEncoder().encode(text);
|
||
const digest = await crypto.subtle.digest('SHA-256', data);
|
||
return [...new Uint8Array(digest)].map((b) => b.toString(16).padStart(2, '0')).join('');
|
||
}
|
||
|
||
/**
 * Build the request headers used for GitHub API calls.
 *
 * An `Authorization: Bearer …` header is included only when the GITHUB_TOKEN
 * environment variable is set to a non-empty value.
 *
 * @returns {Record<string, string>} A fresh headers object.
 */
function ghHeaders() {
  const token = process.env.GITHUB_TOKEN;
  const baseHeaders = {
    Accept: 'application/vnd.github+json',
    'User-Agent': 'kbdb-ingest-cli',
  };
  return token ? { ...baseHeaders, Authorization: `Bearer ${token}` } : baseHeaders;
}
|
||
|
||
/**
 * Fetch one file through the GitHub "repository contents" endpoint and decode it to text.
 *
 * @param {string} owner - Repository owner.
 * @param {string} repo - Repository name.
 * @param {string} path - File path inside the repository, using "/" separators.
 * @returns {Promise<{text: string, commit: string}>} The file text and the `sha` field of the
 *   response. NOTE(review): GitHub documents that field as the file's blob SHA rather than a
 *   commit SHA; confirm that consumers of `commit` expect this.
 * @throws {Error} When the response status is not OK; the message carries the HTTP status.
 */
async function ghGetFile(owner, repo, path) {
  // Encode each segment on its own: "/" must stay a separator, while characters such as
  // "#", "?", "%" or spaces would otherwise be read as URL syntax and fetch the wrong resource.
  const encodedPath = String(path)
    .split('/')
    .map((segment) => encodeURIComponent(segment))
    .join('/');
  const url = `https://api.github.com/repos/${owner}/${repo}/contents/${encodedPath}`;
  const res = await fetch(url, { headers: ghHeaders() });
  // The error message keeps the raw (unencoded) path so it reads like the source URI.
  if (!res.ok) throw new Error(`github ${owner}/${repo}@${path}: ${res.status}`);
  const body = await res.json();
  // Content is decoded only when the response declares base64; anything else is passed through.
  const text = body.encoding === 'base64' ? Buffer.from(body.content, 'base64').toString('utf-8') : body.content;
  return { text, commit: body.sha };
}
|
||
|
||
/**
 * List the paths of all `.md` blobs in a repository's HEAD tree, optionally limited to a root.
 *
 * @param {string} owner - Repository owner.
 * @param {string} repo - Repository name.
 * @param {string} [root=''] - Directory (or single file) to restrict the listing to; leading and
 *   trailing slashes are ignored. An empty root keeps every Markdown file.
 * @returns {Promise<string[]>} Matching paths, in the order the tree response lists them.
 * @throws {Error} When the response status is not OK; the message carries the HTTP status.
 */
async function ghListMarkdown(owner, repo, root = '') {
  const treeUrl = `https://api.github.com/repos/${owner}/${repo}/git/trees/HEAD?recursive=1`;
  const response = await fetch(treeUrl, { headers: ghHeaders() });
  if (!response.ok) {
    throw new Error(`github list ${owner}/${repo}: ${response.status}`);
  }
  const payload = await response.json();
  const scope = root.replace(/^\/+|\/+$/g, '');
  // A path is in scope when it is the root itself or sits below "<root>/".
  const inScope = (candidate) => !scope || candidate === scope || candidate.startsWith(scope + '/');

  const found = [];
  for (const entry of payload.tree || []) {
    if (entry.type === 'blob' && entry.path.endsWith('.md') && inScope(entry.path)) {
      found.push(entry.path);
    }
  }
  return found;
}
|
||
|
||
// Minimal harvest (mirrors src/lib/harvest.ts; used by the CLI dry-run, does not import the TS).
/**
 * Extract nodes and triplets from a template-style Markdown card.
 *
 * Recognised parts:
 *  - optional front matter between `---` lines, from which a `gloss:` line is read;
 *  - the first `# ` heading, which becomes the first node (carrying the front-matter gloss);
 *  - the `## 實體` section: bullets shaped like `- **Name** (annotation) — gloss`, where the
 *    annotation and the gloss are both optional;
 *  - the `## 關聯` section: bullets shaped like `- subject >> predicate >> object`.
 *
 * @param {string} md - Markdown source of one card.
 * @returns {{nodes: Array<{name: string, gloss: (string|undefined), embed: true}>,
 *            triplets: Array<{subject: string, predicate: string, object: string, predicate_embed: true}>}}
 */
function harvest(md) {
  const source = typeof md === 'string' ? md : '';
  // `\r?` keeps front matter detectable in files saved with CRLF line endings.
  const fm = /^---\r?\n([\s\S]*?)\r?\n---\r?\n?([\s\S]*)$/.exec(source);
  const body = fm ? fm[2] : source;
  const gloss = fm ? /^gloss:\s*(.+)$/m.exec(fm[1])?.[1].trim() : undefined;
  const title = /^#\s+(.+)$/m.exec(body)?.[1]?.trim();

  // Sections are collected line by line. A single regex ending in `$` under the `m` flag
  // stops at the end of the first line, which truncates every multi-line section.
  const bodyLines = body.split('\n');
  const sec = (heading) => {
    const start = bodyLines.findIndex(
      (line) => /^##\s+/.test(line) && line.replace(/^##\s+/, '').startsWith(heading),
    );
    if (start === -1) return [];
    const rest = bodyLines.slice(start + 1);
    // The section runs until the next level-2 heading (`###` and deeper stay inside it).
    const end = rest.findIndex((line) => /^##\s/.test(line));
    return end === -1 ? rest : rest.slice(0, end);
  };

  // Groups: 1 = name, 2 = optional parenthesised annotation (ASCII or full-width U+FF08/U+FF09
  // parentheses), 3 = optional gloss after an em dash (U+2014) or hyphen.
  const entityRe = /^-\s*\*\*(.+?)\*\*\s*(?:[(\uFF08](.+?)[)\uFF09])?\s*(?:[\u2014-]\s*(.+))?$/;
  const tripletRe = /^(.+?)\s*>>\s*(.+?)\s*>>\s*(.+?)$/;
  // Drops wiki-link brackets and bold markers around subject/object names.
  const clean = (s) => s.replace(/\[\[|\]\]|\*\*/g, '').trim();

  const nodes = [];
  if (title) nodes.push({ name: title, gloss, embed: true });
  for (const line of sec('實體')) {
    const m = entityRe.exec(line.trim());
    if (m) nodes.push({ name: m[1].trim(), gloss: m[3]?.trim() || undefined, embed: true });
  }

  const triplets = [];
  for (const line of sec('關聯')) {
    // Trim first so that an indented bullet still loses its leading "- ".
    const m = tripletRe.exec(line.trim().replace(/^-\s*/, ''));
    if (m) {
      triplets.push({ subject: clean(m[1]), predicate: m[2].trim(), object: clean(m[3]), predicate_embed: true });
    }
  }
  return { nodes, triplets };
}
|
||
|
||
/**
 * Ask the deployed ingest Worker to re-extract a single source via its `/refresh` endpoint.
 *
 * The response body is printed to stdout: pretty-printed when it parses as JSON, verbatim
 * otherwise, so that a non-JSON error page stays visible.
 *
 * @param {string} uri - Source URI, sent as `{ uri }` in the JSON request body.
 * @returns {Promise<void>}
 * @throws {Error} When KBDB_INGEST_URL is unset, or when the Worker answers with a non-OK
 *   status (the caller can then exit non-zero instead of reporting success).
 */
async function doRefresh(uri) {
  const base = process.env.KBDB_INGEST_URL;
  if (!base) throw new Error('KBDB_INGEST_URL 未設(指向已部署的 ingest Worker)');
  // Strip every trailing slash so the joined URL never contains "//refresh".
  const res = await fetch(`${base.replace(/\/+$/, '')}/refresh`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ uri }),
  });
  const raw = await res.text();
  let shown = raw;
  try {
    shown = JSON.stringify(JSON.parse(raw), null, 2);
  } catch {
    // Not JSON (for example an HTML or plain-text error body): print it unchanged.
  }
  console.log(shown);
  if (!res.ok) throw new Error(`refresh ${uri}: ${res.status}`);
}
|
||
|
||
/**
 * Local dry-run: list a repository's Markdown files, harvest each one, and print the
 * envelopes that would be sent. Nothing is POSTed.
 *
 * Progress lines go to stderr; the envelope array is printed to stdout as indented JSON.
 * Files whose harvest yields no triplets are skipped (the CLI performs no extract fallback).
 *
 * @param {string} ownerRepo - Repository in `owner/repo` form.
 * @param {string} [root] - Optional path restricting which files are listed.
 * @returns {Promise<void>}
 * @throws {Error} When `ownerRepo` lacks an owner or a repository part.
 */
async function doPull(ownerRepo, root) {
  const parts = ownerRepo.split('/');
  const owner = parts[0];
  const repo = parts[1];
  if (!(owner && repo)) {
    throw new Error('用法:ingest pull <owner/repo> [root]');
  }

  const mdPaths = await ghListMarkdown(owner, repo, root || '');
  console.error(`[ingest] ${owner}/${repo}: ${mdPaths.length} 個 MD`);

  const collected = [];
  let tripletTotal = 0;
  // Files are fetched one after another, in listing order.
  for (const filePath of mdPaths) {
    const file = await ghGetFile(owner, repo, filePath);
    const harvested = harvest(file.text);
    if (!harvested.triplets.length) continue;

    const contentHash = await sha256hex(file.text);
    collected.push({
      source: {
        uri: `github:${owner}/${repo}@${filePath}`,
        content_hash: contentHash,
        commit: file.commit,
      },
      extractor: { model: 'local-harvest', tier: 'shallow' },
      nodes: harvested.nodes,
      triplets: harvested.triplets,
    });
    tripletTotal += harvested.triplets.length;
  }

  console.error(`[ingest] 採取出 ${collected.length} 個 envelope(共 ${tripletTotal} 三元組)`);
  console.log(JSON.stringify(collected, null, 2));
}
|
||
|
||
// Entry point: dispatch on the subcommand. Exit code 2 signals a usage error (unknown
// subcommand or missing operand); exit code 1 signals a failure raised by a subcommand.
try {
  const hasOperand = Boolean(arg);
  if (hasOperand && cmd === 'refresh') {
    await doRefresh(arg);
  } else if (hasOperand && cmd === 'pull') {
    await doPull(arg, arg2);
  } else {
    const usage = '用法:\n ingest refresh <github:owner/repo@path>\n ingest pull <owner/repo> [root]';
    console.error(usage);
    process.exit(2);
  }
} catch (err) {
  console.error('[ingest] 錯誤:', err.message);
  process.exit(1);
}
|