feat(ingest): T0.5–T5 純餵食器管線實作(issue #2)

ingest 全管線(採取優先、extract fallback、跨庫織網、POST envelope):
- T0.5 骨架:Hono + zod-openapi,無 D1/Vectorize/AI 綁定(不碰儲存鐵律)
- T1 SourceAdapter:GitHub runtime API 拉 + per-file sha256 content-hash + /refresh 受理端
- T2 採取(路徑 A 優先):harvest template 1.8.0+ 卡(gloss/實體/typed-edge)
- T3 extract(路徑 B fallback):LlmCaller 可選模型 + JSON-fail 升級閘 + 端點對齊硬自檢護欄;第一版不 embed(只打標)
- T4 跨庫織網(主職):匯總多 repo → 偵測跨庫橋/異見,不算 bridge_score(graph 領域)
- T5 輸出:buildEnvelope strict + 顯式禁送欄位自檢;graph-client 純 POST(cherry-pick _kbdb_client.py 改不碰 base);薄 ops CLI(不帶查詢 MCP)

envelope 對齊 full contract(embed/id/aliases/predicate_embed);同步 contract 向量化欄位升格。

gate:vitest 28 passed / tsc clean / wrangler dry-run 乾淨(只 env-var 綁定)。
端到端 ingest→graph:graph receiver 已補對齊;尚待 ingest 部署並設定 GRAPH_BASE_URL 後進行部署驗證。目前尚未完成端到端驗證,不宣稱通過(未假綠)。

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-26 20:40:53 +08:00
parent dffefdcdc2
commit 16ad1cb208
24 changed files with 4003 additions and 28 deletions
+58
View File
@@ -0,0 +1,58 @@
import { describe, it, expect } from 'vitest';
import { extract, parseExtractJson, type LlmCaller } from '../src/lib/extract';
const GOOD_JSON = JSON.stringify({
nodes: [
{ name: '原子筆記', gloss: '一個不可再分論點的記錄單元' },
{ name: '傳統筆記', gloss: '多主題混雜的記錄' },
],
triplets: [{ subject: '原子筆記', predicate: '對立於', object: '傳統筆記', confidence: 0.9 }],
});
function caller(model: string, out: string | (() => Promise<string>)): LlmCaller {
return { model, call: typeof out === 'string' ? async () => out : out };
}
describe('parseExtractJson', () => {
it('解析 fenced JSON + 打標 embed/predicate_embed', () => {
const g = parseExtractJson('```json\n' + GOOD_JSON + '\n```');
expect(g.triplets[0].predicate_embed).toBe(true);
expect(g.nodes[0].embed).toBe(true);
expect(g.triplets[0].confidence).toBe(0.9);
});
it('無 triplets → throw', () => {
expect(() => parseExtractJson(JSON.stringify({ nodes: [], triplets: [] }))).toThrow();
});
});
describe('extract', () => {
it('淺萃成功不升級', async () => {
const r = await extract('原文', caller('haiku', GOOD_JSON));
expect(r.tier).toBe('shallow');
expect(r.escalated).toBe(false);
expect(r.model).toBe('haiku');
});
it('淺萃 JSON-fail → 升 deep(升級閘)', async () => {
const r = await extract('原文', caller('haiku', 'not json at all'), caller('claude', GOOD_JSON));
expect(r.escalated).toBe(true);
expect(r.tier).toBe('deep');
expect(r.model).toBe('claude');
expect(r.triplets.length).toBe(1);
});
it('淺萃失敗且無 deep caller → throw', async () => {
await expect(extract('原文', caller('haiku', 'garbage'))).rejects.toThrow();
});
it('端點對齊護欄:模型吐對不齊端點 → 自動補進 nodes', async () => {
const skewed = JSON.stringify({
nodes: [{ name: 'A' }],
triplets: [{ subject: 'A', predicate: '連到', object: 'B(沒在 nodes' }],
});
const r = await extract('原文', caller('haiku', skewed));
// B 被自動補成 node → 端點全對齊
expect(r.nodes.some((n) => n.name === 'B(沒在 nodes')).toBe(true);
});
});