feat(ingest): T0.5–T5 純餵食器管線實作(issue #2)

ingest 全管線(採取優先、extract fallback、跨庫織網、POST envelope):
- T0.5 骨架:Hono + zod-openapi,無 D1/Vectorize/AI 綁定(不碰儲存鐵律)
- T1 SourceAdapter:GitHub runtime API 拉 + per-file sha256 content-hash + /refresh 受理端
- T2 採取(路徑 A 優先):harvest template 1.8.0+ 卡(gloss/實體/typed-edge)
- T3 extract(路徑 B fallback):LlmCaller 可選模型 + JSON-fail 升級閘 + 端點對齊硬自檢護欄;第一版不 embed(只打標)
- T4 跨庫織網(主職):匯總多 repo → 偵測跨庫橋/異見,不算 bridge_score(graph 領域)
- T5 輸出:buildEnvelope strict + 顯式禁送欄位自檢;graph-client 純 POST(cherry-pick _kbdb_client.py 改不碰 base);薄 ops CLI(不帶查詢 MCP)

envelope 對齊 full contract(embed/id/aliases/predicate_embed);同步 contract 向量化欄位升格。

gate:vitest 28 passed / tsc clean / wrangler dry-run 乾淨(只 env-var 綁定)。
端到端 ingest→graph:graph receiver 已補對齊 → 待 ingest 部署 + GRAPH_BASE_URL → 待部署驗,未假綠。

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-26 20:40:53 +08:00
parent dffefdcdc2
commit 16ad1cb208
24 changed files with 4003 additions and 28 deletions
+47
View File
@@ -0,0 +1,47 @@
import { describe, it, expect } from 'vitest';
import { buildEnvelope } from '../src/lib/envelope';
// Shared buildEnvelope input: one source, one extractor and a single triplet
// flagged with predicate_embed. Each test spreads it and overrides `nodes` or
// `triplets` as needed.
const base = {
source: { uri: 'github:o/r@a.md', content_hash: 'abc' },
extractor: { model: 'local-harvest', tier: 'shallow' as const },
triplets: [{ subject: 'A', predicate: 'p', object: 'B', predicate_embed: true }],
};
// buildEnvelope contract checks: accepted shapes are returned unchanged in the
// fields inspected here, and inputs carrying keys outside the contract throw.
describe('buildEnvelope', () => {
  it('組合法 envelope(含向量化打標欄位)', () => {
    const taggedNode = { name: 'A', gloss: 'a', aliases: ['a2'], embed: true, id: 'A' };
    const built = buildEnvelope({ ...base, nodes: [taggedNode] });

    expect(built.source.uri).toBe('github:o/r@a.md');
    expect(built.nodes?.[0].embed).toBe(true);
    expect(built.nodes?.[0].id).toBe('A');
    expect(built.triplets[0].predicate_embed).toBe(true);
  });

  it('node 帶禁送欄位(bridge_score)→ strict throw(本地提早攔,不等 graph 422', () => {
    // Control case first: the same node without the extra key is accepted.
    const withPlainNode = () => buildEnvelope({ ...base, nodes: [{ name: 'A', embed: true }] });
    const withBridgeScore = () =>
      buildEnvelope({ ...base, nodes: [{ name: 'A', bridge_score: 0.5 } as any] });

    expect(withPlainNode).not.toThrow();
    expect(withBridgeScore).toThrow();
  });

  it('node 帶 graph 領域 record id(非去重 id)以外的禁送鍵 → strict throw', () => {
    // The contract allows nodes[].id (the dedup key); `clusters` belongs to the
    // graph domain, so the strict schema rejects it.
    const withDedupId = () => buildEnvelope({ ...base, nodes: [{ name: 'A', id: 'A', embed: true }] });
    const withClusters = () => buildEnvelope({ ...base, nodes: [{ name: 'A', clusters: ['c'] } as any] });

    expect(withDedupId).not.toThrow();
    expect(withClusters).toThrow();
  });

  it('禁送邊上 entity_type → strict throw', () => {
    const typedTriplet = { subject: 'A', predicate: 'p', object: 'B', subject_entity_type: 'person' } as any;
    const withEntityType = () => buildEnvelope({ ...base, triplets: [typedTriplet] });

    expect(withEntityType).toThrow();
  });

  it('無 triplets → throw(契約 min 1', () => {
    const withoutTriplets = () => buildEnvelope({ ...base, triplets: [] });

    expect(withoutTriplets).toThrow();
  });
});
+58
View File
@@ -0,0 +1,58 @@
import { describe, it, expect } from 'vitest';
import { extract, parseExtractJson, type LlmCaller } from '../src/lib/extract';
// Serialized, well-formed extraction reply (two nodes, one triplet with a
// confidence value). The tests below feed it to parseExtractJson and use it as
// the canned output of stub callers.
const GOOD_JSON = JSON.stringify({
nodes: [
{ name: '原子筆記', gloss: '一個不可再分論點的記錄單元' },
{ name: '傳統筆記', gloss: '多主題混雜的記錄' },
],
triplets: [{ subject: '原子筆記', predicate: '對立於', object: '傳統筆記', confidence: 0.9 }],
});
/**
 * Builds an LlmCaller stub for the tests.
 *
 * @param model - Value exposed as the stub's `model`.
 * @param out - Either a fixed reply string (wrapped in an async function) or
 *   an async function that is used as `call` directly.
 */
function caller(model: string, out: string | (() => Promise<string>)): LlmCaller {
  if (typeof out !== 'string') {
    return { model, call: out };
  }
  const reply = out;
  return { model, call: async () => reply };
}
// parseExtractJson: accepts a fenced JSON reply, tags nodes/triplets for
// embedding, and rejects a reply without triplets.
describe('parseExtractJson', () => {
  it('解析 fenced JSON + 打標 embed/predicate_embed', () => {
    const fencedReply = ['```json', GOOD_JSON, '```'].join('\n');
    const graph = parseExtractJson(fencedReply);

    expect(graph.triplets[0].predicate_embed).toBe(true);
    expect(graph.nodes[0].embed).toBe(true);
    expect(graph.triplets[0].confidence).toBe(0.9);
  });

  it('無 triplets → throw', () => {
    const emptyReply = JSON.stringify({ nodes: [], triplets: [] });

    expect(() => parseExtractJson(emptyReply)).toThrow();
  });
});
// extract: shallow pass, escalation to the deep caller when the shallow reply
// is not parseable, and the endpoint-alignment guard.
describe('extract', () => {
  it('淺萃成功不升級', async () => {
    const shallow = caller('haiku', GOOD_JSON);
    const result = await extract('原文', shallow);

    expect(result.tier).toBe('shallow');
    expect(result.escalated).toBe(false);
    expect(result.model).toBe('haiku');
  });

  it('淺萃 JSON-fail → 升 deep(升級閘)', async () => {
    const shallow = caller('haiku', 'not json at all');
    const deep = caller('claude', GOOD_JSON);
    const result = await extract('原文', shallow, deep);

    expect(result.escalated).toBe(true);
    expect(result.tier).toBe('deep');
    expect(result.model).toBe('claude');
    expect(result.triplets.length).toBe(1);
  });

  it('淺萃失敗且無 deep caller → throw', async () => {
    const shallow = caller('haiku', 'garbage');

    await expect(extract('原文', shallow)).rejects.toThrow();
  });

  it('端點對齊護欄:模型吐對不齊端點 → 自動補進 nodes', async () => {
    // The triplet's object is deliberately absent from `nodes`.
    const orphanEndpoint = 'B(沒在 nodes';
    const skewedReply = JSON.stringify({
      nodes: [{ name: 'A' }],
      triplets: [{ subject: 'A', predicate: '連到', object: orphanEndpoint }],
    });
    const result = await extract('原文', caller('haiku', skewedReply));

    // The missing endpoint is expected to be added as a node automatically.
    const nodeNames = result.nodes.map((node) => node.name);
    expect(nodeNames.includes(orphanEndpoint)).toBe(true);
  });
});
+43
View File
@@ -0,0 +1,43 @@
import { describe, it, expect } from 'vitest';
import { makeGraphClient } from '../src/lib/graph-client';
import type { Envelope } from '../src/types';
// Envelope fixture posted by every makeGraphClient test: one source, one
// extractor and a single triplet, with no `nodes` field.
const env: Envelope = {
source: { uri: 'github:o/r@a.md', content_hash: 'abc' },
extractor: { model: 'local-harvest', tier: 'shallow' },
triplets: [{ subject: 'A', predicate: 'p', object: 'B' }],
};
/**
 * Builds a fetch stand-in that ignores its arguments and always resolves to a
 * canned JSON response.
 *
 * @param status - HTTP status code of the canned response.
 * @param body - Value serialized with JSON.stringify as the response body.
 * @returns A function usable wherever `typeof fetch` is expected.
 */
function mockFetch(status: number, body: unknown): typeof fetch {
  const headers = { 'Content-Type': 'application/json' };
  // A zero-argument async function is assignable to `typeof fetch`, so no
  // `as any` cast is needed and the compiler keeps checking the return type.
  return async () => new Response(JSON.stringify(body), { status, headers });
}
// makeGraphClient: behaviour without a base URL, on a 200 reply and on a 422
// reply from the graph endpoint.
describe('makeGraphClient', () => {
  it('GRAPH_BASE_URL 未設 → 誠實回 ok:false,不假綠、不打網路', async () => {
    // Records whether the injected fetch was ever invoked.
    const spy = { hit: false };
    const recordingFetch = (async () => {
      spy.hit = true;
      return new Response('{}');
    }) as any;

    const client = makeGraphClient(undefined, undefined, recordingFetch);
    const outcome = await client.postEnvelope(env);

    expect(outcome.ok).toBe(false);
    expect(outcome.error).toContain('未設');
    expect(spy.hit).toBe(false);
  });

  it('200 → ok + 帶 graph 回的 {skipped,ingested,deprecated}', async () => {
    const graphReply = { skipped: false, ingested: 1, deprecated: 0 };
    const client = makeGraphClient('https://graph.example', 'tok', mockFetch(200, graphReply));
    const outcome = await client.postEnvelope(env);

    expect(outcome.ok).toBe(true);
    expect((outcome.body as any).ingested).toBe(1);
  });

  it('422 → ok:false 帶 issues(供修禁送欄位)', async () => {
    const rejection = { error: 'invalid envelope', issues: [{ path: ['bridge_score'] }] };
    const client = makeGraphClient('https://graph.example', undefined, mockFetch(422, rejection));
    const outcome = await client.postEnvelope(env);

    expect(outcome.ok).toBe(false);
    expect(outcome.status).toBe(422);
    expect((outcome.body as any).issues).toBeDefined();
  });
});
+68
View File
@@ -0,0 +1,68 @@
import { describe, it, expect } from 'vitest';
import { harvestCard, parseEntities, parseEdges, parseFrontmatter } from '../src/lib/harvest';
// Harvest-card fixture used by every test in this file: frontmatter (tags,
// gloss), a title heading, an entities section (## 實體) with two bullet
// entries, and a relations section (## 關聯) holding one in-text edge and one
// card-to-card edge written with [[ ]] links.
// NOTE(review): the second entity line ends its alias list with ')' but has no
// opening '(' after the bold name, unlike the first entity line. Confirm
// against the parser / original fixture whether an opening parenthesis (and
// possibly the alias separator) was lost.
const CARD = `---
tags: [掛載架構, 架構設計]
gloss: ingest 在 KBDB 堆疊裡的位置。
---
# 掛載架構
← [[ingest/00-INDEX]]
## 摘要
KBDB 是三層堆疊。
## 實體
- **kbdb-ingest-plugin**(餵食器) — 最薄一層,純 POST 候選。
- **base KBDB**arcrun/kbdb/基本盤) — 最底儲存層。
## 關聯
### 內文知識關係
- kbdb-ingest-plugin >> 掛載於 >> base KBDB
### 卡片關係
- [[掛載架構]] >> 受約束於 >> [[envelope-契約]]
`;
// parseFrontmatter: splits the card into frontmatter fields and the body.
describe('parseFrontmatter', () => {
  it('抽出 gloss', () => {
    const parsed = parseFrontmatter(CARD);

    expect(parsed.fm.gloss).toBe('ingest 在 KBDB 堆疊裡的位置。');
    expect(parsed.body).toContain('# 掛載架構');
  });
});
// parseEntities: canonical names, aliases, gloss and the embed flag of the
// entries listed in the card body.
describe('parseEntities', () => {
  it('解析正規名 + aliases + gloss', () => {
    const entities = parseEntities(parseFrontmatter(CARD).body);
    const names = entities.map((entity) => entity.name);

    expect(names).toEqual(['kbdb-ingest-plugin', 'base KBDB']);
    expect(entities[1].aliases).toEqual(['arcrun/kbdb', '基本盤']);
    expect(entities[0].gloss).toBe('最薄一層,純 POST 候選。');
    expect(entities[0].embed).toBe(true);
  });
});
// parseEdges: typed edges from the relations section, with [[ ]] stripped and
// card-to-card endpoints flagged.
describe('parseEdges', () => {
  it('解析 typed-edge、去 [[ ]]、標記卡對卡', () => {
    const inTextEdge = {
      subject: 'kbdb-ingest-plugin',
      predicate: '掛載於',
      object: 'base KBDB',
      predicate_embed: true,
      subjectIsCard: false,
      objectIsCard: false,
    };
    const cardToCardEdge = {
      subject: '掛載架構',
      predicate: '受約束於',
      object: 'envelope-契約',
      predicate_embed: true,
      subjectIsCard: true,
      objectIsCard: true,
    };

    const edges = parseEdges(parseFrontmatter(CARD).body);

    expect(edges).toContainEqual(inTextEdge);
    expect(edges).toContainEqual(cardToCardEdge);
  });
});
// harvestCard: end-to-end harvest of the fixture card.
describe('harvestCard', () => {
  it('卡標題 node 帶 frontmatter gloss、含內文 node', () => {
    const harvested = harvestCard(CARD);
    const cardTitleNode = harvested.nodes.find(({ name }) => name === '掛載架構');
    const hasBaseNode = harvested.nodes.some(({ name }) => name === 'base KBDB');

    expect(cardTitleNode?.gloss).toBe('ingest 在 KBDB 堆疊裡的位置。');
    expect(hasBaseNode).toBe(true);
    expect(harvested.triplets.length).toBe(2);
  });

  it('內文端點對齊(無對不齊)', () => {
    const { unalignedEndpoints } = harvestCard(CARD);

    // kbdb-ingest-plugin and base KBDB are both listed under the entities
    // section; card-to-card endpoints are not required to be aligned.
    expect(unalignedEndpoints).toEqual([]);
  });
});
+73
View File
@@ -0,0 +1,73 @@
import { describe, it, expect } from 'vitest';
import { makeSourceUri, parseSourceUri, contentHash, pullRepoMarkdown, type GitHubFetcher } from '../src/lib/source-adapter';
import { processSource } from '../src/lib/pipeline';
import type { LlmCaller } from '../src/lib/extract';
// Source URI helpers and contentHash.
describe('source-adapter uri', () => {
  it('makeSourceUri / parseSourceUri round-trip', () => {
    const parts = {
      owner: 'uncle6me-web',
      repo: 'kbdb-ingest-plugin',
      path: 'system-dev/wiki/cards/ingest/掛載架構.md',
    };

    const uri = makeSourceUri(parts.owner, parts.repo, parts.path);

    expect(uri).toBe('github:uncle6me-web/kbdb-ingest-plugin@system-dev/wiki/cards/ingest/掛載架構.md');
    expect(parseSourceUri(uri)).toEqual(parts);
  });

  it('content-hash 穩定且隨內容變', async () => {
    const helloHash = await contentHash('hello');
    const helloAgain = await contentHash('hello');
    const worldHash = await contentHash('world');

    expect(helloHash).toBe(helloAgain);
    expect(helloHash).not.toBe(worldHash);
  });
});
// Small harvestable card: frontmatter gloss, a title, two entities and one
// relation line between them. Used as the file content for the harvest-path
// pipeline test.
const HARVEST_CARD = `---
gloss: 卡標題定義。
---
# 卡A
## 實體
- **甲** — 甲的定義。
- **乙** — 乙的定義。
## 關聯
- 甲 >> 連到 >> 乙
`;
/**
 * In-memory GitHubFetcher backed by a path-to-content map.
 *
 * @param files - Map from file path to file text.
 * @returns A fetcher whose listMarkdown resolves to the map's keys and whose
 *   getFile resolves to the mapped text with the fixed commit id 'sha1'.
 */
function mockFetcher(files: Record<string, string>): GitHubFetcher {
  const fetcher: GitHubFetcher = {
    listMarkdown: async () => Object.keys(files),
    // Owner and repo are ignored; only the path selects the content.
    getFile: async (_owner, _repo, path) => ({ text: files[path], commit: 'sha1' }),
  };
  return fetcher;
}
// Pipeline tests: pull files from a mocked repo, then run processSource on the
// first pulled source.
describe('pullRepoMarkdown + processSource', () => {
  // Pulls every markdown file of the in-memory repo 'o/r'.
  const pull = (files: Record<string, string>) => pullRepoMarkdown(mockFetcher(files), 'o', 'r');

  it('採取路徑 A:拉檔 → harvest → envelope(不 extract', async () => {
    const pulled = await pull({ 'cards/a.md': HARVEST_CARD });
    expect(pulled.length).toBe(1);

    const outcome = await processSource(pulled[0]);
    const expectedTriplets = [{ subject: '甲', predicate: '連到', object: '乙', predicate_embed: true }];

    expect(outcome.path).toBe('harvest');
    expect(outcome.envelope?.triplets).toEqual(expectedTriplets);
    expect(outcome.envelope?.extractor.model).toBe('local-harvest');
  });

  it('採不到三元組 + 無萃取模型 → skipped(不假萃)', async () => {
    const pulled = await pull({ 'plain.md': '# 純文字\n沒有三元組。' });
    const outcome = await processSource(pulled[0]);

    expect(outcome.path).toBe('skipped');
    expect(outcome.envelope).toBeNull();
  });

  it('採不到 → fallback extract(路徑 B', async () => {
    const cannedReply = JSON.stringify({
      nodes: [{ name: '甲' }],
      triplets: [{ subject: '甲', predicate: '是', object: '乙' }],
    });
    const shallowCaller: LlmCaller = { model: 'haiku', call: async () => cannedReply };

    const pulled = await pull({ 'plain.md': '# 純文字\n甲是乙。' });
    const outcome = await processSource(pulled[0], { shallowCaller });

    expect(outcome.path).toBe('extract');
    expect(outcome.envelope?.extractor.model).toBe('haiku');
  });
});
+45
View File
@@ -0,0 +1,45 @@
import { describe, it, expect } from 'vitest';
import { weave, flattenForPost, type RepoEnvelopes } from '../src/lib/weave';
import type { Envelope } from '../src/types';
/**
 * Builds an Envelope fixture for the weave tests.
 *
 * @param uri - Used both as the source uri and as its content_hash.
 * @param nodes - Node names; each becomes a node with `embed: true`.
 * @param triplets - [subject, predicate, object] tuples.
 */
function env(uri: string, nodes: string[], triplets: Array<[string, string, string]>): Envelope {
  const nodeEntries = nodes.map((name) => ({ name, embed: true }));
  const tripletEntries = triplets.map(([subject, predicate, object]) => ({ subject, predicate, object }));
  return {
    source: { uri, content_hash: uri },
    extractor: { model: 'local-harvest', tier: 'shallow' },
    nodes: nodeEntries,
    triplets: tripletEntries,
  };
}
// Two-repo fixture: both repos contain a node named 'Arcrun', each with one
// triplet about it, so the name is shared across repos.
const repos: RepoEnvelopes[] = [
{ repo: 'o/repoA', envelopes: [env('github:o/repoA@x.md', ['Arcrun', '餵食器'], [['Arcrun', '包含', '餵食器']])] },
{ repo: 'o/repoB', envelopes: [env('github:o/repoB@y.md', ['Arcrun', '圖層'], [['Arcrun', '依賴', '圖層']])] },
];
// weave: cross-repo bridges, cross-repo predicate divergences, flattening, and
// the absence of a score field on bridges.
describe('weave', () => {
  it('偵測跨庫橋(同名節點跨 ≥2 repo)', () => {
    const woven = weave(repos);
    const arcrunBridge = woven.bridges.find(({ node }) => node === 'Arcrun');

    expect(arcrunBridge?.repos).toEqual(['o/repoA', 'o/repoB']);
    expect(woven.totalTriplets).toBe(2);
  });

  it('偵測跨庫異見(同 s/o 對、不同謂詞跨 repo', () => {
    // Same subject/object pair in both repos, but with different predicates.
    const repoA: RepoEnvelopes = {
      repo: 'o/repoA',
      envelopes: [env('github:o/repoA@x.md', ['X', 'Y'], [['X', '支持', 'Y']])],
    };
    const repoB: RepoEnvelopes = {
      repo: 'o/repoB',
      envelopes: [env('github:o/repoB@y.md', ['X', 'Y'], [['X', '反對', 'Y']])],
    };

    const { divergences } = weave([repoA, repoB]);
    expect(divergences.length).toBe(1);

    const predicates = divergences[0].predicatesByRepo.map((entry) => entry.predicate);
    predicates.sort();
    expect(predicates).toEqual(['反對', '支持']);
  });

  it('flattenForPost 攤平所有 envelope(順序穩定)', () => {
    const flattened = flattenForPost(repos);

    expect(flattened.length).toBe(2);
  });

  it('ingest 不算 bridge_score(橋只標 repos,無分數欄位)', () => {
    const [firstBridge] = weave(repos).bridges;

    expect(firstBridge).not.toHaveProperty('bridge_score');
  });
});