feat(ingest): T0.5–T5 純餵食器管線實作(issue #2)

ingest 全管線(採取優先、extract fallback、跨庫織網、POST envelope):
- T0.5 骨架:Hono + zod-openapi,無 D1/Vectorize/AI 綁定(不碰儲存鐵律)
- T1 SourceAdapter:GitHub runtime API 拉 + per-file sha256 content-hash + /refresh 受理端
- T2 採取(路徑 A 優先):harvest template 1.8.0+ 卡(gloss/實體/typed-edge)
- T3 extract(路徑 B fallback):LlmCaller 可選模型 + JSON-fail 升級閘 + 端點對齊硬自檢護欄;第一版不 embed(只打標)
- T4 跨庫織網(主職):匯總多 repo → 偵測跨庫橋/異見,不算 bridge_score(graph 領域)
- T5 輸出:buildEnvelope strict + 顯式禁送欄位自檢;graph-client 純 POST(cherry-pick _kbdb_client.py 改不碰 base);薄 ops CLI(不帶查詢 MCP)

envelope 對齊 full contract(embed/id/aliases/predicate_embed);同步 contract 向量化欄位升格。

gate:vitest 28 passed / tsc clean / wrangler dry-run 乾淨(只 env-var 綁定)。
端到端 ingest→graph:graph receiver 已補對齊 → 待 ingest 部署 + GRAPH_BASE_URL → 待部署驗,未假綠。

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-26 20:40:53 +08:00
parent dffefdcdc2
commit 16ad1cb208
24 changed files with 4003 additions and 28 deletions
+73
View File
@@ -0,0 +1,73 @@
import { describe, it, expect } from 'vitest';
import { makeSourceUri, parseSourceUri, contentHash, pullRepoMarkdown, type GitHubFetcher } from '../src/lib/source-adapter';
import { processSource } from '../src/lib/pipeline';
import type { LlmCaller } from '../src/lib/extract';
// Covers the source-adapter helpers: URI build/parse and content hashing.
describe('source-adapter uri', () => {
  it('makeSourceUri / parseSourceUri round-trip', () => {
    // One fixture object serves as both the builder input and the expected parse result.
    const parts = {
      owner: 'uncle6me-web',
      repo: 'kbdb-ingest-plugin',
      path: 'system-dev/wiki/cards/ingest/掛載架構.md',
    };

    const built = makeSourceUri(parts.owner, parts.repo, parts.path);

    expect(built).toBe('github:uncle6me-web/kbdb-ingest-plugin@system-dev/wiki/cards/ingest/掛載架構.md');
    expect(parseSourceUri(built)).toEqual(parts);
  });

  it('content-hash 穩定且隨內容變', async () => {
    const helloDigest = await contentHash('hello');

    // Same input must hash to the same value...
    const helloAgain = await contentHash('hello');
    expect(helloDigest).toBe(helloAgain);

    // ...and a different input must hash to a different value.
    const worldDigest = await contentHash('world');
    expect(helloDigest).not.toBe(worldDigest);
  });
});
/**
 * Markdown fixture for the harvest-path test: front matter with a `gloss`
 * field, a title, an entity list (甲, 乙) and a single relation line written
 * as `subject >> predicate >> object`.
 */
const HARVEST_CARD = [
  '---',
  'gloss: 卡標題定義。',
  '---',
  '# 卡A',
  '## 實體',
  '- **甲** — 甲的定義。',
  '- **乙** — 乙的定義。',
  '## 關聯',
  '- 甲 >> 連到 >> 乙',
  '', // empty last element so the joined text ends with a newline
].join('\n');
/**
 * Builds an in-memory stand-in for a GitHubFetcher, backed by a
 * path → file-content map.
 *
 * @param files - Fixture files keyed by path.
 * @returns A fetcher whose `listMarkdown` resolves to the keys of `files` and
 *   whose `getFile` resolves to the matching text with the fixed commit id
 *   `'sha1'`. `getFile` rejects with an `Error` when the requested path is not
 *   in `files`, instead of silently returning `text: undefined`.
 */
function mockFetcher(files: Record<string, string>): GitHubFetcher {
  return {
    async listMarkdown() {
      return Object.keys(files);
    },
    // Owner and repo are ignored: the fixture map represents a single repo.
    async getFile(_owner, _repo, path) {
      const text = files[path];
      if (text === undefined) {
        // Fail loudly so a typo in a fixture path is reported at its source.
        throw new Error(`mockFetcher: no fixture file for path "${path}"`);
      }
      return { text, commit: 'sha1' };
    },
  };
}
/**
 * Pipeline tests: pull markdown through the mock fetcher, run `processSource`
 * on the single pulled source, and check which path the result reports
 * ('harvest', 'extract' or 'skipped') together with the produced envelope.
 */
describe('pullRepoMarkdown + processSource', () => {
  /**
   * Pulls a fixture repo that is expected to contain exactly one markdown file
   * and returns that one source.
   *
   * Asserting the count here keeps every test from indexing into a possibly
   * empty array and handing `undefined` to `processSource`.
   */
  async function pullOnlySource(files: Record<string, string>) {
    const sources = await pullRepoMarkdown(mockFetcher(files), 'o', 'r');
    expect(sources.length).toBe(1);
    const [only] = sources;
    if (only === undefined) {
      // Unreachable after the length assertion; narrows the type for callers.
      throw new Error('pullRepoMarkdown returned no source for the fixture');
    }
    return only;
  }

  it('採取路徑 A:拉檔 → harvest → envelope(不 extract', async () => {
    const source = await pullOnlySource({ 'cards/a.md': HARVEST_CARD });

    const result = await processSource(source);

    expect(result.path).toBe('harvest');
    expect(result.envelope?.triplets).toEqual([{ subject: '甲', predicate: '連到', object: '乙', predicate_embed: true }]);
    expect(result.envelope?.extractor.model).toBe('local-harvest');
  });

  it('採不到三元組 + 無萃取模型 → skipped(不假萃)', async () => {
    const source = await pullOnlySource({ 'plain.md': '# 純文字\n沒有三元組。' });

    // No caller is supplied, so there is no extraction model to fall back to.
    const result = await processSource(source);

    expect(result.path).toBe('skipped');
    expect(result.envelope).toBeNull();
  });

  it('採不到 → fallback extract(路徑 B', async () => {
    // Stub LLM caller that always answers with one node and one triplet as JSON.
    const caller: LlmCaller = {
      model: 'haiku',
      call: async () => JSON.stringify({ nodes: [{ name: '甲' }], triplets: [{ subject: '甲', predicate: '是', object: '乙' }] }),
    };
    const source = await pullOnlySource({ 'plain.md': '# 純文字\n甲是乙。' });

    const result = await processSource(source, { shallowCaller: caller });

    expect(result.path).toBe('extract');
    expect(result.envelope?.extractor.model).toBe('haiku');
  });
});