feat(cypher-executor): step-level telemetry (LI roadmap 2026-W19 建議)
對應第一份 arcrun-roadmap (block id e924c231) 提的:
「mira_feed_watcher 執行時間偏長(~35秒),無 error 資訊
建議:加入 checkpoint/step-level telemetry,監測瓶頸」
新增 TelemetryEvent:
- node_success — 單一 Component node 跑完
- node_failure — 單一 Component node 失敗
寫入點:GraphExecutor.executeNode 內兩處 — catch 區塊的 trace.push 之後(記 node_failure),以及成功路徑最終 trace.push 之後(記 node_success)
- 只記 node.type === 'Component'(Input/Output 跳過避免噪音)
- 含 workflow_name + component_id + duration_ms;失敗時另含 error_code(目前固定為 'node_error',尚未區分錯誤類型)
- fire-and-forget, 不擋主流程
實測(wiki_synthesis trigger 後):
- 4 個 node_success blocks 寫入 KBDB (4 個 kbdb_get)
- duration 範圍 653ms-2003ms,立刻看到誰是瓶頸
- paused 的 classify (claude_api) 不算 success,也不會記成 failure:WorkflowPaused 在寫入 telemetry 之前就直接 re-throw(trace 已記 paused 狀態)
下次 weekly_review compose_review 會看到 component-level breakdown,
能寫出「kbdb_get 平均 X ms、claude_api 平均 Y ms」等更細的分析。
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -6,6 +6,7 @@ import { tryAuthDispatch } from './actions/auth-dispatcher';
|
||||
import { expandPromptRecipe } from './lib/recipe-expander';
|
||||
import { persistPausedRun, isResumablePending, parseRecipeOutput } from './lib/paused-runs';
|
||||
import { buildMagicVars } from './lib/magic-vars';
|
||||
import { recordTelemetry } from './lib/telemetry';
|
||||
|
||||
export type ComponentLoader = (componentId: string) => Promise<ComponentRunner>;
|
||||
export type WorkflowLoader = (workflowId: string) => Promise<ExecutionGraph>;
|
||||
@@ -352,14 +353,25 @@ export class GraphExecutor {
|
||||
if (e instanceof WorkflowPaused) throw e;
|
||||
|
||||
const errMsg = e.message || String(e);
|
||||
const duration_ms = Date.now() - start;
|
||||
trace.push({
|
||||
nodeId: node.id,
|
||||
type: node.type,
|
||||
input: nodeInput,
|
||||
output: null,
|
||||
error: errMsg,
|
||||
duration_ms: Date.now() - start,
|
||||
duration_ms,
|
||||
});
|
||||
// Step-level telemetry:node 失敗事件(LI SDD M2.x 自評建議)
|
||||
if (this.env && node.type === 'Component') {
|
||||
recordTelemetry(this.env, this.apiKey, {
|
||||
event_type: 'node_failure',
|
||||
workflow_name: graph.name,
|
||||
component_id: node.componentId,
|
||||
error_code: 'node_error',
|
||||
duration_ms,
|
||||
});
|
||||
}
|
||||
// 若已是 ExecutionError(上游節點拋出),保留原始 trace 繼續往上傳
|
||||
if (e instanceof ExecutionError) throw e;
|
||||
throw new ExecutionError(
|
||||
@@ -370,14 +382,26 @@ export class GraphExecutor {
|
||||
);
|
||||
}
|
||||
|
||||
const duration_ms = Date.now() - start;
|
||||
trace.push({
|
||||
nodeId: node.id,
|
||||
type: node.type,
|
||||
input: nodeInput,
|
||||
output: result,
|
||||
duration_ms: Date.now() - start,
|
||||
duration_ms,
|
||||
});
|
||||
|
||||
// Step-level telemetry:node 成功事件(只記 Component,Input/Output 跳過)
|
||||
// LI SDD M2.x:給 weekly_review 提的「效能基準線」建議用 — 每個 node duration 都可追
|
||||
if (this.env && node.type === 'Component') {
|
||||
recordTelemetry(this.env, this.apiKey, {
|
||||
event_type: 'node_success',
|
||||
workflow_name: graph.name,
|
||||
component_id: node.componentId,
|
||||
duration_ms,
|
||||
});
|
||||
}
|
||||
|
||||
// 處理出邊
|
||||
const outEdges = graph.edges.filter(e => e.from === node.id);
|
||||
|
||||
|
||||
@@ -21,7 +21,9 @@ export type TelemetryEvent =
|
||||
| 'run_success'
|
||||
| 'run_fail'
|
||||
| 'validation_error'
|
||||
| 'mcp_tool_call';
|
||||
| 'mcp_tool_call'
|
||||
| 'node_success' // 單一 node 跑完(給 step-level 效能分析用)
|
||||
| 'node_failure'; // 單一 node 失敗
|
||||
|
||||
export interface TelemetryRecord {
|
||||
event_type: TelemetryEvent;
|
||||
|
||||
Reference in New Issue
Block a user