feat(cypher-executor): step-level telemetry (LI roadmap 2026-W19 建議)

對應第一份 arcrun-roadmap (block id e924c231) 提的:
  「mira_feed_watcher 執行時間偏長(~35秒),無 error 資訊
    建議:加入 checkpoint/step-level telemetry,監測瓶頸」

新增 TelemetryEvent:
  - node_success — 單一 Component node 跑完
  - node_failure — 單一 Component node 失敗

寫入點:GraphExecutor.executeNode catch + 最終 trace.push 之後
  - 只記 node.type === 'Component'(Input/Output 跳過避免噪音)
  - 含 workflow_name + component_id + duration_ms + (error_code on fail)
  - fire-and-forget, 不擋主流程

實測(wiki_synthesis trigger 後):
  - 4 個 node_success blocks 寫入 KBDB (4 個 kbdb_get)
  - duration 範圍 653ms-2003ms,立刻看到誰是瓶頸
  - paused 的 classify (claude_api) 不算 success(trace 已記 paused 狀態)

下次 weekly_review compose_review 會看到 component-level breakdown,
能寫出「kbdb_get 平均 X ms、claude_api 平均 Y ms」等更細的分析。

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-05-16 21:47:15 +08:00
parent 8e985684f9
commit 8b54ebb68a
2 changed files with 29 additions and 3 deletions
+26 -2
View File
@@ -6,6 +6,7 @@ import { tryAuthDispatch } from './actions/auth-dispatcher';
import { expandPromptRecipe } from './lib/recipe-expander'; import { expandPromptRecipe } from './lib/recipe-expander';
import { persistPausedRun, isResumablePending, parseRecipeOutput } from './lib/paused-runs'; import { persistPausedRun, isResumablePending, parseRecipeOutput } from './lib/paused-runs';
import { buildMagicVars } from './lib/magic-vars'; import { buildMagicVars } from './lib/magic-vars';
import { recordTelemetry } from './lib/telemetry';
export type ComponentLoader = (componentId: string) => Promise<ComponentRunner>; export type ComponentLoader = (componentId: string) => Promise<ComponentRunner>;
export type WorkflowLoader = (workflowId: string) => Promise<ExecutionGraph>; export type WorkflowLoader = (workflowId: string) => Promise<ExecutionGraph>;
@@ -352,14 +353,25 @@ export class GraphExecutor {
if (e instanceof WorkflowPaused) throw e; if (e instanceof WorkflowPaused) throw e;
const errMsg = e.message || String(e); const errMsg = e.message || String(e);
const duration_ms = Date.now() - start;
trace.push({ trace.push({
nodeId: node.id, nodeId: node.id,
type: node.type, type: node.type,
input: nodeInput, input: nodeInput,
output: null, output: null,
error: errMsg, error: errMsg,
duration_ms: Date.now() - start, duration_ms,
}); });
// Step-level telemetry:node 失敗事件(LI SDD M2.x 自評建議)
if (this.env && node.type === 'Component') {
recordTelemetry(this.env, this.apiKey, {
event_type: 'node_failure',
workflow_name: graph.name,
component_id: node.componentId,
error_code: 'node_error',
duration_ms,
});
}
// 若已是 ExecutionError(上游節點拋出),保留原始 trace 繼續往上傳 // 若已是 ExecutionError(上游節點拋出),保留原始 trace 繼續往上傳
if (e instanceof ExecutionError) throw e; if (e instanceof ExecutionError) throw e;
throw new ExecutionError( throw new ExecutionError(
@@ -370,14 +382,26 @@ export class GraphExecutor {
); );
} }
const duration_ms = Date.now() - start;
trace.push({ trace.push({
nodeId: node.id, nodeId: node.id,
type: node.type, type: node.type,
input: nodeInput, input: nodeInput,
output: result, output: result,
duration_ms: Date.now() - start, duration_ms,
}); });
// Step-level telemetry:node 成功事件(只記 Component;Input/Output 跳過)
// LI SDD M2.x:給 weekly_review 提的「效能基準線」建議用 — 每個 node duration 都可追
if (this.env && node.type === 'Component') {
recordTelemetry(this.env, this.apiKey, {
event_type: 'node_success',
workflow_name: graph.name,
component_id: node.componentId,
duration_ms,
});
}
// 處理出邊 // 處理出邊
const outEdges = graph.edges.filter(e => e.from === node.id); const outEdges = graph.edges.filter(e => e.from === node.id);
+3 -1
View File
@@ -21,7 +21,9 @@ export type TelemetryEvent =
| 'run_success' | 'run_success'
| 'run_fail' | 'run_fail'
| 'validation_error' | 'validation_error'
| 'mcp_tool_call'; | 'mcp_tool_call'
| 'node_success' // 單一 node 跑完(給 step-level 效能分析用)
| 'node_failure'; // 單一 node 失敗
export interface TelemetryRecord { export interface TelemetryRecord {
event_type: TelemetryEvent; event_type: TelemetryEvent;