feat(cypher-executor): step-level telemetry (LI roadmap 2026-W19 建議)

對應第一份 arcrun-roadmap (block id e924c231) 提的：
「mira_feed_watcher 執行時間偏長（~35秒），無 error 資訊。建議：加入 checkpoint/step-level telemetry，監測瓶頸」

新增 TelemetryEvent：
- node_success — 單一 Component node 跑完
- node_failure — 單一 Component node 失敗

寫入點：GraphExecutor.executeNode catch + 最終 trace.push 之後
- 只記 node.type === 'Component'（Input/Output 跳過避免噪音）
- 含 workflow_name + component_id + duration_ms + (error_code on fail)
- fire-and-forget，不擋主流程

實測（wiki_synthesis trigger 後）：
- 4 個 node_success blocks 寫入 KBDB (4 個 kbdb_get)
- duration 範圍 653ms-2003ms，立刻看到誰是瓶頸
- paused 的 classify (claude_api) 不算 success（trace 已記 paused 狀態）

下次 weekly_review compose_review 會看到 component-level breakdown，能寫出「kbdb_get 平均 X ms、claude_api 平均 Y ms」等更細的分析。

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-16 21:47:15 +08:00
parent 8e985684f9
commit 8b54ebb68a
2 changed files with 29 additions and 3 deletions
@@ -6,6 +6,7 @@ import { tryAuthDispatch } from './actions/auth-dispatcher';
 import { expandPromptRecipe } from './lib/recipe-expander';
 import { persistPausedRun, isResumablePending, parseRecipeOutput } from './lib/paused-runs';
 import { buildMagicVars } from './lib/magic-vars';
+import { recordTelemetry } from './lib/telemetry';

 export type ComponentLoader = (componentId: string) => Promise<ComponentRunner>;
 export type WorkflowLoader = (workflowId: string) => Promise<ExecutionGraph>;
@@ -352,14 +353,25 @@ export class GraphExecutor {
      if (e instanceof WorkflowPaused) throw e;

      const errMsg = e.message || String(e);
+      const duration_ms = Date.now() - start;
      trace.push({
        nodeId: node.id,
        type: node.type,
        input: nodeInput,
        output: null,
        error: errMsg,
-        duration_ms: Date.now() - start,
+        duration_ms,
      });
+      // Step-level telemetry：node 失敗事件（LI SDD M2.x 自評建議）
+      if (this.env && node.type === 'Component') {
+        recordTelemetry(this.env, this.apiKey, {
+          event_type: 'node_failure',
+          workflow_name: graph.name,
+          component_id: node.componentId,
+          error_code: 'node_error',
+          duration_ms,
+        });
+      }
      // 若已是 ExecutionError（上游節點拋出），保留原始 trace 繼續往上傳
      if (e instanceof ExecutionError) throw e;
      throw new ExecutionError(
@@ -370,14 +382,26 @@ export class GraphExecutor {
      );
    }

+    const duration_ms = Date.now() - start;
    trace.push({
      nodeId: node.id,
      type: node.type,
      input: nodeInput,
      output: result,
-      duration_ms: Date.now() - start,
+      duration_ms,
    });

+    // Step-level telemetry：node 成功事件（只記 Component，Input/Output 跳過）
+    // LI SDD M2.x：給 weekly_review 提的「效能基準線」建議用 — 每個 node duration 都可追
+    if (this.env && node.type === 'Component') {
+      recordTelemetry(this.env, this.apiKey, {
+        event_type: 'node_success',
+        workflow_name: graph.name,
+        component_id: node.componentId,
+        duration_ms,
+      });
+    }
+
    // 處理出邊
    const outEdges = graph.edges.filter(e => e.from === node.id);

@@ -21,7 +21,9 @@ export type TelemetryEvent =
  | 'run_success'
  | 'run_fail'
  | 'validation_error'
-  | 'mcp_tool_call';
+  | 'mcp_tool_call'
+  | 'node_success'        // 單一 node 跑完（給 step-level 效能分析用）
+  | 'node_failure';       // 單一 node 失敗

 export interface TelemetryRecord {
  event_type: TelemetryEvent;