#!/usr/bin/env python3
# Arcrun/scripts/sync-registry-to-kbdb.py
"""sync-registry-to-kbdb.py — sync registry/examples + registry/skills into KBDB.

Corresponds to LI SDD M3.4. Git is the source of truth for examples / skills;
KBDB is the query-friendly mirror that AI agents search / get from.

KBDB block mapping:
- examples → type=workflow-example
  content = full text of workflow.yaml
  metadata_json = { slug, description_md, tags }
  tags_json = ["workflow-example", "example:{slug}", ...tags.json]
  page_name = example-{slug}    (idempotency key; re-running the sync upserts)

- skills → type=agent-skill
  content = full text of {slug}.md
  metadata_json = { slug, title }
  page_name = skill-{slug}     (idempotency key)
  tags_json = ["agent-skill", "skill:{slug}"]

Usage:
    cd matrix/arcrun
    python3 scripts/sync-registry-to-kbdb.py            # upload everything
    python3 scripts/sync-registry-to-kbdb.py --dry-run  # list only, no writes

Requirements:
- Follows the style of mira tools/_kbdb_client.py (urllib + ak_ key)
- ARCRUN_API_KEY comes from an env var or from .env
- Goes through the kbdb-*.arcrun.dev component worker endpoints
  (per mira CLAUDE.md §1.7)
"""

import argparse
import json
import os
import sys
import urllib.request
import urllib.error
from pathlib import Path

# Filesystem layout: this script sits one directory below the arcrun root,
# and the registry folders hang off that root.
ARCRUN_ROOT = Path(__file__).resolve().parent.parent
EXAMPLES_DIR = ARCRUN_ROOT.joinpath("registry", "examples")
SKILLS_DIR = ARCRUN_ROOT.joinpath("registry", "skills")

# KBDB request settings shared by every upsert.
KBDB_UPSERT_URL = "https://kbdb-upsert-block.arcrun.dev/"
USER_AGENT = "arcrun-registry-sync/1.0"
# Must match the KBDB partner namespace prefix (inkstone_*).
USER_ID = "inkstone_platform_registry"
SOURCE = "registry-git-sync"


def get_api_key() -> str:
    """Return the ARCRUN_API_KEY used for KBDB requests.

    Lookup order:
      1. The ``ARCRUN_API_KEY`` environment variable, when non-empty.
      2. The first non-empty ``ARCRUN_API_KEY=...`` assignment found in
         ``polaris/mira/.env``, located two directories above ``ARCRUN_ROOT``.

    When reading the ``.env`` file, a leading ``export `` and one pair of
    matching surrounding quotes are stripped, so ``export ARCRUN_API_KEY="ak_x"``
    yields ``ak_x``. An empty assignment is treated as "not set".

    Returns:
        The API key string.

    Raises:
        SystemExit: if neither source provides a non-empty key.
    """
    key = os.environ.get("ARCRUN_API_KEY", "")
    if key:
        return key

    # Fallback: polaris/mira/.env (the pre-agreed location).
    mira_env = ARCRUN_ROOT.parent.parent / "polaris" / "mira" / ".env"
    if mira_env.is_file():
        for line in mira_env.read_text(encoding="utf-8").splitlines():
            # Accept shell-style "export NAME=value" lines as well.
            line = line.strip().removeprefix("export ").lstrip()
            name, sep, value = line.partition("=")
            if not sep or name.strip() != "ARCRUN_API_KEY":
                continue
            value = value.strip()
            # .env files commonly quote values; the quotes are not part of the key.
            if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
                value = value[1:-1].strip()
            if value:
                return value

    raise SystemExit(
        "ARCRUN_API_KEY 未設定。export ARCRUN_API_KEY=ak_... 或加到 polaris/mira/.env"
    )


def kbdb_upsert(api_key: str, payload: dict, dry_run: bool) -> dict:
    """POST ``payload`` as JSON to ``KBDB_UPSERT_URL`` and return the reply.

    The payload's ``page_name`` acts as the idempotency key for the upsert.

    Args:
        api_key: Not read by this function; kept for interface compatibility.
            The request carries no auth header, so any credential must
            already be inside ``payload``.
        payload: JSON-serialisable request body.
        dry_run: When true, no request is made and a
            ``{"dry_run": True, "would_upsert": <page_name>}`` dict is returned.

    Returns:
        The decoded JSON object from the response on success. On any failure
        (HTTP error status, network error or timeout, body that is not valid
        JSON, or JSON that is not an object) a dict with a single ``"error"``
        key describing the problem, so one bad item never aborts a whole sync.
    """
    if dry_run:
        return {"dry_run": True, "would_upsert": payload.get("page_name")}

    data = json.dumps(payload, ensure_ascii=False).encode("utf-8")
    req = urllib.request.Request(
        KBDB_UPSERT_URL,
        data=data,
        headers={
            "Content-Type": "application/json",
            "User-Agent": USER_AGENT,
        },
        method="POST",
    )
    try:
        with urllib.request.urlopen(req, timeout=30) as resp:
            raw = resp.read()
    except urllib.error.HTTPError as e:
        body = e.read().decode("utf-8", errors="replace")
        return {"error": f"HTTP {e.code}: {body[:200]}"}
    except OSError as e:
        # URLError (DNS failure, refused connection, ...) and socket timeouts
        # are OSError subclasses; report them instead of crashing the run.
        return {"error": f"network error: {e}"}

    try:
        result = json.loads(raw.decode("utf-8"))
    except ValueError as e:
        # Covers both UnicodeDecodeError and json.JSONDecodeError.
        return {"error": f"invalid JSON response: {e}"}
    if not isinstance(result, dict):
        # Callers call .get() on the result, so anything but an object is a failure.
        return {"error": f"unexpected response type: {type(result).__name__}"}
    return result


def sync_examples(api_key: str, dry_run: bool) -> tuple[int, int]:
    """Upsert every ``registry/examples/{slug}/`` directory into KBDB.

    For each sub-directory of ``EXAMPLES_DIR`` (in sorted order):
      * ``workflow.yaml`` is required; directories without it are skipped
        with a warning and are counted as neither success nor failure.
      * ``description.md`` is optional (defaults to an empty string).
      * ``tags.json`` is optional (defaults to ``[]``). If present it must be
        a JSON array; a file that cannot be parsed, or that holds another
        JSON type, is reported as a failure for that slug and the loop moves
        on to the next example.

    Args:
        api_key: Key placed in each payload and passed to ``kbdb_upsert``.
        dry_run: Forwarded to ``kbdb_upsert``.

    Returns:
        ``(ok, fail)`` counts. ``(0, 0)`` when ``EXAMPLES_DIR`` does not exist.
    """
    if not EXAMPLES_DIR.exists():
        print(f"⚠️  {EXAMPLES_DIR} 不存在，跳過 examples 同步")
        return 0, 0

    ok, fail = 0, 0
    for slug_dir in sorted(EXAMPLES_DIR.iterdir()):
        if not slug_dir.is_dir():
            continue
        slug = slug_dir.name
        workflow_yaml = slug_dir / "workflow.yaml"
        description_md = slug_dir / "description.md"
        tags_json = slug_dir / "tags.json"

        if not workflow_yaml.exists():
            print(f"  ⚠️  {slug}: 缺 workflow.yaml，跳過")
            continue

        yaml_content = workflow_yaml.read_text(encoding="utf-8")
        description = (
            description_md.read_text(encoding="utf-8") if description_md.exists() else ""
        )

        tags = []
        if tags_json.exists():
            try:
                tags = json.loads(tags_json.read_text(encoding="utf-8"))
            except ValueError as e:
                # Covers json.JSONDecodeError and UnicodeDecodeError: one broken
                # file must not abort the rest of the sync.
                print(f"  ❌ {slug}: tags.json 無法解析: {e}")
                fail += 1
                continue
            if not isinstance(tags, list):
                # A dict or string would be silently spread into the tag list below.
                print(f"  ❌ {slug}: tags.json 必須是 JSON array")
                fail += 1
                continue

        # content = the workflow YAML (so semantic search can hit the YAML text);
        # metadata_json = description + tags in structured form.
        payload = {
            "api_key": api_key,
            "type": "workflow-example",
            "page_name": f"example-{slug}",
            "source": SOURCE,
            "user_id": USER_ID,
            "content": yaml_content,
            "metadata_json": json.dumps(
                {
                    "slug": slug,
                    "description_md": description,
                    "tags": tags,
                },
                ensure_ascii=False,
            ),
            "tags_json": json.dumps(
                ["workflow-example", f"example:{slug}", *tags],
                ensure_ascii=False,
            ),
        }

        result = kbdb_upsert(api_key, payload, dry_run)
        if "error" in result:
            print(f"  ❌ {slug}: {result['error']}")
            fail += 1
        else:
            # The action label is read from result["data"]["action"] when present.
            data = result.get("data")
            action = data.get("action", "?") if isinstance(data, dict) else "?"
            print(f"  ✅ {slug} → {action}")
            ok += 1

    return ok, fail


def sync_skills(api_key: str, dry_run: bool) -> tuple[int, int]:
    """Upsert every ``registry/skills/*.md`` file into KBDB.

    ``README.md`` is ignored. Each remaining file becomes one ``agent-skill``
    block whose ``page_name`` is ``skill-{slug}`` (slug = file stem) and whose
    content is the full markdown text. The title stored in ``metadata_json``
    is the first line that, once stripped, starts with ``"# "``; if there is
    none the slug is used.

    Args:
        api_key: Key placed in each payload and passed to ``kbdb_upsert``.
        dry_run: Forwarded to ``kbdb_upsert``.

    Returns:
        ``(ok, fail)`` counts. ``(0, 0)`` when ``SKILLS_DIR`` does not exist.
    """
    if not SKILLS_DIR.exists():
        print(f"⚠️  {SKILLS_DIR} 不存在，跳過 skills 同步")
        return 0, 0

    succeeded = 0
    failed = 0
    skill_files = [
        path for path in sorted(SKILLS_DIR.glob("*.md")) if path.name != "README.md"
    ]

    for skill_path in skill_files:
        slug = skill_path.stem
        markdown = skill_path.read_text(encoding="utf-8")

        # Title = text of the first "# ..." heading, falling back to the slug.
        stripped_lines = (raw.strip() for raw in markdown.splitlines())
        title = next(
            (text[2:].strip() for text in stripped_lines if text.startswith("# ")),
            slug,
        )

        metadata = json.dumps({"slug": slug, "title": title}, ensure_ascii=False)
        tag_list = json.dumps(["agent-skill", f"skill:{slug}"], ensure_ascii=False)
        payload = {
            "api_key": api_key,
            "type": "agent-skill",
            "page_name": f"skill-{slug}",
            "source": SOURCE,
            "user_id": USER_ID,
            "content": markdown,
            "metadata_json": metadata,
            "tags_json": tag_list,
        }

        result = kbdb_upsert(api_key, payload, dry_run)
        if "error" in result:
            print(f"  ❌ {slug}: {result['error']}")
            failed += 1
            continue

        # The action label is read from result["data"]["action"] when present.
        data = result.get("data")
        action = data.get("action", "?") if isinstance(data, dict) else "?"
        print(f"  ✅ {slug} → {action}")
        succeeded += 1

    return succeeded, failed


def main() -> None:
    """Command-line entry point: sync examples and/or skills into KBDB.

    Flags:
        --dry-run        list what would be upserted without writing.
        --examples-only  sync only registry/examples.
        --skills-only    sync only registry/skills.

    ``--examples-only`` and ``--skills-only`` are mutually exclusive; passing
    both is rejected by argparse (exit status 2) instead of silently syncing
    nothing and reporting success.

    Raises:
        SystemExit: status 1 when at least one item failed, status 2 on
            invalid arguments, or whatever ``get_api_key`` raises when no key
            is configured.
    """
    p = argparse.ArgumentParser(description="Sync registry/examples + skills → KBDB")
    p.add_argument("--dry-run", action="store_true", help="只 list 不寫")
    # The two "only" flags select opposite halves of the work, so combining
    # them would leave nothing to do.
    scope = p.add_mutually_exclusive_group()
    scope.add_argument("--examples-only", action="store_true")
    scope.add_argument("--skills-only", action="store_true")
    args = p.parse_args()

    api_key = get_api_key()
    # Only a short prefix of the key is echoed, never the whole value.
    print(f"🔑 api_key: {api_key[:12]}... (len={len(api_key)})")
    print(f"📂 root: {ARCRUN_ROOT}")
    if args.dry_run:
        print("(dry-run，不實際寫 KBDB)")
    print()

    examples_ok = examples_fail = 0
    skills_ok = skills_fail = 0

    if not args.skills_only:
        print("📋 Syncing examples → type=workflow-example ...")
        examples_ok, examples_fail = sync_examples(api_key, args.dry_run)
        print(f"   examples: {examples_ok} ok / {examples_fail} fail\n")

    if not args.examples_only:
        print("📋 Syncing skills → type=agent-skill ...")
        skills_ok, skills_fail = sync_skills(api_key, args.dry_run)
        print(f"   skills: {skills_ok} ok / {skills_fail} fail\n")

    total_fail = examples_fail + skills_fail
    if total_fail > 0:
        print(f"⚠️  共 {total_fail} 個項目失敗")
        sys.exit(1)
    print(f"✅ Done. examples={examples_ok}, skills={skills_ok}")


# Run the sync only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()