# Arcrun/scripts/sync-registry-to-kbdb.py

#!/usr/bin/env python3
"""sync-registry-to-kbdb.py — 把 registry/examples + registry/skills 同步進 KBDB

對應 LI SDD M3.4。examples / skills 在 git 是 source of truth，
KBDB 是「給 AI 搜尋 / get」的 query-friendly mirror。

2026-06-14 重寫：KBDB 降基本盤後（三表 entries/templates/records，無 v3 blocks 表、
無 kbdb-upsert-block 零件 worker），原打 https://kbdb-upsert-block.arcrun.dev/ 全失效。
改打基本盤 KBDB Worker 的 /entries：
- examples → entry_type=workflow-example
    content = workflow.yaml 全文
    metadata_json = { slug, description_md, tags, source }
    tags_json = ["workflow-example", "example:{slug}", *tags]
    page_name = example-{slug}    (idempotency key)
- skills → entry_type=agent-skill
    content = {slug}.md 全文
    metadata_json = { slug, title, source }
    tags_json = ["agent-skill", "skill:{slug}"]
    page_name = skill-{slug}      (idempotency key)

基本盤無 upsert 端點 → 本腳本自己做 idempotency（GET ?page_name= 找到則 PATCH /entries/:id，
否則 POST /entries）。這是 ops 同步腳本（非 CLI/MCP 薄殼），自行編排不違反 rule 07 薄殼原則。

執行：
    cd matrix/arcrun
    KBDB_BASE_URL=https://arcrun-kbdb.<subdomain>.workers.dev python3 scripts/sync-registry-to-kbdb.py
    python3 scripts/sync-registry-to-kbdb.py --dry-run    # 只 list 不寫

設定：
- KBDB_BASE_URL  KBDB 基本盤 Worker 的 base URL（必填，無預設——避免誤打到別的環境）
- KBDB_OWNER_ID  資料歸屬標記（選填，預設 'registry'；基本盤多租戶用 owner_id）
"""

import argparse
import json
import os
import sys
import urllib.request
import urllib.error
from pathlib import Path

# Repository layout: this script lives in <root>/scripts/, the data it syncs
# lives under <root>/registry/.
ARCRUN_ROOT = Path(__file__).resolve().parent.parent
_REGISTRY_DIR = ARCRUN_ROOT / "registry"
EXAMPLES_DIR = _REGISTRY_DIR / "examples"
SKILLS_DIR = _REGISTRY_DIR / "skills"

# Sent as the User-Agent header on every KBDB request.
USER_AGENT = "arcrun-registry-sync/2.0"
# Value written to each entry's owner_id; overridable through KBDB_OWNER_ID.
OWNER_ID = os.getenv("KBDB_OWNER_ID", "registry")
# Provenance marker stored inside each entry's metadata_json.
SOURCE = "registry-git-sync"


def get_base_url() -> str:
    """Return the KBDB base Worker URL taken from ``KBDB_BASE_URL``.

    There is deliberately no default, so a run can never hit an unintended
    environment by accident. Surrounding whitespace and trailing slashes are
    removed so callers can append paths such as ``/entries`` directly.

    Returns:
        The normalised base URL, without a trailing slash.

    Raises:
        SystemExit: If the variable is unset or blank, or if its value does
            not start with ``http://`` or ``https://``.
    """
    url = os.environ.get("KBDB_BASE_URL", "").strip().rstrip("/")
    if not url:
        raise SystemExit(
            "KBDB_BASE_URL 未設定。\n"
            "  export KBDB_BASE_URL=https://arcrun-kbdb.<subdomain>.workers.dev\n"
            "  （self-hosted 用自己部署的 KBDB Worker URL）"
        )
    # urllib.request.Request raises a bare ValueError ("unknown url type") for
    # scheme-less URLs; reject them here with a readable message instead.
    if not url.lower().startswith(("http://", "https://")):
        raise SystemExit(f"KBDB_BASE_URL 必須以 http:// 或 https:// 開頭：{url!r}")
    return url


def _req(method: str, url: str, payload: dict | None = None) -> dict:
    """Send one JSON request and return the decoded JSON object.

    Transport and decoding failures are never raised; they are reported as
    ``{"error": "<message>"}`` so callers can count a failure and continue
    with the next item.

    Args:
        method: HTTP method, e.g. ``"GET"``, ``"POST"`` or ``"PATCH"``.
        url: Absolute request URL.
        payload: Optional JSON-serialisable body; omitted when ``None``.

    Returns:
        The decoded JSON object on success, ``{}`` when the response body is
        empty, or ``{"error": ...}`` on an HTTP error status, a network
        failure, an undecodable body, or a JSON value that is not an object.
    """
    data = None if payload is None else json.dumps(payload, ensure_ascii=False).encode("utf-8")
    req = urllib.request.Request(
        url,
        data=data,
        headers={"Content-Type": "application/json", "User-Agent": USER_AGENT},
        method=method,
    )
    try:
        with urllib.request.urlopen(req, timeout=30) as resp:
            raw = resp.read()
    except urllib.error.HTTPError as e:
        body = e.read().decode("utf-8", errors="replace")
        return {"error": f"HTTP {e.code}: {body[:200]}"}
    except urllib.error.URLError as e:
        return {"error": f"URL error: {e}"}
    except OSError as e:
        # Timeouts / connection resets while reading the body are raised as
        # plain OSError subclasses rather than being wrapped in URLError.
        return {"error": f"Network error: {e}"}

    if not raw.strip():
        # Empty success body (e.g. 204 No Content): nothing to decode.
        return {}
    try:
        # UnicodeDecodeError and json.JSONDecodeError are both ValueError.
        parsed = json.loads(raw.decode("utf-8"))
    except ValueError as e:
        return {"error": f"Invalid JSON response: {e}"}
    if not isinstance(parsed, dict):
        # Callers use dict methods on the result; surface other shapes as errors.
        return {"error": f"Unexpected JSON response type: {type(parsed).__name__}"}
    return parsed


def find_entry_id_by_page_name(base_url: str, page_name: str) -> str | None:
    """Look up an existing entry by ``page_name`` (the idempotency key).

    Issues ``GET {base_url}/entries?page_name=...&limit=1`` and returns the
    ``id`` field of the first entry in the response.

    Returns:
        The entry id, or ``None`` when the response carries an ``error`` key
        or lists no entries. Note that a failed lookup is therefore
        indistinguishable from "not found" for the caller.
    """
    from urllib.parse import quote

    lookup_url = f"{base_url}/entries?page_name={quote(page_name)}&limit=1"
    response = _req("GET", lookup_url)
    if "error" in response:
        return None

    matches = response.get("entries") or []
    if not matches:
        return None
    return matches[0].get("id")


def upsert_entry(base_url: str, payload: dict, dry_run: bool) -> dict:
    """page_name 當 idempotency key：找到則 PATCH /entries/:id，否則 POST /entries。"""
    page_name = payload.get("page_name")
    if dry_run:
        existing = None if base_url == "DRY" else find_entry_id_by_page_name(base_url, page_name)
        return {"dry_run": True, "would": "patch" if existing else "post", "page_name": page_name}

    existing_id = find_entry_id_by_page_name(base_url, page_name)
    if existing_id:
        # PATCH 只送可變欄位（entry_type/page_name 不變）
        patch = {k: payload[k] for k in ("content", "tags_json", "metadata_json") if k in payload}
        res = _req("PATCH", f"{base_url}/entries/{existing_id}", patch)
        if "error" not in res:
            res.setdefault("action", "update")
        return res
    res = _req("POST", f"{base_url}/entries", payload)
    if "error" not in res:
        res.setdefault("action", "create")
    return res


def sync_examples(base_url: str, dry_run: bool) -> tuple[int, int]:
    """Sync every ``registry/examples/{slug}/`` directory into KBDB.

    Each directory becomes one ``entry_type=workflow-example`` entry whose
    content is the full text of ``workflow.yaml``. ``description.md`` and
    ``tags.json`` are optional. Directories without ``workflow.yaml`` are
    skipped with a warning and counted as neither success nor failure.

    A directory whose files cannot be read or decoded, or whose ``tags.json``
    is not a JSON array, is reported and counted as a failure; the remaining
    directories are still processed.

    Args:
        base_url: KBDB base URL (or the ``"DRY"`` sentinel) passed through to
            ``upsert_entry``.
        dry_run: When true, nothing is written.

    Returns:
        ``(ok, fail)`` counts.
    """
    if not EXAMPLES_DIR.exists():
        print(f"⚠️  {EXAMPLES_DIR} 不存在，跳過 examples 同步")
        return 0, 0

    ok, fail = 0, 0
    for slug_dir in sorted(EXAMPLES_DIR.iterdir()):
        if not slug_dir.is_dir():
            continue
        slug = slug_dir.name
        workflow_yaml = slug_dir / "workflow.yaml"
        description_md = slug_dir / "description.md"
        tags_json = slug_dir / "tags.json"

        if not workflow_yaml.exists():
            print(f"  ⚠️  {slug}: 缺 workflow.yaml，跳過")
            continue

        try:
            yaml_content = workflow_yaml.read_text(encoding="utf-8")
            description = description_md.read_text(encoding="utf-8") if description_md.exists() else ""
            tags = json.loads(tags_json.read_text(encoding="utf-8")) if tags_json.exists() else []
            if not isinstance(tags, list):
                # A dict or string would be silently spread into keys / characters below.
                raise ValueError(f"tags.json 必須是 JSON 陣列，實際為 {type(tags).__name__}")
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError too.
            print(f"  ❌ {slug}: 載入失敗 — {e}")
            fail += 1
            continue

        payload = {
            "entry_type": "workflow-example",
            "page_name": f"example-{slug}",
            "owner_id": OWNER_ID,
            "content": yaml_content,
            "metadata_json": json.dumps(
                {"slug": slug, "description_md": description, "tags": tags, "source": SOURCE},
                ensure_ascii=False,
            ),
            "tags_json": json.dumps(
                ["workflow-example", f"example:{slug}", *tags], ensure_ascii=False
            ),
        }

        result = upsert_entry(base_url, payload, dry_run)
        if "error" in result:
            print(f"  ❌ {slug}: {result['error']}")
            fail += 1
        else:
            print(f"  ✅ {slug} → {result.get('action', 'dry-run:' + result.get('would', '?'))}")
            ok += 1

    return ok, fail


def sync_skills(base_url: str, dry_run: bool) -> tuple[int, int]:
    """Sync every ``registry/skills/*.md`` file (except README.md) into KBDB.

    Each file becomes one ``entry_type=agent-skill`` entry whose content is
    the full file text. The title is the text of the first line that starts
    with ``"# "`` after stripping, falling back to the file stem.

    A file that cannot be read or decoded as UTF-8 is reported and counted as
    a failure; the remaining files are still processed.

    Args:
        base_url: KBDB base URL (or the ``"DRY"`` sentinel) passed through to
            ``upsert_entry``.
        dry_run: When true, nothing is written.

    Returns:
        ``(ok, fail)`` counts.
    """
    if not SKILLS_DIR.exists():
        print(f"⚠️  {SKILLS_DIR} 不存在，跳過 skills 同步")
        return 0, 0

    ok, fail = 0, 0
    for md_file in sorted(SKILLS_DIR.glob("*.md")):
        if md_file.name == "README.md":
            continue
        slug = md_file.stem

        try:
            content = md_file.read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError) as e:
            # Also covers a directory that happens to be named "*.md".
            print(f"  ❌ {slug}: 載入失敗 — {e}")
            fail += 1
            continue

        stripped_lines = (line.strip() for line in content.splitlines())
        title = next(
            (line[2:].strip() for line in stripped_lines if line.startswith("# ")),
            slug,
        )

        payload = {
            "entry_type": "agent-skill",
            "page_name": f"skill-{slug}",
            "owner_id": OWNER_ID,
            "content": content,
            "metadata_json": json.dumps(
                {"slug": slug, "title": title, "source": SOURCE}, ensure_ascii=False
            ),
            "tags_json": json.dumps(["agent-skill", f"skill:{slug}"], ensure_ascii=False),
        }

        result = upsert_entry(base_url, payload, dry_run)
        if "error" in result:
            print(f"  ❌ {slug}: {result['error']}")
            fail += 1
        else:
            print(f"  ✅ {slug} → {result.get('action', 'dry-run:' + result.get('would', '?'))}")
            ok += 1

    return ok, fail


def main() -> None:
    """Parse CLI flags, run the requested syncs and exit non-zero on failure.

    ``--examples-only`` and ``--skills-only`` are mutually exclusive: passing
    both would otherwise skip both syncs and still report success.

    With ``--dry-run`` and no ``KBDB_BASE_URL`` set, the sentinel base URL
    ``"DRY"`` is used so that no HTTP request is made at all.

    Raises:
        SystemExit: With status 1 when any item failed, with status 2 on
            invalid arguments (raised by argparse), or from
            ``get_base_url`` when ``KBDB_BASE_URL`` is required but missing.
    """
    p = argparse.ArgumentParser(description="Sync registry/examples + skills → KBDB base (/entries)")
    p.add_argument("--dry-run", action="store_true", help="只 list 不寫")
    scope = p.add_mutually_exclusive_group()
    scope.add_argument("--examples-only", action="store_true")
    scope.add_argument("--skills-only", action="store_true")
    args = p.parse_args()

    base_url = "DRY" if args.dry_run and not os.environ.get("KBDB_BASE_URL") else get_base_url()
    print(f"🌐 KBDB base: {base_url}")
    print(f"📂 root: {ARCRUN_ROOT}  (owner_id={OWNER_ID})")
    if args.dry_run:
        print("(dry-run，不實際寫 KBDB)")
    print()

    examples_ok = examples_fail = 0
    skills_ok = skills_fail = 0

    if not args.skills_only:
        print("📋 Syncing examples → entry_type=workflow-example ...")
        examples_ok, examples_fail = sync_examples(base_url, args.dry_run)
        print(f"   examples: {examples_ok} ok / {examples_fail} fail\n")

    if not args.examples_only:
        print("📋 Syncing skills → entry_type=agent-skill ...")
        skills_ok, skills_fail = sync_skills(base_url, args.dry_run)
        print(f"   skills: {skills_ok} ok / {skills_fail} fail\n")

    total_fail = examples_fail + skills_fail
    if total_fail > 0:
        print(f"⚠️  共 {total_fail} 個項目失敗")
        sys.exit(1)
    print(f"✅ Done. examples={examples_ok}, skills={skills_ok}")


# Run the sync only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()