myclaude/skills/harness/hooks/harness-claim.py

#!/usr/bin/env python3
from __future__ import annotations

import datetime as _dt
import hashlib
import json
import os
import shutil
import socket
import sys
import time
from pathlib import Path
from typing import Any, Optional


def _utc_now() -> _dt.datetime:
    return _dt.datetime.now(tz=_dt.timezone.utc)


def _iso_z(dt: _dt.datetime) -> str:
    dt = dt.astimezone(_dt.timezone.utc).replace(microsecond=0)
    return dt.isoformat().replace("+00:00", "Z")


def _parse_iso(ts: Any) -> Optional[_dt.datetime]:
    if not isinstance(ts, str) or not ts.strip():
        return None
    s = ts.strip()
    if s.endswith("Z"):
        s = s[:-1] + "+00:00"
    try:
        dt = _dt.datetime.fromisoformat(s)
    except Exception:
        return None
    if dt.tzinfo is None:
        dt = dt.replace(tzinfo=_dt.timezone.utc)
    return dt.astimezone(_dt.timezone.utc)


def _read_payload() -> dict[str, Any]:
    raw = sys.stdin.read()
    if not raw.strip():
        return {}
    try:
        data = json.loads(raw)
        return data if isinstance(data, dict) else {}
    except Exception:
        return {}


def _find_state_root(payload: dict[str, Any]) -> Optional[Path]:
    state_root = os.environ.get("HARNESS_STATE_ROOT")
    if state_root:
        p = Path(state_root)
        if (p / "harness-tasks.json").is_file():
            try:
                return p.resolve()
            except Exception:
                return p

    candidates: list[Path] = []
    env_dir = os.environ.get("CLAUDE_PROJECT_DIR")
    if env_dir:
        candidates.append(Path(env_dir))

    cwd = payload.get("cwd") or os.getcwd()
    candidates.append(Path(cwd))

    seen: set[str] = set()
    for base in candidates:
        try:
            base = base.resolve()
        except Exception:
            continue
        if str(base) in seen:
            continue
        seen.add(str(base))
        for parent in [base, *list(base.parents)[:8]]:
            if (parent / "harness-tasks.json").is_file():
                return parent
    return None


def _lockdir_for_root(root: Path) -> Path:
    h = hashlib.sha256(str(root).encode("utf-8")).hexdigest()[:16]
    return Path("/tmp") / f"harness-{h}.lock"


def _pid_alive(pid: int) -> bool:
    try:
        os.kill(pid, 0)
        return True
    except Exception:
        return False


def _read_pid(lockdir: Path) -> Optional[int]:
    try:
        raw = (lockdir / "pid").read_text("utf-8").strip()
        return int(raw) if raw else None
    except Exception:
        return None


def _acquire_lock(lockdir: Path, timeout_seconds: float) -> None:
    deadline = time.time() + timeout_seconds
    missing_pid_since: Optional[float] = None
    while True:
        try:
            lockdir.mkdir(mode=0o700)
            (lockdir / "pid").write_text(str(os.getpid()), encoding="utf-8")
            return
        except FileExistsError:
            pid = _read_pid(lockdir)
            if pid is None:
                if missing_pid_since is None:
                    missing_pid_since = time.time()
                if time.time() - missing_pid_since < 1.0:
                    if time.time() >= deadline:
                        raise TimeoutError("lock busy (pid missing)")
                    time.sleep(0.05)
                    continue
            else:
                missing_pid_since = None
                if _pid_alive(pid):
                    if time.time() >= deadline:
                        raise TimeoutError(f"lock busy (pid={pid})")
                    time.sleep(0.05)
                    continue

            stale = lockdir.with_name(f"{lockdir.name}.stale.{os.getpid()}.{int(time.time())}")
            try:
                lockdir.rename(stale)
            except Exception:
                if time.time() >= deadline:
                    raise TimeoutError("lock contention")
                time.sleep(0.05)
                continue
            shutil.rmtree(stale, ignore_errors=True)
            missing_pid_since = None
            continue


def _release_lock(lockdir: Path) -> None:
    shutil.rmtree(lockdir, ignore_errors=True)


def _load_state(path: Path) -> dict[str, Any]:
    with path.open("r", encoding="utf-8") as f:
        data = json.load(f)
    if not isinstance(data, dict):
        raise ValueError("harness-tasks.json must be an object")
    return data


def _atomic_write_json(path: Path, data: dict[str, Any]) -> None:
    bak = path.with_name(f"{path.name}.bak")
    tmp = path.with_name(f"{path.name}.tmp")
    shutil.copy2(path, bak)
    tmp.write_text(json.dumps(data, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")
    os.replace(tmp, path)


def _priority_rank(v: Any) -> int:
    return {"P0": 0, "P1": 1, "P2": 2}.get(str(v or ""), 9)


def _eligible_tasks(tasks: list[dict[str, Any]]) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]:
    completed = {str(t.get("id", "")) for t in tasks if str(t.get("status", "")) == "completed"}

    def deps_ok(t: dict[str, Any]) -> bool:
        deps = t.get("depends_on") or []
        if not isinstance(deps, list):
            return False
        return all(str(d) in completed for d in deps)

    def attempts(t: dict[str, Any]) -> int:
        try:
            return int(t.get("attempts") or 0)
        except Exception:
            return 0

    def max_attempts(t: dict[str, Any]) -> int:
        try:
            v = t.get("max_attempts")
            return int(v) if v is not None else 3
        except Exception:
            return 3

    pending = [t for t in tasks if str(t.get("status", "")) == "pending" and deps_ok(t)]
    retry = [
        t
        for t in tasks
        if str(t.get("status", "")) == "failed"
        and attempts(t) < max_attempts(t)
        and deps_ok(t)
    ]

    def key(t: dict[str, Any]) -> tuple[int, str]:
        return (_priority_rank(t.get("priority")), str(t.get("id", "")))

    pending.sort(key=key)
    retry.sort(key=key)
    return pending, retry


def _reap_stale_leases(tasks: list[dict[str, Any]], now: _dt.datetime) -> bool:
    changed = False
    for t in tasks:
        if str(t.get("status", "")) != "in_progress":
            continue
        exp = _parse_iso(t.get("lease_expires_at"))
        if exp is None or exp > now:
            continue

        try:
            t["attempts"] = int(t.get("attempts") or 0) + 1
        except Exception:
            t["attempts"] = 1

        err = f"[SESSION_TIMEOUT] Lease expired (claimed_by={t.get('claimed_by')})"
        log = t.get("error_log")
        if isinstance(log, list):
            log.append(err)
        else:
            t["error_log"] = [err]

        t["status"] = "failed"
        t.pop("claimed_by", None)
        t.pop("lease_expires_at", None)
        t.pop("claimed_at", None)
        changed = True
    return changed


def main() -> int:
    payload = _read_payload()
    root = _find_state_root(payload)
    if root is None:
        print(json.dumps({"claimed": False, "error": "state root not found"}, ensure_ascii=False))
        return 0

    tasks_path = root / "harness-tasks.json"
    lockdir = _lockdir_for_root(root)

    timeout_s = float(os.environ.get("HARNESS_LOCK_TIMEOUT_SECONDS") or "5")
    _acquire_lock(lockdir, timeout_s)
    try:
        state = _load_state(tasks_path)
        session_config = state.get("session_config") or {}
        if not isinstance(session_config, dict):
            session_config = {}
        concurrency_mode = str(session_config.get("concurrency_mode") or "exclusive")
        is_concurrent = concurrency_mode == "concurrent"
        tasks_raw = state.get("tasks") or []
        if not isinstance(tasks_raw, list):
            raise ValueError("tasks must be a list")
        tasks = [t for t in tasks_raw if isinstance(t, dict)]

        now = _utc_now()
        if _reap_stale_leases(tasks, now):
            state["tasks"] = tasks
            _atomic_write_json(tasks_path, state)

        pending, retry = _eligible_tasks(tasks)
        task = pending[0] if pending else (retry[0] if retry else None)
        if task is None:
            print(json.dumps({"claimed": False}, ensure_ascii=False))
            return 0

        worker_id = os.environ.get("HARNESS_WORKER_ID") or ""
        if is_concurrent and not worker_id:
            print(json.dumps({"claimed": False, "error": "missing HARNESS_WORKER_ID"}, ensure_ascii=False))
            return 0
        if not worker_id:
            worker_id = f"{socket.gethostname()}:{os.getpid()}"
        lease_seconds = int(os.environ.get("HARNESS_LEASE_SECONDS") or "1800")
        exp = now + _dt.timedelta(seconds=lease_seconds)

        task["status"] = "in_progress"
        task["claimed_by"] = worker_id
        task["claimed_at"] = _iso_z(now)
        task["lease_expires_at"] = _iso_z(exp)
        state["tasks"] = tasks
        _atomic_write_json(tasks_path, state)

        out = {
            "claimed": True,
            "worker_id": worker_id,
            "task_id": str(task.get("id") or ""),
            "title": str(task.get("title") or ""),
            "lease_expires_at": task["lease_expires_at"],
        }
        print(json.dumps(out, ensure_ascii=False))
        return 0
    finally:
        _release_lock(lockdir)


if __name__ == "__main__":
    raise SystemExit(main())