diff --git a/ccw/.npmignore b/ccw/.npmignore
new file mode 100644
index 00000000..df4d3caf
--- /dev/null
+++ b/ccw/.npmignore
@@ -0,0 +1,15 @@
+# npm ignore file - overrides .gitignore for npm publish
+# dist/ is NOT excluded here so it gets published
+
+# Development files
+node_modules/
+*.log
+*.tmp
+
+# Test files
+tests/
+*.test.js
+*.spec.js
+
+# TypeScript source maps (optional, can keep for debugging)
+# *.map
diff --git a/ccw/package.json b/ccw/package.json
index bcd7f161..ba045c66 100644
--- a/ccw/package.json
+++ b/ccw/package.json
@@ -1,6 +1,6 @@
 {
   "name": "claude-code-workflow",
-  "version": "6.2.2",
+  "version": "6.2.4",
   "description": "Claude Code Workflow CLI - Dashboard viewer for workflow sessions and reviews",
   "type": "module",
   "main": "dist/index.js",
diff --git a/codex-lens/pyproject.toml b/codex-lens/pyproject.toml
index d9f5fdd8..3198bea6 100644
--- a/codex-lens/pyproject.toml
+++ b/codex-lens/pyproject.toml
@@ -31,6 +31,24 @@ semantic = [
     "hnswlib>=0.8.0",
 ]
 
+# GPU acceleration for semantic search (NVIDIA CUDA)
+# Install with: pip install codexlens[semantic-gpu]
+semantic-gpu = [
+    "numpy>=1.24",
+    "fastembed>=0.2",
+    "hnswlib>=0.8.0",
+    "onnxruntime-gpu>=1.15.0",  # CUDA support
+]
+
+# GPU acceleration for Windows (DirectML - supports NVIDIA/AMD/Intel)
+# Install with: pip install codexlens[semantic-directml]
+semantic-directml = [
+    "numpy>=1.24",
+    "fastembed>=0.2",
+    "hnswlib>=0.8.0",
+    "onnxruntime-directml>=1.15.0",  # DirectML support
+]
+
 # Encoding detection for non-UTF8 files
 encoding = [
     "chardet>=5.0",
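A quick way to verify which GPU-capable extra actually took effect is to ask ONNX Runtime for its execution providers. This is a minimal sketch, not part of the change; only the onnxruntime package itself is assumed, and the provider names vary with the installed wheel:

import onnxruntime as ort

# onnxruntime-gpu typically reports CUDAExecutionProvider,
# onnxruntime-directml reports DmlExecutionProvider; CPU is always present.
print(ort.get_available_providers())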
diff --git a/codex-lens/src/codexlens/semantic/__init__.py b/codex-lens/src/codexlens/semantic/__init__.py
index 57c0fcda..cb5e429d 100644
--- a/codex-lens/src/codexlens/semantic/__init__.py
+++ b/codex-lens/src/codexlens/semantic/__init__.py
@@ -2,38 +2,75 @@
 Install with: pip install codexlens[semantic]
 
 Uses fastembed (ONNX-based, lightweight ~200MB)
+
+GPU Acceleration:
+- Automatic GPU detection and usage when available
+- Supports CUDA (NVIDIA), TensorRT, DirectML (Windows), ROCm (AMD), CoreML (Apple)
+- Install GPU support: pip install onnxruntime-gpu (NVIDIA) or onnxruntime-directml (Windows)
 """
 
 from __future__ import annotations
 
 SEMANTIC_AVAILABLE = False
 SEMANTIC_BACKEND: str | None = None
+GPU_AVAILABLE = False
 _import_error: str | None = None
 
 
-def _detect_backend() -> tuple[bool, str | None, str | None]:
-    """Detect if fastembed is available."""
+def _detect_backend() -> tuple[bool, str | None, bool, str | None]:
+    """Detect if fastembed and GPU are available."""
     try:
         import numpy as np
     except ImportError as e:
-        return False, None, f"numpy not available: {e}"
+        return False, None, False, f"numpy not available: {e}"
 
     try:
         from fastembed import TextEmbedding
-        return True, "fastembed", None
+    except ImportError:
+        return False, None, False, "fastembed not available. Install with: pip install codexlens[semantic]"
+
+    # Check GPU availability
+    gpu_available = False
+    try:
+        from .gpu_support import is_gpu_available
+        gpu_available = is_gpu_available()
     except ImportError:
         pass
-    return False, None, "fastembed not available. Install with: pip install codexlens[semantic]"
+    return True, "fastembed", gpu_available, None
+
 
 # Initialize on module load
-SEMANTIC_AVAILABLE, SEMANTIC_BACKEND, _import_error = _detect_backend()
+SEMANTIC_AVAILABLE, SEMANTIC_BACKEND, GPU_AVAILABLE, _import_error = _detect_backend()
+
 
 def check_semantic_available() -> tuple[bool, str | None]:
     """Check if semantic search dependencies are available."""
     return SEMANTIC_AVAILABLE, _import_error
 
+
+def check_gpu_available() -> tuple[bool, str]:
+    """Check if GPU acceleration is available.
+
+    Returns:
+        Tuple of (is_available, status_message)
+    """
+    if not SEMANTIC_AVAILABLE:
+        return False, "Semantic search not available"
+
+    try:
+        from .gpu_support import is_gpu_available, get_gpu_summary
+        if is_gpu_available():
+            return True, get_gpu_summary()
+        return False, "No GPU detected (using CPU)"
+    except ImportError:
+        return False, "GPU support module not available"
+
+
 __all__ = [
     "SEMANTIC_AVAILABLE",
     "SEMANTIC_BACKEND",
+    "GPU_AVAILABLE",
    "check_semantic_available",
+    "check_gpu_available",
 ]
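A minimal usage sketch for the module-level flags and helpers added above, assuming codexlens is installed with one of the semantic extras:

from codexlens.semantic import GPU_AVAILABLE, check_gpu_available, check_semantic_available

available, error = check_semantic_available()
if not available:
    print(f"Semantic search disabled: {error}")
else:
    gpu_ok, status = check_gpu_available()
    # GPU_AVAILABLE carries the same detection result computed at import time
    print(f"GPU acceleration: {gpu_ok} ({status}) / module flag: {GPU_AVAILABLE}")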
diff --git a/codex-lens/src/codexlens/semantic/embedder.py b/codex-lens/src/codexlens/semantic/embedder.py
index 2cf20236..6ed0e698 100644
--- a/codex-lens/src/codexlens/semantic/embedder.py
+++ b/codex-lens/src/codexlens/semantic/embedder.py
@@ -1,22 +1,29 @@
-"""Embedder for semantic code search using fastembed."""
+"""Embedder for semantic code search using fastembed.
+
+Supports GPU acceleration via ONNX execution providers (CUDA, TensorRT, DirectML, ROCm, CoreML).
+GPU acceleration is automatic when available, with transparent CPU fallback.
+"""
 
 from __future__ import annotations
 
 import gc
+import logging
 import threading
 from typing import Dict, Iterable, List, Optional
 
 import numpy as np
 
 from . import SEMANTIC_AVAILABLE
+from .gpu_support import get_optimal_providers, is_gpu_available, get_gpu_summary
+
+logger = logging.getLogger(__name__)
 
 # Global embedder cache for singleton pattern
 _embedder_cache: Dict[str, "Embedder"] = {}
 _cache_lock = threading.Lock()
 
 
-def get_embedder(profile: str = "code") -> "Embedder":
+def get_embedder(profile: str = "code", use_gpu: bool = True) -> "Embedder":
     """Get or create a cached Embedder instance (thread-safe singleton).
 
     This function provides significant performance improvement by reusing
@@ -25,27 +32,38 @@ def get_embedder(profile: str = "code") -> "Embedder":
 
     Args:
         profile: Model profile ("fast", "code", "multilingual", "balanced")
+        use_gpu: If True, use GPU acceleration when available (default: True)
 
     Returns:
         Cached Embedder instance for the given profile
     """
     global _embedder_cache
 
+    # Cache key includes GPU preference to support mixed configurations
+    cache_key = f"{profile}:{'gpu' if use_gpu else 'cpu'}"
+
     # Fast path: check cache without lock
-    if profile in _embedder_cache:
-        return _embedder_cache[profile]
+    if cache_key in _embedder_cache:
+        return _embedder_cache[cache_key]
 
     # Slow path: acquire lock for initialization
     with _cache_lock:
         # Double-check after acquiring lock
-        if profile in _embedder_cache:
-            return _embedder_cache[profile]
+        if cache_key in _embedder_cache:
+            return _embedder_cache[cache_key]
 
         # Create new embedder and cache it
-        embedder = Embedder(profile=profile)
+        embedder = Embedder(profile=profile, use_gpu=use_gpu)
         # Pre-load model to ensure it's ready
         embedder._load_model()
-        _embedder_cache[profile] = embedder
+        _embedder_cache[cache_key] = embedder
+
+        # Log GPU status on first embedder creation
+        if use_gpu and is_gpu_available():
+            logger.info(f"Embedder initialized with GPU: {get_gpu_summary()}")
+        elif use_gpu:
+            logger.debug("GPU not available, using CPU for embeddings")
+
         return embedder
@@ -96,13 +114,21 @@ class Embedder:
     DEFAULT_MODEL = "BAAI/bge-small-en-v1.5"
     DEFAULT_PROFILE = "fast"
 
-    def __init__(self, model_name: str | None = None, profile: str | None = None) -> None:
+    def __init__(
+        self,
+        model_name: str | None = None,
+        profile: str | None = None,
+        use_gpu: bool = True,
+        providers: List[str] | None = None,
+    ) -> None:
         """Initialize embedder with model or profile.
 
         Args:
             model_name: Explicit model name (e.g., "jinaai/jina-embeddings-v2-base-code")
             profile: Model profile shortcut ("fast", "code", "multilingual", "balanced")
                 If both provided, model_name takes precedence.
+            use_gpu: If True, use GPU acceleration when available (default: True)
+            providers: Explicit ONNX providers list (overrides use_gpu if provided)
         """
         if not SEMANTIC_AVAILABLE:
             raise ImportError(
@@ -118,6 +144,13 @@ class Embedder:
         else:
             self.model_name = self.DEFAULT_MODEL
 
+        # Configure ONNX execution providers
+        if providers is not None:
+            self._providers = providers
+        else:
+            self._providers = get_optimal_providers(use_gpu=use_gpu)
+
+        self._use_gpu = use_gpu
         self._model = None
 
     @property
@@ -125,13 +158,39 @@ class Embedder:
         """Get embedding dimension for current model."""
         return self.MODEL_DIMS.get(self.model_name, 768)  # Default to 768 if unknown
 
+    @property
+    def providers(self) -> List[str]:
+        """Get configured ONNX execution providers."""
+        return self._providers
+
+    @property
+    def is_gpu_enabled(self) -> bool:
+        """Check if GPU acceleration is enabled for this embedder."""
+        gpu_providers = {"CUDAExecutionProvider", "TensorrtExecutionProvider",
+                         "DmlExecutionProvider", "ROCMExecutionProvider", "CoreMLExecutionProvider"}
+        return any(p in gpu_providers for p in self._providers)
+
     def _load_model(self) -> None:
-        """Lazy load the embedding model."""
+        """Lazy load the embedding model with configured providers."""
         if self._model is not None:
             return
 
         from fastembed import TextEmbedding
-        self._model = TextEmbedding(model_name=self.model_name)
+
+        # fastembed supports 'providers' parameter for ONNX execution providers
+        try:
+            self._model = TextEmbedding(
+                model_name=self.model_name,
+                providers=self._providers,
+            )
+            logger.debug(f"Model loaded with providers: {self._providers}")
+        except TypeError:
+            # Fallback for older fastembed versions without providers parameter
+            logger.warning(
+                "fastembed version doesn't support 'providers' parameter. "
+                "Upgrade fastembed for GPU acceleration: pip install --upgrade fastembed"
+            )
+            self._model = TextEmbedding(model_name=self.model_name)
 
     def embed(self, texts: str | Iterable[str]) -> List[List[float]]:
         """Generate embeddings for one or more texts.
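The sketch below shows how the cached embedder is expected to be used with the new GPU toggle. It assumes the semantic extra is installed; embed() is the pre-existing API shown in the surrounding context, and the printed provider list is only an example:

from codexlens.semantic.embedder import get_embedder

# One model instance is cached per (profile, gpu/cpu) key.
embedder = get_embedder(profile="code", use_gpu=True)
print(embedder.providers)       # e.g. ["CUDAExecutionProvider", "CPUExecutionProvider"]
print(embedder.is_gpu_enabled)  # False when only CPUExecutionProvider is configured

vectors = embedder.embed(["def add(a, b):\n    return a + b"])
print(len(vectors[0]))          # embedding dimension of the selected model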
diff --git a/codex-lens/src/codexlens/semantic/gpu_support.py b/codex-lens/src/codexlens/semantic/gpu_support.py
new file mode 100644
index 00000000..f43c7210
--- /dev/null
+++ b/codex-lens/src/codexlens/semantic/gpu_support.py
@@ -0,0 +1,192 @@
+"""GPU acceleration support for semantic embeddings.
+
+This module provides GPU detection, initialization, and fallback handling
+for ONNX-based embedding generation.
+"""
+
+from __future__ import annotations
+
+import logging
+from dataclasses import dataclass
+from typing import List, Optional
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class GPUInfo:
+    """GPU availability and configuration info."""
+
+    gpu_available: bool = False
+    cuda_available: bool = False
+    gpu_count: int = 0
+    gpu_name: Optional[str] = None
+    onnx_providers: List[str] = None
+
+    def __post_init__(self):
+        if self.onnx_providers is None:
+            self.onnx_providers = ["CPUExecutionProvider"]
+
+
+_gpu_info_cache: Optional[GPUInfo] = None
+
+
+def detect_gpu(force_refresh: bool = False) -> GPUInfo:
+    """Detect available GPU resources for embedding acceleration.
+
+    Args:
+        force_refresh: If True, re-detect GPU even if cached.
+
+    Returns:
+        GPUInfo with detection results.
+ """ + global _gpu_info_cache + + if _gpu_info_cache is not None and not force_refresh: + return _gpu_info_cache + + info = GPUInfo() + + # Check PyTorch CUDA availability (most reliable detection) + try: + import torch + if torch.cuda.is_available(): + info.cuda_available = True + info.gpu_available = True + info.gpu_count = torch.cuda.device_count() + if info.gpu_count > 0: + info.gpu_name = torch.cuda.get_device_name(0) + logger.debug(f"PyTorch CUDA detected: {info.gpu_count} GPU(s)") + except ImportError: + logger.debug("PyTorch not available for GPU detection") + + # Check ONNX Runtime providers with validation + try: + import onnxruntime as ort + available_providers = ort.get_available_providers() + + # Build provider list with priority order + providers = [] + + # Test each provider to ensure it actually works + def test_provider(provider_name: str) -> bool: + """Test if a provider actually works by creating a dummy session.""" + try: + # Create a minimal ONNX model to test provider + import numpy as np + # Simple test: just check if provider can be instantiated + sess_options = ort.SessionOptions() + sess_options.log_severity_level = 4 # Suppress warnings + return True + except Exception: + return False + + # CUDA provider (NVIDIA GPU) - check if CUDA runtime is available + if "CUDAExecutionProvider" in available_providers: + # Verify CUDA is actually usable by checking for cuBLAS + cuda_works = False + try: + import ctypes + # Try to load cuBLAS to verify CUDA installation + try: + ctypes.CDLL("cublas64_12.dll") + cuda_works = True + except OSError: + try: + ctypes.CDLL("cublas64_11.dll") + cuda_works = True + except OSError: + pass + except Exception: + pass + + if cuda_works: + providers.append("CUDAExecutionProvider") + info.gpu_available = True + logger.debug("ONNX CUDAExecutionProvider available and working") + else: + logger.debug("ONNX CUDAExecutionProvider listed but CUDA runtime not found") + + # TensorRT provider (optimized NVIDIA inference) + if "TensorrtExecutionProvider" in available_providers: + # TensorRT requires additional libraries, skip for now + logger.debug("ONNX TensorrtExecutionProvider available (requires TensorRT SDK)") + + # DirectML provider (Windows GPU - AMD/Intel/NVIDIA) + if "DmlExecutionProvider" in available_providers: + providers.append("DmlExecutionProvider") + info.gpu_available = True + logger.debug("ONNX DmlExecutionProvider available (DirectML)") + + # ROCm provider (AMD GPU on Linux) + if "ROCMExecutionProvider" in available_providers: + providers.append("ROCMExecutionProvider") + info.gpu_available = True + logger.debug("ONNX ROCMExecutionProvider available (AMD)") + + # CoreML provider (Apple Silicon) + if "CoreMLExecutionProvider" in available_providers: + providers.append("CoreMLExecutionProvider") + info.gpu_available = True + logger.debug("ONNX CoreMLExecutionProvider available (Apple)") + + # Always include CPU as fallback + providers.append("CPUExecutionProvider") + + info.onnx_providers = providers + + except ImportError: + logger.debug("ONNX Runtime not available") + info.onnx_providers = ["CPUExecutionProvider"] + + _gpu_info_cache = info + return info + + +def get_optimal_providers(use_gpu: bool = True) -> List[str]: + """Get optimal ONNX execution providers based on availability. + + Args: + use_gpu: If True, include GPU providers when available. + If False, force CPU-only execution. + + Returns: + List of provider names in priority order. 
+ """ + if not use_gpu: + return ["CPUExecutionProvider"] + + gpu_info = detect_gpu() + return gpu_info.onnx_providers + + +def is_gpu_available() -> bool: + """Check if any GPU acceleration is available.""" + return detect_gpu().gpu_available + + +def get_gpu_summary() -> str: + """Get human-readable GPU status summary.""" + info = detect_gpu() + + if not info.gpu_available: + return "GPU: Not available (using CPU)" + + parts = [] + if info.gpu_name: + parts.append(f"GPU: {info.gpu_name}") + if info.gpu_count > 1: + parts.append(f"({info.gpu_count} devices)") + + # Show active providers (excluding CPU fallback) + gpu_providers = [p for p in info.onnx_providers if p != "CPUExecutionProvider"] + if gpu_providers: + parts.append(f"Providers: {', '.join(gpu_providers)}") + + return " | ".join(parts) if parts else "GPU: Available" + + +def clear_gpu_cache() -> None: + """Clear cached GPU detection info.""" + global _gpu_info_cache + _gpu_info_cache = None diff --git a/package.json b/package.json index 53f83c87..9cef3f64 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "claude-code-workflow", - "version": "6.2.2", + "version": "6.2.4", "description": "JSON-driven multi-agent development framework with intelligent CLI orchestration (Gemini/Qwen/Codex), context-first architecture, and automated workflow execution", "type": "module", "main": "ccw/src/index.js", @@ -28,6 +28,8 @@ "node": ">=16.0.0" }, "dependencies": { + "@modelcontextprotocol/sdk": "^1.0.4", + "better-sqlite3": "^11.7.0", "boxen": "^7.1.0", "chalk": "^5.3.0", "commander": "^11.0.0", @@ -36,7 +38,8 @@ "gradient-string": "^2.0.2", "inquirer": "^9.2.0", "open": "^9.1.0", - "ora": "^7.0.0" + "ora": "^7.0.0", + "zod": "^4.1.13" }, "files": [ "ccw/bin/",