mirror of
https://github.com/catlog22/Claude-Code-Workflow.git
synced 2026-02-09 02:24:11 +08:00
- Introduced a new JSON file for verbose output of the Codex Lens search results. - Added unit tests for binary search functionality in `test_stage1_binary_search_uses_chunk_lines.py`. - Implemented regression tests for staged cascade Stage 2 expansion depth in `test_staged_cascade_lsp_depth.py`. - Created unit tests for staged cascade Stage 2 realtime LSP graph expansion in `test_staged_cascade_realtime_lsp.py`. - Enhanced the ChainSearchEngine to respect configuration settings for staged LSP depth and improve search accuracy.
416 lines
114 KiB
JSON
416 lines
114 KiB
JSON
{
|
|
"success": true,
|
|
"result": {
|
|
"query": "class Config",
|
|
"method": "cascade",
|
|
"count": 50,
|
|
"results": [
|
|
{
|
|
"path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\hybrid_search\\data_structures.py",
|
|
"score": 0.06081658330145309,
|
|
"excerpt": " @classmethod\n def from_dict(cls, data: Dict[str, Any]) -> \"CallHierarchyItem\":\n return cls(\n name=data[\"name\"],\n kind=data[\"kind\"],\n file_path=data[\"file...",
|
|
"content": " @classmethod\n def from_dict(cls, data: Dict[str, Any]) -> \"CallHierarchyItem\":\n return cls(\n name=data[\"name\"],\n kind=data[\"kind\"],\n file_path=data[\"file_path\"],\n range=Range.from_dict(data[\"range\"]),\n detail=data.get(\"detail\"),\n )\n\n\n@dataclass\nclass CodeSymbolNode:\n\n id: str\n name: str\n kind: str\n file_path: str\n range: Range\n embedding: Optional[List[float]] = None\n raw_code: str = \"\"\n docstring: str = \"\"\n score: float = 0.0\n\n def __post_init__(self) -> None:\n if not self.id:\n raise ValueError(\"id cannot be empty\")\n if not self.name:\n raise ValueError(\"name cannot be empty\")\n if not self.kind:\n raise ValueError(\"kind cannot be empty\")\n if not self.file_path:\n raise ValueError(\"file_path cannot be empty\")\n\n def __hash__(self) -> int:\n return hash(self.id)\n\n def __eq__(self, other: object) -> bool:\n if not isinstance(other, CodeSymbolNode):\n return False\n return self.id == other.id\n\n def to_dict(self) -> Dict[str, Any]:\n",
|
|
"source": null,
|
|
"symbol": null
|
|
},
|
|
{
|
|
"path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py",
|
|
"score": 0.056576452190618645,
|
|
"excerpt": "from rich.table import Table\n\nfrom codexlens.config import Config\nfrom codexlens.entities import IndexedFile, SearchResult, Symbol\nfrom codexlens.errors import CodexLensError, ConfigError, ParseError,...",
|
|
"content": "import os\nimport shutil\nimport sqlite3\nfrom pathlib import Path\nfrom typing import Annotated, Any, Dict, Iterable, List, Optional\n\nimport typer\nfrom rich.progress import BarColumn, Progress, SpinnerColumn, TextColumn, TimeElapsedColumn\nfrom rich.table import Table\n\nfrom codexlens.config import Config\nfrom codexlens.entities import IndexedFile, SearchResult, Symbol\nfrom codexlens.errors import CodexLensError, ConfigError, ParseError, StorageError, SearchError\nfrom codexlens.parsers.factory import ParserFactory\nfrom codexlens.storage.path_mapper import PathMapper\nfrom codexlens.storage.registry import RegistryStore, ProjectInfo\nfrom codexlens.storage.index_tree import IndexTreeBuilder\nfrom codexlens.storage.dir_index import DirIndexStore\nfrom codexlens.search.chain_search import ChainSearchEngine, SearchOptions\nfrom codexlens.watcher import WatcherManager, WatcherConfig\n",
|
|
"source": null,
|
|
"symbol": null
|
|
},
|
|
{
|
|
"path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\config.py",
|
|
"score": 0.05655744432847353,
|
|
"excerpt": "\"\"\"Configuration system for CodexLens.\"\"\"\n\nfrom __future__ import annotations",
|
|
"content": "\"\"\"Configuration system for CodexLens.\"\"\"\n\nfrom __future__ import annotations\n\nimport json\nimport logging\nimport os\nfrom dataclasses import dataclass, field\nfrom functools import cached_property\nfrom pathlib import Path\nfrom typing import Any, Dict, List, Optional",
|
|
"source": null,
|
|
"symbol": null
|
|
},
|
|
{
|
|
"path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\search\\chain_search.py",
|
|
"score": 0.049219375000264694,
|
|
"excerpt": "\nfrom concurrent.futures import ThreadPoolExecutor, as_completed\nfrom dataclasses import dataclass, field\nfrom pathlib import Path\nfrom typing import List, Optional, Dict, Any, Literal, Tuple, TYPE_CH...",
|
|
"content": "\"\"\"Chain search engine for recursive multi-directory searching.\n\nProvides parallel search across directory hierarchies using indexed _index.db files.\nSupports depth-limited traversal, result aggregation, and symbol search.\n\"\"\"\n\nfrom __future__ import annotations\n\nfrom concurrent.futures import ThreadPoolExecutor, as_completed\nfrom dataclasses import dataclass, field\nfrom pathlib import Path\nfrom typing import List, Optional, Dict, Any, Literal, Tuple, TYPE_CHECKING\nimport json\nimport logging\nimport os\nimport time\n\nfrom codexlens.entities import SearchResult, Symbol\n\nif TYPE_CHECKING:",
|
|
"source": null,
|
|
"symbol": null
|
|
},
|
|
{
|
|
"path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\indexing\\embedding.py",
|
|
"score": 0.047931429239828446,
|
|
"excerpt": " def __init__(\n self,\n model_name: Optional[str] = None,\n use_gpu: bool = True,\n expand_dim: bool = True,\n ) -> None:\n from codexlens.semantic import SEMANTIC_...",
|
|
"content": " def __init__(\n self,\n model_name: Optional[str] = None,\n use_gpu: bool = True,\n expand_dim: bool = True,\n ) -> None:\n from codexlens.semantic import SEMANTIC_AVAILABLE\n\n if not SEMANTIC_AVAILABLE:\n raise ImportError(\n \"Semantic search dependencies not available. \"\n \"Install with: pip install codexlens[semantic]\"\n )\n\n self._model_name = model_name or self.DEFAULT_MODEL\n self._use_gpu = use_gpu\n self._expand_dim = expand_dim\n self._model = None\n self._native_dim: Optional[int] = None\n\n \n self._expansion_matrix: Optional[np.ndarray] = None\n\n @property\n def model_name(self) -> str:\n return self._model_name\n\n @property\n def embedding_dim(self) -> int:\n if self._expand_dim:\n return self.TARGET_DIM\n \n if self._native_dim is not None:\n return self._native_dim\n \n model_dims = {\n \"BAAI/bge-large-en-v1.5\": 1024,\n \"BAAI/bge-base-en-v1.5\": 768,\n \"BAAI/bge-small-en-v1.5\": 384,\n \"intfloat/multilingual-e5-large\": 1024,\n }\n return model_dims.get(self._model_name, 1024)\n\n @property\n def max_tokens(self) -> int:\n return 512 \n\n",
|
|
"source": null,
|
|
"symbol": null
|
|
},
|
|
{
|
|
"path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\rotational_embedder.py",
|
|
"score": 0.04283104206542711,
|
|
"excerpt": "import threading\nimport time\nfrom dataclasses import dataclass, field\nfrom enum import Enum\nfrom typing import Any, Dict, Iterable, List, Optional",
|
|
"content": "Provides intelligent load balancing across multiple LiteLLM embedding endpoints\nto maximize throughput while respecting rate limits.\n\"\"\"\n\nfrom __future__ import annotations\n\nimport logging\nimport random\nimport threading\nimport time\nfrom dataclasses import dataclass, field\nfrom enum import Enum\nfrom typing import Any, Dict, Iterable, List, Optional\n\nimport numpy as np\n\nfrom .base import BaseEmbedder\n\nlogger = logging.getLogger(__name__)\n\n",
|
|
"source": null,
|
|
"symbol": null
|
|
},
|
|
{
|
|
"path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\lsp\\standalone_manager.py",
|
|
"score": 0.036886112765573215,
|
|
"excerpt": "- Direct subprocess spawning of language servers\n- JSON-RPC 2.0 communication over stdin/stdout\n- Multi-language support via configuration file (lsp-servers.json)\n- Process lifecycle management with a...",
|
|
"content": "\"\"\"Standalone Language Server Manager for direct LSP communication.\n\nThis module provides direct communication with language servers via JSON-RPC over stdio,\neliminating the need for VSCode Bridge. Similar to cclsp architecture.\n\nFeatures:\n- Direct subprocess spawning of language servers\n- JSON-RPC 2.0 communication over stdin/stdout\n- Multi-language support via configuration file (lsp-servers.json)\n- Process lifecycle management with auto-restart\n- Compatible interface with existing LspBridge\n\"\"\"\n\nfrom __future__ import annotations\n\nimport asyncio\nimport json\nimport logging\nimport os",
|
|
"source": null,
|
|
"symbol": null
|
|
},
|
|
{
|
|
"path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\api\\models.py",
|
|
"score": 0.03457410829143062,
|
|
"excerpt": " container: Containing class/module (if any)\n score: Match score for ranking\n return {k: v for k, v in asdict(self).items() if v is not None}\n\n\n# =================================...",
|
|
"content": " container: Containing class/module (if any)\n score: Match score for ranking\n return {k: v for k, v in asdict(self).items() if v is not None}\n\n\n# =============================================================================\n# Section 4.4: find_references dataclasses\n# =============================================================================\n\n@dataclass\nclass ReferenceResult:\n file_path: str\n line: int\n column: int\n context_line: str\n relationship: str # call | import | type_annotation | inheritance\n\n def to_dict(self) -> dict:\n return asdict(self)\n\n\n@dataclass\nclass GroupedReferences:\n definition: DefinitionResult\n references: List[ReferenceResult] = field(default_factory=list)\n\n def to_dict(self) -> dict:\n return {\n \"definition\": self.definition.to_dict(),\n \"references\": [r.to_dict() for r in self.references],\n }\n\n\n",
|
|
"source": null,
|
|
"symbol": null
|
|
},
|
|
{
|
|
"path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\parsers\\treesitter_parser.py",
|
|
"score": 0.03341093379138448,
|
|
"excerpt": "\n if TREE_SITTER_AVAILABLE:\n self._initialize_parser()\n\n def _initialize_parser(self) -> None:\n if TreeSitterParser is None or TreeSitterLanguage is None:\n retur...",
|
|
"content": "\n if TREE_SITTER_AVAILABLE:\n self._initialize_parser()\n\n def _initialize_parser(self) -> None:\n if TreeSitterParser is None or TreeSitterLanguage is None:\n return\n\n try:\n \n if self.language_id == \"python\":\n import tree_sitter_python\n self._language = TreeSitterLanguage(tree_sitter_python.language())\n elif self.language_id == \"javascript\":\n import tree_sitter_javascript\n self._language = TreeSitterLanguage(tree_sitter_javascript.language())\n elif self.language_id == \"typescript\":\n import tree_sitter_typescript\n \n if self.path is not None and self.path.suffix.lower() == \".tsx\":\n self._language = TreeSitterLanguage(tree_sitter_typescript.language_tsx())\n else:\n self._language = TreeSitterLanguage(tree_sitter_typescript.language_typescript())\n else:\n return\n\n \n self._parser = TreeSitterParser()\n if hasattr(self._parser, \"set_language\"):\n self._parser.set_language(self._language) \n else:\n self._parser.language = self._language \n\n except Exception:\n \n self._parser = None\n self._language = None\n\n def is_available(self) -> bool:\n return self._parser is not None and self._language is not None\n\n def _parse_tree(self, text: str) -> Optional[tuple[bytes, TreeSitterNode]]:\n if not self.is_available() or self._parser is None:\n",
|
|
"source": null,
|
|
"symbol": null
|
|
},
|
|
{
|
|
"path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\watcher\\incremental_indexer.py",
|
|
"score": 0.029568673189485736,
|
|
"excerpt": "\nimport logging\nfrom dataclasses import dataclass\nfrom pathlib import Path\nfrom typing import List, Optional",
|
|
"content": "\"\"\"Incremental indexer for processing file changes.\"\"\"\n\nfrom __future__ import annotations\n\nimport logging\nfrom dataclasses import dataclass\nfrom pathlib import Path\nfrom typing import List, Optional\n\nfrom codexlens.config import Config\nfrom codexlens.parsers.factory import ParserFactory\nfrom codexlens.storage.dir_index import DirIndexStore\nfrom codexlens.storage.global_index import GlobalSymbolIndex\nfrom codexlens.storage.path_mapper import PathMapper\nfrom codexlens.storage.registry import RegistryStore\n",
|
|
"source": null,
|
|
"symbol": null
|
|
},
|
|
{
|
|
"path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\frequency_strategy.py",
|
|
"score": 0.029334400167733504,
|
|
"excerpt": "\nfrom collections import defaultdict\nfrom dataclasses import dataclass\nfrom typing import TYPE_CHECKING, Dict, List, Optional, Literal\n",
|
|
"content": "\nUse cases:\n- Prioritize commonly called methods/functions\n- Filter out one-off results that may be less relevant\n- Deduplicate results pointing to the same symbol from different locations\n\"\"\"\n\nfrom __future__ import annotations\n\nfrom collections import defaultdict\nfrom dataclasses import dataclass\nfrom typing import TYPE_CHECKING, Dict, List, Optional, Literal\n\nfrom .base import BaseClusteringStrategy, ClusteringConfig\n\nif TYPE_CHECKING:\n import numpy as np\n from codexlens.entities import SearchResult\n\n\n@dataclass",
|
|
"source": null,
|
|
"symbol": null
|
|
},
|
|
{
|
|
"path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\search\\association_tree\\data_structures.py",
|
|
"score": 0.027925539288870704,
|
|
"excerpt": "\n def __len__(self) -> int:\n return len(self.all_nodes)\n\n def __repr__(self) -> str:\n return (\n f\"CallTree(roots={len(self.roots)}, nodes={len(self.all_nodes)}, \"\n ...",
|
|
"content": "\n def __len__(self) -> int:\n return len(self.all_nodes)\n\n def __repr__(self) -> str:\n return (\n f\"CallTree(roots={len(self.roots)}, nodes={len(self.all_nodes)}, \"\n f\"depth={self.depth_reached})\"\n )\n\n\n@dataclass\nclass UniqueNode:\n\n file_path: str\n name: str\n kind: str\n range: Range\n min_depth: int = 0\n occurrences: int = 1\n paths: List[List[str]] = field(default_factory=list)\n context_nodes: List[str] = field(default_factory=list)\n score: float = 0.0\n\n @property\n def node_key(self) -> tuple[str, int, int]:\n return (\n self.file_path,\n self.range.start_line,\n self.range.end_line,\n )\n\n def add_path(self, path: List[str]) -> None:\n",
|
|
"source": null,
|
|
"symbol": null
|
|
},
|
|
{
|
|
"path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\storage\\path_mapper.py",
|
|
"score": 0.024369821963687643,
|
|
"excerpt": "def _get_configured_index_root() -> Path:\n \"\"\"Get the index root from environment or config file.\n",
|
|
"content": "def _get_configured_index_root() -> Path:\n \"\"\"Get the index root from environment or config file.\n\n Priority order:\n 1. CODEXLENS_INDEX_DIR environment variable\n 2. index_dir from ~/.codexlens/config.json\n 3. Default: ~/.codexlens/indexes\n \"\"\"\n env_override = os.getenv(\"CODEXLENS_INDEX_DIR\")\n if env_override:\n return Path(env_override).expanduser().resolve()\n\n config_file = Path.home() / \".codexlens\" / \"config.json\"\n if config_file.exists():\n try:\n cfg = json.loads(config_file.read_text(encoding=\"utf-8\"))\n if \"index_dir\" in cfg:\n return Path(cfg[\"index_dir\"]).expanduser().resolve()\n except (json.JSONDecodeError, OSError):\n pass\n\n return Path.home() / \".codexlens\" / \"indexes\"",
|
|
"source": null,
|
|
"symbol": null
|
|
},
|
|
{
|
|
"path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py",
|
|
"score": 0.023949795081967214,
|
|
"excerpt": "class VectorStore:\n \"\"\"SQLite-based vector storage with HNSW-accelerated similarity search.\n",
|
|
"content": "class VectorStore:\n \"\"\"SQLite-based vector storage with HNSW-accelerated similarity search.\n\n Performance optimizations:\n - HNSW index for O(log N) approximate nearest neighbor search\n - Embedding matrix cached in memory for batch similarity computation (fallback)\n - NumPy vectorized operations instead of Python loops (fallback)\n - Lazy content loading - only fetch full content for top-k results\n - Thread-safe cache invalidation\n - Bulk insert mode for efficient batch operations\n \"\"\"\n\n # Default embedding dimension (used when creating new index)\n DEFAULT_DIM = 768\n\n def __init__(self, db_path: str | Path) -> None:\n if not NUMPY_AVAILABLE:\n raise ImportError(\n \"Semantic search dependencies not available. \"\n \"Install with: pip install codexlens[semantic]\"\n )\n\n self.db_path = Path(db_path)\n self.db_path.parent.mkdir(parents=True, exist_ok=True)\n\n # Embedding cache for fast similarity search (fallback)\n self._cache_lock = threading.RLock()\n self._embedding_matrix: Optional[np.ndarray] = None\n self._embedding_norms: Optional[np.ndarray] = None\n self._chunk_ids: Optional[List[int]] = None\n self._cache_version: int = 0\n\n # ANN index for O(log N) search\n self._ann_index: Optional[ANNIndex] = None\n self._ann_dim: Optional[int] = None\n self._ann_write_lock = threading.Lock() # Protects ANN index modifications\n\n # Bulk insert mode tracking\n self._bulk_insert_mode: bool = False\n self._bulk_insert_ids: List[int] = []\n self._bulk_insert_embeddings: List[np.ndarray] = []\n\n self._init_schema()\n self._init_ann_index()\n\n def _init_schema(self) -> None:\n \"\"\"Initialize vector storage schema.\"\"\"\n with sqlite3.connect(self.db_path) as conn:\n # Enable memory mapping for faster reads\n conn.execute(\"PRAGMA mmap_size = 30000000000\") # 30GB limit\n conn.execute(\"\"\"\n CREATE TABLE IF NOT EXISTS semantic_chunks (\n id INTEGER PRIMARY KEY AUTOINCREMENT,\n file_path TEXT NOT NULL,\n content TEXT NOT NULL,\n embedding BLOB NOT NULL,\n metadata TEXT,\n category TEXT DEFAULT 'code',\n created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP\n )\n \"\"\")\n conn.execute(\"\"\"\n CREATE INDEX IF NOT EXISTS idx_chunks_file\n ON semantic_chunks(file_path)\n \"\"\")\n conn.execute(\"\"\"\n CREATE INDEX IF NOT EXISTS idx_chunks_category\n ON semantic_chunks(category)\n \"\"\")\n # Model configuration table - tracks which model generated the embeddings\n conn.execute(\"\"\"\n CREATE TABLE IF NOT EXISTS embeddings_config (\n id INTEGER PRIMARY KEY CHECK (id = 1),\n model_profile TEXT NOT NULL,\n model_name TEXT NOT NULL,\n embedding_dim INTEGER NOT NULL,\n backend TEXT NOT NULL DEFAULT 'fastembed',\n created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,\n updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP\n )\n \"\"\")\n\n # Migration: Add backend column to existing tables\n self._migrate_backend_column(conn)\n # Migration: Add category column\n self._migrate_category_column(conn)\n\n conn.commit()\n\n def _migrate_backend_column(self, conn: sqlite3.Connection) -> None:\n \"\"\"Add backend column to existing embeddings_config table if not present.\n\n Args:\n conn: Active SQLite connection\n \"\"\"\n # Check if backend column exists\n cursor = conn.execute(\"PRAGMA table_info(embeddings_config)\")\n columns = [row[1] for row in cursor.fetchall()]\n\n if 'backend' not in columns:\n logger.info(\"Migrating embeddings_config table: adding backend column\")\n conn.execute(\"\"\"\n ALTER TABLE embeddings_config\n ADD COLUMN backend TEXT NOT NULL DEFAULT 'fastembed'\n \"\"\")\n\n def _migrate_category_column(self, conn: sqlite3.Connection) -> None:\n \"\"\"Add category column to existing semantic_chunks table if not present.\n\n Args:\n conn: Active SQLite connection\n \"\"\"\n # Check if category column exists\n cursor = conn.execute(\"PRAGMA table_info(semantic_chunks)\")\n columns = [row[1] for row in cursor.fetchall()]\n\n if 'category' not in columns:\n logger.info(\"Migrating semantic_chunks table: adding category column\")\n conn.execute(\"\"\"\n ALTER TABLE semantic_chunks\n ADD COLUMN category TEXT DEFAULT 'code'\n \"\"\")\n # Create index for fast category filtering\n conn.execute(\"\"\"\n CREATE INDEX IF NOT EXISTS idx_chunks_category\n ON semantic_chunks(category)\n \"\"\")\n\n def _init_ann_index(self) -> None:\n \"\"\"Initialize ANN index (lazy loading from existing data).\"\"\"\n if not HNSWLIB_AVAILABLE:\n logger.debug(\"hnswlib not available, using brute-force search\")\n return\n\n # Try to detect embedding dimension from existing data\n dim = self._detect_embedding_dim()\n if dim is None:\n # No data yet, will initialize on first add\n logger.debug(\"No embeddings found, ANN index will be created on first add\")\n return\n\n self._ann_dim = dim\n\n try:\n self._ann_index = ANNIndex(self.db_path, dim)\n if self._ann_index.load():\n logger.debug(\n \"Loaded ANN index with %d vectors\", self._ann_index.count()\n )\n else:\n # Index file doesn't exist, try to build from SQLite data\n logger.debug(\"ANN index file not found, rebuilding from SQLite\")\n self._rebuild_ann_index_internal()\n except Exception as e:\n logger.warning(\"Failed to initialize ANN index: %s\", e)\n self._ann_index = None\n\n def _detect_embedding_dim(self) -> Optional[int]:\n \"\"\"Detect embedding dimension from existing data.\"\"\"\n with sqlite3.connect(self.db_path) as conn:\n row = conn.execute(\n \"SELECT embedding FROM semantic_chunks LIMIT 1\"\n ).fetchone()\n if row and row[0]:\n # Embedding is stored as float32 blob\n blob = row[0]\n return len(blob) // np.dtype(np.float32).itemsize\n return None\n\n @property\n def dimension(self) -> Optional[int]:\n \"\"\"Return the dimension of embeddings in the store.\n\n Returns:\n Embedding dimension if available, None if store is empty.\n \"\"\"\n if self._ann_dim is not None:\n return self._ann_dim\n self._ann_dim = self._detect_embedding_dim()\n return self._ann_dim\n\n def _rebuild_ann_index_internal(self) -> int:\n \"\"\"Internal method to rebuild ANN index from SQLite data.\"\"\"\n if self._ann_index is None:\n return 0\n\n with sqlite3.connect(self.db_path) as conn:\n conn.execute(\"PRAGMA mmap_size = 30000000000\")\n rows = conn.execute(\n \"SELECT id, embedding FROM semantic_chunks\"\n ).fetchall()\n\n if not rows:\n return 0\n\n # Extract IDs and embeddings\n ids = [r[0] for r in rows]\n embeddings = np.vstack([\n np.frombuffer(r[1], dtype=np.float32) for r in rows\n ])\n\n # Add to ANN index\n self._ann_index.add_vectors(ids, embeddings)\n self._ann_index.save()\n\n logger.info(\"Rebuilt ANN index with %d vectors\", len(ids))\n return len(ids)\n\n def rebuild_ann_index(self) -> int:\n \"\"\"Rebuild HNSW index from all chunks in SQLite.\n\n Use this method to:\n - Migrate existing data to use ANN search\n - Repair corrupted index\n - Reclaim space after many deletions\n\n Returns:\n Number of vectors indexed.\n \"\"\"\n if not HNSWLIB_AVAILABLE:\n logger.warning(\"hnswlib not available, cannot rebuild ANN index\")\n return 0\n\n # Detect dimension\n dim = self._detect_embedding_dim()\n if dim is None:\n logger.warning(\"No embeddings found, cannot rebuild ANN index\")\n return 0\n\n self._ann_dim = dim\n\n # Create new index\n try:\n self._ann_index = ANNIndex(self.db_path, dim)\n return self._rebuild_ann_index_internal()\n except Exception as e:\n logger.error(\"Failed to rebuild ANN index: %s\", e)\n self._ann_index = None\n return 0\n\n def _invalidate_cache(self) -> None:\n \"\"\"Invalidate the embedding cache (thread-safe).\"\"\"\n with self._cache_lock:\n self._embedding_matrix = None\n self._embedding_norms = None\n self._chunk_ids = None\n self._cache_version += 1\n\n def _refresh_cache(self) -> bool:\n \"\"\"Load embeddings into numpy matrix for fast similarity search.\n\n Returns:\n True if cache was refreshed successfully, False if no data.\n \"\"\"\n with self._cache_lock:\n with sqlite3.connect(self.db_path) as conn:\n conn.execute(\"PRAGMA mmap_size = 30000000000\")\n rows = conn.execute(\n \"SELECT id, embedding FROM semantic_chunks\"\n ).fetchall()\n\n if not rows:\n self._embedding_matrix = None\n self._embedding_norms = None\n self._chunk_ids = None\n return False\n\n # Extract IDs and embeddings\n self._chunk_ids = [r[0] for r in rows]\n\n # Bulk convert binary blobs to numpy matrix\n embeddings = [\n np.frombuffer(r[1], dtype=np.float32) for r in rows\n ]\n self._embedding_matrix = np.vstack(embeddings)\n\n # Pre-compute norms for faster similarity calculation\n self._embedding_norms = np.linalg.norm(\n self._embedding_matrix, axis=1, keepdims=True\n )\n # Avoid division by zero\n self._embedding_norms = np.where(\n self._embedding_norms == 0, EPSILON, self._embedding_norms\n )\n\n return True\n\n def _ensure_ann_index(self, dim: int) -> bool:\n \"\"\"Ensure ANN index is initialized with correct dimension.\n\n This method is thread-safe and uses double-checked locking.\n\n Args:\n dim: Embedding dimension\n\n Returns:\n True if ANN index is ready, False otherwise\n \"\"\"\n if not HNSWLIB_AVAILABLE:\n return False\n\n # Fast path: index already initialized (no lock needed)\n if self._ann_index is not None:\n return True\n\n # Slow path: acquire lock for initialization\n with self._ann_write_lock:\n # Double-check after acquiring lock\n if self._ann_index is not None:\n return True\n\n try:\n self._ann_dim = dim\n self._ann_index = ANNIndex(self.db_path, dim)\n self._ann_index.load() # Try to load existing\n return True\n except Exception as e:\n logger.warning(\"Failed to initialize ANN index: %s\", e)\n self._ann_index = None\n return False\n\n def add_chunk(\n self, chunk: SemanticChunk, file_path: str, category: str = \"code\"\n ) -> int:\n \"\"\"Add a single chunk with its embedding.\n\n Args:\n chunk: SemanticChunk with embedding\n file_path: Path to the source file\n category: File category ('code' or 'doc'), default 'code'\n\n Returns:\n The inserted chunk ID.\n \"\"\"\n if chunk.embedding is None:\n raise ValueError(\"Chunk must have embedding before adding to store\")\n\n embedding_arr = np.array(chunk.embedding, dtype=np.float32)\n embedding_blob = embedding_arr.tobytes()\n metadata_json = json.dumps(chunk.metadata) if chunk.metadata else None\n\n with sqlite3.connect(self.db_path) as conn:\n cursor = conn.execute(\n \"\"\"\n INSERT INTO semantic_chunks (file_path, content, embedding, metadata, category)\n VALUES (?, ?, ?, ?, ?)\n \"\"\",\n (file_path, chunk.content, embedding_blob, metadata_json, category)\n )\n conn.commit()\n chunk_id = cursor.lastrowid or 0\n\n # Add to ANN index\n if self._ensure_ann_index(len(chunk.embedding)):\n with self._ann_write_lock:\n try:\n self._ann_index.add_vectors([chunk_id], embedding_arr.reshape(1, -1))\n self._ann_index.save()\n except Exception as e:\n logger.warning(\"Failed to add to ANN index: %s\", e)\n\n # Invalidate cache after modification\n self._invalidate_cache()\n return chunk_id\n\n def add_chunks(\n self, chunks: List[SemanticChunk], file_path: str, category: str = \"code\"\n ) -> List[int]:\n \"\"\"Add multiple chunks with embeddings (batch insert).\n\n Args:\n chunks: List of SemanticChunk objects with embeddings\n file_path: Path to the source file\n category: File category ('code' or 'doc'), default 'code'\n\n Returns:\n List of inserted chunk IDs.\n \"\"\"\n if not chunks:\n return []\n\n # Prepare batch data\n batch_data = []\n embeddings_list = []\n for chunk in chunks:\n if chunk.embedding is None:\n raise ValueError(\"All chunks must have embeddings\")\n embedding_arr = np.array(chunk.embedding, dtype=np.float32)\n embedding_blob = embedding_arr.tobytes()\n metadata_json = json.dumps(chunk.metadata) if chunk.metadata else None\n batch_data.append((file_path, chunk.content, embedding_blob, metadata_json, category))\n embeddings_list.append(embedding_arr)\n\n # Batch insert to SQLite\n with sqlite3.connect(self.db_path) as conn:\n # Get starting ID before insert\n row = conn.execute(\"SELECT MAX(id) FROM semantic_chunks\").fetchone()\n start_id = (row[0] or 0) + 1\n\n conn.executemany(\n \"\"\"\n INSERT INTO semantic_chunks (file_path, content, embedding, metadata, category)\n VALUES (?, ?, ?, ?, ?)\n \"\"\",\n batch_data\n )\n conn.commit()\n # Calculate inserted IDs based on starting ID\n ids = list(range(start_id, start_id + len(chunks)))\n\n # Add to ANN index\n if embeddings_list and self._ensure_ann_index(len(embeddings_list[0])):\n with self._ann_write_lock:\n try:\n embeddings_matrix = np.vstack(embeddings_list)\n self._ann_index.add_vectors(ids, embeddings_matrix)\n self._ann_index.save()\n except Exception as e:\n logger.warning(\"Failed to add batch to ANN index: %s\", e)\n\n # Invalidate cache after modification\n self._invalidate_cache()\n return ids\n\n def add_chunks_batch(\n self,\n chunks_with_paths: List[Tuple[SemanticChunk, str]],\n update_ann: bool = True,\n auto_save_ann: bool = True,\n categories: Optional[List[str]] = None,\n ) -> List[int]:\n \"\"\"Batch insert chunks from multiple files in a single transaction.\n\n This method is optimized for bulk operations during index generation.\n\n Args:\n chunks_with_paths: List of (chunk, file_path) tuples\n update_ann: If True, update ANN index with new vectors (default: True)\n auto_save_ann: If True, save ANN index after update (default: True).\n Set to False for bulk inserts to reduce I/O overhead.\n categories: Optional list of categories per chunk. If None, defaults to 'code'.\n If provided, must match length of chunks_with_paths.\n\n Returns:\n List of inserted chunk IDs\n \"\"\"\n if not chunks_with_paths:\n return []\n\n batch_size = len(chunks_with_paths)\n\n # Validate categories if provided\n if categories is not None and len(categories) != batch_size:\n raise ValueError(\n f\"categories length ({len(categories)}) must match \"\n f\"chunks_with_paths length ({batch_size})\"\n )\n\n # Prepare batch data\n batch_data = []\n embeddings_list = []\n for i, (chunk, file_path) in enumerate(chunks_with_paths):\n if chunk.embedding is None:\n raise ValueError(\"All chunks must have embeddings\")\n # Optimize: avoid repeated np.array() if already numpy\n if isinstance(chunk.embedding, np.ndarray):\n embedding_arr = chunk.embedding.astype(np.float32)\n else:\n embedding_arr = np.array(chunk.embedding, dtype=np.float32)\n embedding_blob = embedding_arr.tobytes()\n metadata_json = json.dumps(chunk.metadata) if chunk.metadata else None\n category = categories[i] if categories else \"code\"\n batch_data.append((file_path, chunk.content, embedding_blob, metadata_json, category))\n embeddings_list.append(embedding_arr)\n\n # Batch insert to SQLite in single transaction\n with sqlite3.connect(self.db_path) as conn:\n # Get starting ID before insert\n row = conn.execute(\"SELECT MAX(id) FROM semantic_chunks\").fetchone()\n start_id = (row[0] or 0) + 1\n\n _validate_chunk_id_range(start_id, batch_size)\n\n conn.executemany(\n \"\"\"\n INSERT INTO semantic_chunks (file_path, content, embedding, metadata, category)\n VALUES (?, ?, ?, ?, ?)\n \"\"\",\n batch_data\n )\n conn.commit()\n # Calculate inserted IDs based on starting ID\n ids = list(range(start_id, start_id + batch_size))\n\n # Handle ANN index updates\n if embeddings_list and update_ann and self._ensure_ann_index(len(embeddings_list[0])):\n with self._ann_write_lock:\n # In bulk insert mode, accumulate for later batch update\n if self._bulk_insert_mode:\n self._bulk_insert_ids.extend(ids)\n self._bulk_insert_embeddings.extend(embeddings_list)\n else:\n # Normal mode: update immediately\n try:\n embeddings_matrix = np.vstack(embeddings_list)\n self._ann_index.add_vectors(ids, embeddings_matrix)\n if auto_save_ann:\n self._ann_index.save()\n except Exception as e:\n logger.warning(\"Failed to add batch to ANN index: %s\", e)\n\n # Invalidate cache after modification\n self._invalidate_cache()\n return ids\n\n def add_chunks_batch_numpy(\n self,\n chunks_with_paths: List[Tuple[SemanticChunk, str]],\n embeddings_matrix: np.ndarray,\n update_ann: bool = True,\n auto_save_ann: bool = True,\n categories: Optional[List[str]] = None,\n ) -> List[int]:\n \"\"\"Batch insert chunks with pre-computed numpy embeddings matrix.\n\n This method accepts embeddings as a numpy matrix to avoid list->array conversions.\n Useful when embeddings are already in numpy format from batch encoding.\n\n Args:\n chunks_with_paths: List of (chunk, file_path) tuples (embeddings can be None)\n embeddings_matrix: Pre-computed embeddings as (N, D) numpy array\n update_ann: If True, update ANN index with new vectors (default: True)\n auto_save_ann: If True, save ANN index after update (default: True)\n categories: Optional list of categories per chunk. If None, defaults to 'code'.\n\n Returns:\n List of inserted chunk IDs\n \"\"\"\n if not chunks_with_paths:\n return []\n\n batch_size = len(chunks_with_paths)\n\n if len(chunks_with_paths) != embeddings_matrix.shape[0]:\n raise ValueError(\n f\"Mismatch: {len(chunks_with_paths)} chunks but \"\n f\"{embeddings_matrix.shape[0]} embeddings\"\n )\n\n # Validate categories if provided\n if categories is not None and len(categories) != batch_size:\n raise ValueError(\n f\"categories length ({len(categories)}) must match \"\n f\"chunks_with_paths length ({batch_size})\"\n )\n\n # Ensure float32 format\n embeddings_matrix = embeddings_matrix.astype(np.float32)\n\n # Prepare batch data\n batch_data = []\n for i, (chunk, file_path) in enumerate(chunks_with_paths):\n embedding_arr = embeddings_matrix[i]\n embedding_blob = embedding_arr.tobytes()\n metadata_json = json.dumps(chunk.metadata) if chunk.metadata else None\n category = categories[i] if categories else \"code\"\n batch_data.append((file_path, chunk.content, embedding_blob, metadata_json, category))\n\n # Batch insert to SQLite in single transaction\n with sqlite3.connect(self.db_path) as conn:\n # Get starting ID before insert\n row = conn.execute(\"SELECT MAX(id) FROM semantic_chunks\").fetchone()\n start_id = (row[0] or 0) + 1\n\n _validate_chunk_id_range(start_id, batch_size)\n\n conn.executemany(\n \"\"\"\n INSERT INTO semantic_chunks (file_path, content, embedding, metadata, category)\n VALUES (?, ?, ?, ?, ?)\n \"\"\",\n batch_data\n )\n conn.commit()\n # Calculate inserted IDs based on starting ID\n ids = list(range(start_id, start_id + batch_size))\n\n # Handle ANN index updates\n if update_ann and self._ensure_ann_index(embeddings_matrix.shape[1]):\n with self._ann_write_lock:\n # In bulk insert mode, accumulate for later batch update\n if self._bulk_insert_mode:\n self._bulk_insert_ids.extend(ids)\n # Split matrix into individual arrays for accumulation\n self._bulk_insert_embeddings.extend([embeddings_matrix[i] for i in range(len(ids))])\n else:\n # Normal mode: update immediately\n try:\n self._ann_index.add_vectors(ids, embeddings_matrix)\n if auto_save_ann:\n self._ann_index.save()\n except Exception as e:\n logger.warning(\"Failed to add batch to ANN index: %s\", e)\n\n # Invalidate cache after modification\n self._invalidate_cache()\n return ids\n\n def begin_bulk_insert(self) -> None:\n \"\"\"Begin bulk insert mode - disable ANN auto-update for better performance.\n\n Usage:\n store.begin_bulk_insert()\n try:\n for batch in batches:\n store.add_chunks_batch(batch, auto_save_ann=False)\n finally:\n store.end_bulk_insert()\n\n Or use context manager:\n with store.bulk_insert():\n for batch in batches:\n store.add_chunks_batch(batch)\n \"\"\"\n with self._ann_write_lock:\n self._bulk_insert_mode = True\n self._bulk_insert_ids.clear()\n self._bulk_insert_embeddings.clear()\n logger.debug(\"Entered bulk insert mode\")\n\n def end_bulk_insert(self) -> None:\n \"\"\"End bulk insert mode and rebuild ANN index from accumulated data.\n\n This method should be called after all bulk inserts are complete to\n update the ANN index in a single batch operation.\n \"\"\"\n with self._ann_write_lock:\n if not self._bulk_insert_mode:\n logger.warning(\"end_bulk_insert called but not in bulk insert mode\")\n return\n\n self._bulk_insert_mode = False\n bulk_ids = list(self._bulk_insert_ids)\n bulk_embeddings = list(self._bulk_insert_embeddings)\n self._bulk_insert_ids.clear()\n self._bulk_insert_embeddings.clear()\n\n # Update ANN index with accumulated data.\n if bulk_ids and bulk_embeddings:\n if self._ensure_ann_index(len(bulk_embeddings[0])):\n with self._ann_write_lock:\n try:\n embeddings_matrix = np.vstack(bulk_embeddings)\n self._ann_index.add_vectors(bulk_ids, embeddings_matrix)\n self._ann_index.save()\n logger.info(\n \"Bulk insert complete: added %d vectors to ANN index\",\n len(bulk_ids),\n )\n except Exception as e:\n logger.error(\"Failed to update ANN index after bulk insert: %s\", e)\n\n logger.debug(\"Exited bulk insert mode\")\n\n class BulkInsertContext:\n \"\"\"Context manager for bulk insert operations.\"\"\"\n\n def __init__(self, store: \"VectorStore\") -> None:\n self.store = store\n\n def __enter__(self) -> \"VectorStore\":\n self.store.begin_bulk_insert()\n return self.store\n\n def __exit__(self, exc_type, exc_val, exc_tb) -> None:\n self.store.end_bulk_insert()\n\n def bulk_insert(self) -> \"VectorStore.BulkInsertContext\":\n \"\"\"Return a context manager for bulk insert operations.\n\n Usage:\n with store.bulk_insert():\n for batch in batches:\n store.add_chunks_batch(batch)\n \"\"\"\n return self.BulkInsertContext(self)\n\n def delete_file_chunks(self, file_path: str) -> int:\n \"\"\"Delete all chunks for a file.\n\n Returns:\n Number of deleted chunks.\n \"\"\"\n # Get chunk IDs before deletion (for ANN index)\n chunk_ids_to_delete = []\n if self._ann_index is not None:\n with sqlite3.connect(self.db_path) as conn:\n rows = conn.execute(\n \"SELECT id FROM semantic_chunks WHERE file_path = ?\",\n (file_path,)\n ).fetchall()\n chunk_ids_to_delete = [r[0] for r in rows]\n\n # Delete from SQLite\n with sqlite3.connect(self.db_path) as conn:\n cursor = conn.execute(\n \"DELETE FROM semantic_chunks WHERE file_path = ?\",\n (file_path,)\n )\n conn.commit()\n deleted = cursor.rowcount\n\n # Remove from ANN index\n if deleted > 0 and self._ann_index is not None and chunk_ids_to_delete:\n with self._ann_write_lock:\n try:\n self._ann_index.remove_vectors(chunk_ids_to_delete)\n self._ann_index.save()\n except Exception as e:\n logger.warning(\"Failed to remove from ANN index: %s\", e)\n\n if deleted > 0:\n self._invalidate_cache()\n return deleted\n\n def search_similar(\n self,\n query_embedding: List[float],\n top_k: int = 10,\n min_score: float = 0.0,\n return_full_content: bool = True,\n category: Optional[str] = None,\n ) -> List[SearchResult]:\n \"\"\"Find chunks most similar to query embedding.\n\n Uses HNSW index for O(log N) search when available, falls back to\n brute-force NumPy search otherwise.\n\n Args:\n query_embedding: Query vector.\n top_k: Maximum results to return.\n min_score: Minimum cosine similarity score in [0.0, 1.0].\n return_full_content: If True, return full code block content.\n category: Optional category filter ('code' or 'doc'). If None, returns all.\n\n Returns:\n List of SearchResult ordered by similarity (highest first).\n \"\"\"\n query_vec = np.array(query_embedding, dtype=np.float32)\n\n if not 0.0 <= min_score <= 1.0:\n raise ValueError(\n f\"Invalid min_score: {min_score}. Must be within [0.0, 1.0] for cosine similarity.\"\n )\n\n # Try HNSW search first (O(log N))\n if (\n HNSWLIB_AVAILABLE\n and self._ann_index is not None\n and self._ann_index.is_loaded\n and self._ann_index.count() > 0\n ):\n try:\n return self._search_with_ann(\n query_vec, top_k, min_score, return_full_content, category\n )\n except Exception as e:\n logger.warning(\"ANN search failed, falling back to brute-force: %s\", e)\n\n # Fallback to brute-force search (O(N))\n return self._search_brute_force(\n query_vec, top_k, min_score, return_full_content, category\n )\n\n def _search_with_ann(\n self,\n query_vec: np.ndarray,\n top_k: int,\n min_score: float,\n return_full_content: bool,\n category: Optional[str] = None,\n ) -> List[SearchResult]:\n \"\"\"Search using HNSW index (O(log N)).\n\n Args:\n query_vec: Query vector as numpy array\n top_k: Maximum results to return\n min_score: Minimum cosine similarity score in [0.0, 1.0]\n return_full_content: If True, return full code block content\n category: Optional category filter ('code' or 'doc')\n\n Returns:\n List of SearchResult ordered by similarity (highest first)\n \"\"\"\n # Limit top_k to available vectors to prevent hnswlib error\n ann_count = self._ann_index.count()\n # When category filtering, fetch more candidates to compensate for filtering\n fetch_k = top_k * 3 if category else top_k\n effective_top_k = min(fetch_k, ann_count) if ann_count > 0 else 0\n\n if effective_top_k == 0:\n return []\n\n # HNSW search returns (ids, distances)\n # For cosine space: distance = 1 - similarity\n ids, distances = self._ann_index.search(query_vec, effective_top_k)\n\n if ids is None or distances is None:\n logger.debug(\n \"ANN search returned null results (ids=%s, distances=%s)\",\n ids,\n distances,\n )\n return []\n\n if len(ids) == 0 or len(distances) == 0:\n logger.debug(\n \"ANN search returned empty results (ids=%s, distances=%s)\",\n ids,\n distances,\n )\n return []\n\n if len(ids) != len(distances):\n logger.warning(\n \"ANN search returned mismatched result lengths (%d ids, %d distances)\",\n len(ids),\n len(distances),\n )\n return []\n\n # Convert distances to similarity scores\n scores = [1.0 - d for d in distances]\n\n # Filter by min_score\n filtered = [\n (chunk_id, score)\n for chunk_id, score in zip(ids, scores)\n if score >= min_score\n ]\n\n if not filtered:\n return []\n\n top_ids = [f[0] for f in filtered]\n top_scores = [f[1] for f in filtered]\n\n # Fetch content from SQLite with category filtering\n results = self._fetch_results_by_ids(\n top_ids, top_scores, return_full_content, category\n )\n # Apply final limit after category filtering\n return results[:top_k]\n\n def _search_brute_force(\n self,\n query_vec: np.ndarray,\n top_k: int,\n min_score: float,\n return_full_content: bool,\n category: Optional[str] = None,\n ) -> List[SearchResult]:\n \"\"\"Brute-force search using NumPy (O(N) fallback).\n\n Args:\n query_vec: Query vector as numpy array\n top_k: Maximum results to return\n min_score: Minimum cosine similarity score in [0.0, 1.0]\n return_full_content: If True, return full code block content\n category: Optional category filter ('code' or 'doc')\n\n Returns:\n List of SearchResult ordered by similarity (highest first)\n \"\"\"\n logger.warning(\n \"Using brute-force vector search (hnswlib not available). \"\n \"This may cause high memory usage for large indexes. \"\n \"Install hnswlib for better performance: pip install hnswlib\"\n )\n\n with self._cache_lock:\n # Refresh cache if needed\n if self._embedding_matrix is None:\n if not self._refresh_cache():\n return [] # No data\n\n # Vectorized cosine similarity\n query_vec = query_vec.reshape(1, -1)\n query_norm = np.linalg.norm(query_vec)\n if query_norm == 0:\n return []\n\n # Compute all similarities at once: (N,) scores\n # similarity = (A @ B.T) / (||A|| * ||B||)\n dot_products = np.dot(self._embedding_matrix, query_vec.T).flatten()\n scores = dot_products / (self._embedding_norms.flatten() * query_norm)\n\n # Filter by min_score and get top-k indices\n valid_mask = scores >= min_score\n valid_indices = np.where(valid_mask)[0]\n\n if len(valid_indices) == 0:\n return []\n\n # When category filtering, fetch more candidates to compensate for filtering\n fetch_k = top_k * 3 if category else top_k\n\n # Sort by score descending and take top candidates\n valid_scores = scores[valid_indices]\n sorted_order = np.argsort(valid_scores)[::-1][:fetch_k]\n top_indices = valid_indices[sorted_order]\n top_scores = valid_scores[sorted_order]\n\n # Get chunk IDs for top results\n top_ids = [self._chunk_ids[i] for i in top_indices]\n\n # Fetch content only for top-k results (lazy loading) with category filtering\n results = self._fetch_results_by_ids(\n top_ids, top_scores.tolist(), return_full_content, category\n )\n # Apply final limit after category filtering\n return results[:top_k]\n\n def _fetch_results_by_ids(\n self,\n chunk_ids: List[int],\n scores: List[float],\n return_full_content: bool,\n category: Optional[str] = None,\n ) -> List[SearchResult]:\n \"\"\"Fetch full result data for specific chunk IDs.\n\n Args:\n chunk_ids: List of chunk IDs to fetch.\n scores: Corresponding similarity scores.\n return_full_content: Whether to include full content.\n category: Optional category filter ('code' or 'doc').\n\n Returns:\n List of SearchResult objects.\n \"\"\"\n if not chunk_ids:\n return []\n\n # Build parameterized query for IN clause\n placeholders = \",\".join(\"?\" * len(chunk_ids))\n _validate_sql_placeholders(placeholders, len(chunk_ids))\n\n # SQL injection prevention:\n # - Only a validated placeholders string (commas + '?') is interpolated into the query.\n # - User-provided values are passed separately via sqlite3 parameters.\n # - Category filter is added as a separate parameter\n if category:\n query = \"\"\"\n SELECT id, file_path, content, metadata\n FROM semantic_chunks\n WHERE id IN ({placeholders}) AND category = ?\n \"\"\".format(placeholders=placeholders)\n params = list(chunk_ids) + [category]\n else:\n query = \"\"\"\n SELECT id, file_path, content, metadata\n FROM semantic_chunks\n WHERE id IN ({placeholders})\n \"\"\".format(placeholders=placeholders)\n params = chunk_ids\n\n with sqlite3.connect(self.db_path) as conn:\n conn.execute(\"PRAGMA mmap_size = 30000000000\")\n rows = conn.execute(query, params).fetchall()\n\n # Build ID -> row mapping\n id_to_row = {r[0]: r for r in rows}\n\n results = []\n for chunk_id, score in zip(chunk_ids, scores):\n row = id_to_row.get(chunk_id)\n if not row:\n continue\n\n _, file_path, content, metadata_json = row\n metadata = json.loads(metadata_json) if metadata_json else {}\n\n # Build excerpt (short preview)\n excerpt = content[:200] + \"...\" if len(content) > 200 else content\n\n # Extract symbol information from metadata\n symbol_name = metadata.get(\"symbol_name\")\n symbol_kind = metadata.get(\"symbol_kind\")\n start_line = metadata.get(\"start_line\")\n end_line = metadata.get(\"end_line\")\n\n # Build Symbol object if we have symbol info\n symbol = None\n if symbol_name and symbol_kind and start_line and end_line:\n try:\n from codexlens.entities import Symbol\n symbol = Symbol(\n name=symbol_name,\n kind=symbol_kind,\n range=(start_line, end_line)\n )\n except Exception:\n pass\n\n results.append(SearchResult(\n path=file_path,\n score=score,\n excerpt=excerpt,\n content=content if return_full_content else None,\n symbol=symbol,\n metadata=metadata,\n start_line=start_line,\n end_line=end_line,\n symbol_name=symbol_name,\n symbol_kind=symbol_kind,\n ))\n\n return results\n\n def count_chunks(self) -> int:\n \"\"\"Count total chunks in store.\"\"\"\n with sqlite3.connect(self.db_path) as conn:\n row = conn.execute(\"SELECT COUNT(*) FROM semantic_chunks\").fetchone()\n return row[0] if row else 0\n\n def get_all_chunks(self) -> List[SemanticChunk]:\n \"\"\"Get all chunks from the store.\n\n Returns:\n List of SemanticChunk objects with id and content.\n \"\"\"\n with sqlite3.connect(self.db_path) as conn:\n conn.row_factory = sqlite3.Row\n rows = conn.execute(\n \"SELECT id, file_path, content, metadata FROM semantic_chunks\"\n ).fetchall()\n\n chunks = []\n for row in rows:\n chunks.append(SemanticChunk(\n id=row[\"id\"],\n content=row[\"content\"],\n file_path=row[\"file_path\"],\n metadata=json.loads(row[\"metadata\"]) if row[\"metadata\"] else None,\n ))\n return chunks\n\n def clear_cache(self) -> None:\n \"\"\"Manually clear the embedding cache.\"\"\"\n self._invalidate_cache()\n\n @property\n def ann_available(self) -> bool:\n \"\"\"Check if ANN index is available and ready.\"\"\"\n return (\n HNSWLIB_AVAILABLE\n and self._ann_index is not None\n and self._ann_index.is_loaded\n )\n\n @property\n def ann_count(self) -> int:\n \"\"\"Get number of vectors in ANN index.\"\"\"\n if self._ann_index is not None:\n return self._ann_index.count()\n return 0\n\n def get_model_config(self) -> Optional[Dict[str, Any]]:\n \"\"\"Get the model configuration used for embeddings in this store.\n\n Returns:\n Dictionary with model_profile, model_name, embedding_dim, backend, or None if not set.\n \"\"\"\n with sqlite3.connect(self.db_path) as conn:\n row = conn.execute(\n \"SELECT model_profile, model_name, embedding_dim, backend, created_at, updated_at \"\n \"FROM embeddings_config WHERE id = 1\"\n ).fetchone()\n if row:\n return {\n \"model_profile\": row[0],\n \"model_name\": row[1],\n \"embedding_dim\": row[2],\n \"backend\": row[3],\n \"created_at\": row[4],\n \"updated_at\": row[5],\n }\n return None\n\n def set_model_config(\n self, model_profile: str, model_name: str, embedding_dim: int, backend: str = 'fastembed'\n ) -> None:\n \"\"\"Set the model configuration for embeddings in this store.\n\n This should be called when generating new embeddings. If a different\n model was previously used, this will update the configuration.\n\n Args:\n model_profile: Model profile name (fast, code, minilm, etc.)\n model_name: Full model name (e.g., jinaai/jina-embeddings-v2-base-code)\n embedding_dim: Embedding dimension (e.g., 768)\n backend: Backend used for embeddings (fastembed or litellm, default: fastembed)\n \"\"\"\n with sqlite3.connect(self.db_path) as conn:\n conn.execute(\n \"\"\"\n INSERT INTO embeddings_config (id, model_profile, model_name, embedding_dim, backend)\n VALUES (1, ?, ?, ?, ?)\n ON CONFLICT(id) DO UPDATE SET\n model_profile = excluded.model_profile,\n model_name = excluded.model_name,\n embedding_dim = excluded.embedding_dim,\n backend = excluded.backend,\n updated_at = CURRENT_TIMESTAMP\n \"\"\",\n (model_profile, model_name, embedding_dim, backend)\n )\n conn.commit()\n\n def check_model_compatibility(\n self, model_profile: str, model_name: str, embedding_dim: int\n ) -> Tuple[bool, Optional[str]]:\n \"\"\"Check if the given model is compatible with existing embeddings.\n\n Args:\n model_profile: Model profile to check\n model_name: Model name to check\n embedding_dim: Embedding dimension to check\n\n Returns:\n Tuple of (is_compatible, warning_message).\n is_compatible is True if no existing config or configs match.\n warning_message is a user-friendly message if incompatible.\n \"\"\"\n existing = self.get_model_config()\n if existing is None:\n return True, None\n\n # Check dimension first (most critical)\n if existing[\"embedding_dim\"] != embedding_dim:\n return False, (\n f\"Dimension mismatch: existing embeddings use {existing['embedding_dim']}d \"\n f\"({existing['model_profile']}), but requested model uses {embedding_dim}d \"\n f\"({model_profile}). Use --force to regenerate all embeddings.\"\n )\n\n # Check model (different models with same dimension may have different semantic spaces)\n if existing[\"model_profile\"] != model_profile:\n return False, (\n f\"Model mismatch: existing embeddings use '{existing['model_profile']}' \"\n f\"({existing['model_name']}), but requested '{model_profile}' \"\n f\"({model_name}). Use --force to regenerate all embeddings.\"\n )\n\n return True, None\n\n def close(self) -> None:\n \"\"\"Close the vector store and release resources.\n\n This ensures SQLite connections are closed and ANN index is cleared,\n allowing temporary files to be deleted on Windows.\n \"\"\"\n with self._cache_lock:\n self._embedding_matrix = None\n self._embedding_norms = None\n self._chunk_ids = None\n\n with self._ann_write_lock:\n self._ann_index = None\n\n def __enter__(self) -> \"VectorStore\":\n \"\"\"Context manager entry.\"\"\"\n return self\n\n def __exit__(self, exc_type, exc_val, exc_tb) -> None:\n \"\"\"Context manager exit - close resources.\"\"\"\n self.close()",
|
|
"source": null,
|
|
"symbol": null
|
|
},
|
|
{
|
|
"path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\base.py",
|
|
"score": 0.02356190140431283,
|
|
"excerpt": "\"\"\"Base classes for clustering strategies in the hybrid search pipeline.\n\nThis module defines the abstract base class for clustering strategies used",
|
|
"content": "\"\"\"Base classes for clustering strategies in the hybrid search pipeline.\n\nThis module defines the abstract base class for clustering strategies used\nin the staged hybrid search pipeline. Strategies cluster search results\nbased on their embeddings and select representative results from each cluster.\n\"\"\"\n\nfrom __future__ import annotations\n\nfrom abc import ABC, abstractmethod\nfrom dataclasses import dataclass, field",
|
|
"source": null,
|
|
"symbol": null
|
|
},
|
|
{
|
|
"path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py",
|
|
"score": 0.022717150737751757,
|
|
"excerpt": "\nimport re\nfrom dataclasses import dataclass\nfrom pathlib import Path\nfrom typing import Dict, List, Optional, Protocol",
|
|
"content": "\"\"\"Parser factory for CodexLens.\n\nPython and JavaScript/TypeScript parsing use Tree-Sitter grammars when\navailable. Regex fallbacks are retained to preserve the existing parser\ninterface and behavior in minimal environments.\n\"\"\"\n\nfrom __future__ import annotations\n\nimport re\nfrom dataclasses import dataclass\nfrom pathlib import Path\nfrom typing import Dict, List, Optional, Protocol\n\nfrom codexlens.config import Config\nfrom codexlens.entities import CodeRelationship, IndexedFile, RelationshipType, Symbol\nfrom codexlens.parsers.treesitter_parser import TreeSitterSymbolParser\n\n\nclass Parser(Protocol):\n def parse(self, text: str, path: Path) -> IndexedFile: ...",
|
|
"source": null,
|
|
"symbol": null
|
|
},
|
|
{
|
|
"path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\lsp\\server.py",
|
|
"score": 0.022282698690396483,
|
|
"excerpt": "\"\"\"codex-lens LSP Server implementation using pygls.\n\nThis module provides the main Language Server class and entry point.\n\"\"\"\n",
|
|
"content": "\"\"\"codex-lens LSP Server implementation using pygls.\n\nThis module provides the main Language Server class and entry point.\n\"\"\"\n\nfrom __future__ import annotations\n\nimport argparse\nimport logging\nimport sys\nfrom pathlib import Path\nfrom typing import Optional\n",
|
|
"source": null,
|
|
"symbol": null
|
|
},
|
|
{
|
|
"path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\search\\hybrid_search.py",
|
|
"score": 0.022258499170812605,
|
|
"excerpt": " logger.log(level, \"[TIMING] %s: %.2fms\", name, elapsed_ms)\n\nfrom codexlens.config import Config\nfrom codexlens.config import VECTORS_HNSW_NAME\nfrom codexlens.entities import SearchResult",
|
|
"content": " logger: Logger instance to use\n level: Logging level (default DEBUG)\n \"\"\"\n start = time.perf_counter()\n try:\n yield\n finally:\n elapsed_ms = (time.perf_counter() - start) * 1000\n logger.log(level, \"[TIMING] %s: %.2fms\", name, elapsed_ms)\n\nfrom codexlens.config import Config\nfrom codexlens.config import VECTORS_HNSW_NAME\nfrom codexlens.entities import SearchResult\nfrom codexlens.search.ranking import (\n DEFAULT_WEIGHTS,\n FTS_FALLBACK_WEIGHTS,\n QueryIntent,\n apply_symbol_boost,\n cross_encoder_rerank,\n detect_query_intent,\n filter_results_by_category,",
|
|
"source": null,
|
|
"symbol": null
|
|
},
|
|
{
|
|
"path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py",
|
|
"score": 0.022204010428648113,
|
|
"excerpt": "import threading\nimport time\nfrom dataclasses import dataclass\nfrom pathlib import Path\nfrom typing import Any, Dict, List, Optional, Tuple",
|
|
"content": "\"\"\"\n\nfrom __future__ import annotations\n\nimport logging\nimport hashlib\nimport re\nimport sqlite3\nimport threading\nimport time\nfrom dataclasses import dataclass\nfrom pathlib import Path\nfrom typing import Any, Dict, List, Optional, Tuple\n\nfrom codexlens.config import Config\nfrom codexlens.entities import CodeRelationship, SearchResult, Symbol\nfrom codexlens.errors import StorageError\nfrom codexlens.storage.global_index import GlobalSymbolIndex\n\n\n@dataclass",
|
|
"source": null,
|
|
"symbol": null
|
|
},
|
|
{
|
|
"path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\watcher\\manager.py",
|
|
"score": 0.022191896701700627,
|
|
"excerpt": "from typing import Callable, List, Optional\n\nfrom codexlens.config import Config\nfrom codexlens.storage.path_mapper import PathMapper\nfrom codexlens.storage.registry import RegistryStore",
|
|
"content": "from __future__ import annotations\n\nimport json\nimport logging\nimport signal\nimport threading\nimport time\nfrom pathlib import Path\nfrom typing import Callable, List, Optional\n\nfrom codexlens.config import Config\nfrom codexlens.storage.path_mapper import PathMapper\nfrom codexlens.storage.registry import RegistryStore\n\nfrom .events import FileEvent, IndexResult, PendingQueueStatus, WatcherConfig, WatcherStats\nfrom .file_watcher import FileWatcher\nfrom .incremental_indexer import IncrementalIndexer\n\nlogger = logging.getLogger(__name__)\n\n",
|
|
"source": null,
|
|
"symbol": null
|
|
},
|
|
{
|
|
"path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\api\\semantic.py",
|
|
"score": 0.021943278996721462,
|
|
"excerpt": "\nThis module provides the semantic_search() function for combining\nvector, structural, and keyword search with configurable fusion strategies.\n\"\"\"\n",
|
|
"content": "\"\"\"Semantic search API with RRF fusion.\n\nThis module provides the semantic_search() function for combining\nvector, structural, and keyword search with configurable fusion strategies.\n\"\"\"\n\nfrom __future__ import annotations\n\nimport logging\nfrom pathlib import Path\nfrom typing import List, Optional\n\nfrom .models import SemanticResult\nfrom .utils import resolve_project",
|
|
"source": null,
|
|
"symbol": null
|
|
},
|
|
{
|
|
"path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\watcher\\file_watcher.py",
|
|
"score": 0.021943278996721462,
|
|
"excerpt": "from watchdog.events import FileSystemEventHandler\n\nfrom .events import ChangeType, FileEvent, WatcherConfig, PendingQueueStatus\nfrom ..config import Config\n",
|
|
"content": "\nimport logging\nimport threading\nimport time\nfrom pathlib import Path\nfrom typing import Callable, Dict, List, Optional\n\nfrom watchdog.observers import Observer\nfrom watchdog.events import FileSystemEventHandler\n\nfrom .events import ChangeType, FileEvent, WatcherConfig, PendingQueueStatus\nfrom ..config import Config\n\nlogger = logging.getLogger(__name__)\n\n# Maximum queue size to prevent unbounded memory growth\n# When exceeded, forces immediate flush to avoid memory exhaustion\nMAX_QUEUE_SIZE = 50000\n\n\nclass _CodexLensHandler(FileSystemEventHandler):",
|
|
"source": null,
|
|
"symbol": null
|
|
},
|
|
{
|
|
"path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\reranker\\api_reranker.py",
|
|
"score": 0.02150910700179165,
|
|
"excerpt": "\n \n env_api_base = _get_env_with_fallback(\"RERANKER_API_BASE\", self._workspace_root)\n self.api_base = (api_base or env_api_base or defaults[\"api_base\"]).strip().rstrip(\"/\")\n ...",
|
|
"content": "\n \n env_api_base = _get_env_with_fallback(\"RERANKER_API_BASE\", self._workspace_root)\n self.api_base = (api_base or env_api_base or defaults[\"api_base\"]).strip().rstrip(\"/\")\n self.endpoint = defaults[\"endpoint\"]\n\n \n env_model = _get_env_with_fallback(\"RERANKER_MODEL\", self._workspace_root)\n self.model_name = (model_name or env_model or defaults[\"default_model\"]).strip()\n if not self.model_name:\n raise ValueError(\"model_name cannot be blank\")\n\n \n resolved_key = api_key or _get_env_with_fallback(env_api_key, self._workspace_root) or \"\"\n resolved_key = resolved_key.strip()\n if not resolved_key:\n raise ValueError(\n f\"Missing API key for reranker provider '{self.provider}'. \"\n f\"Pass api_key=... or set ${env_api_key}.\"\n )\n self._api_key = resolved_key\n\n self.timeout_s = float(timeout) if timeout and float(timeout) > 0 else 30.0\n self.max_retries = int(max_retries) if max_retries and int(max_retries) >= 0 else 3\n self.backoff_base_s = float(backoff_base_s) if backoff_base_s and float(backoff_base_s) > 0 else 0.5\n self.backoff_max_s = float(backoff_max_s) if backoff_max_s and float(backoff_max_s) > 0 else 8.0\n\n headers = {\n \"Authorization\": f\"Bearer {self._api_key}\",\n \"Content-Type\": \"application/json\",\n }\n if self.provider == \"cohere\":\n headers.setdefault(\"Cohere-Version\", \"2022-12-06\")\n\n self._client = httpx.Client(\n base_url=self.api_base,\n headers=headers,\n timeout=self.timeout_s,\n )\n\n \n if max_input_tokens is not None:\n self._max_input_tokens = max_input_tokens\n else:\n \n model_lower = self.model_name.lower()\n if '8b' in model_lower or 'large' in model_lower:\n self._max_input_tokens = 32768\n else:\n self._max_input_tokens = 8192\n\n @property\n def max_input_tokens(self) -> int:\n",
|
|
"source": null,
|
|
"symbol": null
|
|
},
|
|
{
|
|
"path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\cli\\embedding_manager.py",
|
|
"score": 0.02051605801605802,
|
|
"excerpt": " locked_config = get_locked_model_config()\n\n if locked_config is None:\n return {\n \"is_locked\": False,\n \"has_conflict\": False,\n \"locked_config\": None,\n ...",
|
|
"content": " locked_config = get_locked_model_config()\n\n if locked_config is None:\n return {\n \"is_locked\": False,\n \"has_conflict\": False,\n \"locked_config\": None,\n \"target_config\": {\"backend\": target_backend, \"model\": target_model},\n }\n\n has_conflict = (\n locked_config[\"backend\"] != target_backend or\n locked_config[\"model\"] != target_model\n )\n\n return {\n \"is_locked\": True,\n \"has_conflict\": has_conflict,\n \"locked_config\": locked_config,\n \"target_config\": {\"backend\": target_backend, \"model\": target_model},\n }",
|
|
"source": null,
|
|
"symbol": null
|
|
},
|
|
{
|
|
"path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\mcp\\provider.py",
|
|
"score": 0.020229904287875303,
|
|
"excerpt": "\nfrom __future__ import annotations\n\nimport logging\nfrom pathlib import Path\nfrom typing import Optional, List, TYPE_CHECKING\n\nfrom codexlens.mcp.schema import (\n MCPContext,\n SymbolInfo,\n Re...",
|
|
"content": "\nfrom __future__ import annotations\n\nimport logging\nfrom pathlib import Path\nfrom typing import Optional, List, TYPE_CHECKING\n\nfrom codexlens.mcp.schema import (\n MCPContext,\n SymbolInfo,\n ReferenceInfo,\n RelatedSymbol,\n)\n\nif TYPE_CHECKING:\n from codexlens.storage.global_index import GlobalSymbolIndex\n from codexlens.storage.registry import RegistryStore\n from codexlens.search.chain_search import ChainSearchEngine\n\nlogger = logging.getLogger(__name__)\n\n\nclass MCPProvider:\n\n def __init__(\n self,\n global_index: \"GlobalSymbolIndex\",\n search_engine: \"ChainSearchEngine\",\n registry: \"RegistryStore\",\n ) -> None:\n self.global_index = global_index\n self.search_engine = search_engine\n self.registry = registry\n\n def build_context(\n self,\n symbol_name: str,\n context_type: str = \"symbol_explanation\",\n include_references: bool = True,\n include_related: bool = True,\n max_references: int = 10,\n ) -> Optional[MCPContext]:\n \n symbols = self.global_index.search(symbol_name, prefix_mode=False, limit=1)\n\n if not symbols:\n",
|
|
"source": null,
|
|
"symbol": null
|
|
},
|
|
{
|
|
"path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\mcp\\hooks.py",
|
|
"score": 0.020007053720837744,
|
|
"excerpt": "\nfrom __future__ import annotations\n\nimport logging\nfrom pathlib import Path\nfrom typing import Any, Dict, Optional, Callable, TYPE_CHECKING\n\nfrom codexlens.mcp.schema import MCPContext\n\nif TYPE_CHECK...",
|
|
"content": "\nfrom __future__ import annotations\n\nimport logging\nfrom pathlib import Path\nfrom typing import Any, Dict, Optional, Callable, TYPE_CHECKING\n\nfrom codexlens.mcp.schema import MCPContext\n\nif TYPE_CHECKING:\n from codexlens.mcp.provider import MCPProvider\n\nlogger = logging.getLogger(__name__)\n\n\nclass HookManager:\n\n def __init__(self, mcp_provider: \"MCPProvider\") -> None:\n self.mcp_provider = mcp_provider\n self._pre_hooks: Dict[str, Callable] = {}\n self._post_hooks: Dict[str, Callable] = {}\n\n \n self._register_default_hooks()\n\n def _register_default_hooks(self) -> None:\n self._pre_hooks[\"explain\"] = self._pre_explain_hook\n self._pre_hooks[\"refactor\"] = self._pre_refactor_hook\n self._pre_hooks[\"document\"] = self._pre_document_hook\n\n def execute_pre_hook(\n self,\n action: str,\n params: Dict[str, Any],\n ) -> Optional[MCPContext]:\n hook = self._pre_hooks.get(action)\n\n if not hook:\n logger.debug(f\"No pre-hook for action: {action}\")\n return None\n\n try:\n return hook(params)\n except Exception as e:\n logger.error(f\"Pre-hook failed for {action}: {e}\")\n return None\n\n def execute_post_hook(\n self,\n action: str,\n result: Any,\n ) -> None:\n",
|
|
"source": null,
|
|
"symbol": null
|
|
},
|
|
{
|
|
"path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\search\\association_tree\\deduplicator.py",
|
|
"score": 0.019921615989390927,
|
|
"excerpt": "\nfrom __future__ import annotations\n\nimport logging\nfrom typing import Dict, List, Optional\n\nfrom .data_structures import (\n CallTree,\n TreeNode,\n UniqueNode,\n)\n\nlogger = logging.getLogger(__...",
|
|
"content": "\nfrom __future__ import annotations\n\nimport logging\nfrom typing import Dict, List, Optional\n\nfrom .data_structures import (\n CallTree,\n TreeNode,\n UniqueNode,\n)\n\nlogger = logging.getLogger(__name__)\n\n\n\nKIND_WEIGHTS: Dict[str, float] = {\n \n \"function\": 1.0,\n \"method\": 1.0,\n \"12\": 1.0, \n \"6\": 1.0, \n \n \"class\": 0.8,\n \"5\": 0.8, \n \n \"interface\": 0.7,\n \"11\": 0.7, \n \"type\": 0.6,\n \n \"constructor\": 0.9,\n \"9\": 0.9, \n \n \"variable\": 0.4,\n \"13\": 0.4, \n \"constant\": 0.5,\n \"14\": 0.5, \n \n \"unknown\": 0.3,\n}\n\n\nclass ResultDeduplicator:\n\n",
|
|
"source": null,
|
|
"symbol": null
|
|
},
|
|
{
|
|
"path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\reranker\\factory.py",
|
|
"score": 0.01962803701934137,
|
|
"excerpt": " if not ok:\n raise ImportError(err)\n\n from .api_reranker import APIReranker\n\n _ = device \n resolved_model_name = (model_name or \"\").strip() or None\n retu...",
|
|
"content": " if not ok:\n raise ImportError(err)\n\n from .api_reranker import APIReranker\n\n _ = device \n resolved_model_name = (model_name or \"\").strip() or None\n return APIReranker(model_name=resolved_model_name, **kwargs)\n\n raise ValueError(\n f\"Unknown backend: {backend}. Supported backends: 'fastembed', 'onnx', 'api', 'litellm', 'legacy'\"\n )\n",
|
|
"source": null,
|
|
"symbol": null
|
|
},
|
|
{
|
|
"path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\storage\\index_tree.py",
|
|
"score": 0.015740967294674172,
|
|
"excerpt": "import time\nfrom concurrent.futures import ProcessPoolExecutor, as_completed\nfrom dataclasses import dataclass\nfrom pathlib import Path\nfrom typing import Dict, List, Optional, Set, Tuple",
|
|
"content": "\"\"\"\n\nfrom __future__ import annotations\n\nimport logging\nimport os\nimport re\nimport sqlite3\nimport time\nfrom concurrent.futures import ProcessPoolExecutor, as_completed\nfrom dataclasses import dataclass\nfrom pathlib import Path\nfrom typing import Dict, List, Optional, Set, Tuple\n\nfrom codexlens.config import Config\nfrom codexlens.parsers.factory import ParserFactory\nfrom codexlens.storage.dir_index import DirIndexStore\nfrom codexlens.storage.global_index import GlobalSymbolIndex\nfrom codexlens.storage.path_mapper import PathMapper\nfrom codexlens.storage.registry import ProjectInfo, RegistryStore\n",
|
|
"source": null,
|
|
"symbol": null
|
|
},
|
|
{
|
|
"path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\chunker.py",
|
|
"score": 0.01569458021070924,
|
|
"excerpt": "\nLightweight Mode:\n The ChunkConfig supports a `skip_token_count` option for performance optimization.\n When enabled, token counting uses a fast character-based estimation (char/4)\n instead o...",
|
|
"content": "\"\"\"Code chunking strategies for semantic search.\n\nThis module provides various chunking strategies for breaking down source code\ninto semantic chunks suitable for embedding and search.\n\nLightweight Mode:\n The ChunkConfig supports a `skip_token_count` option for performance optimization.\n When enabled, token counting uses a fast character-based estimation (char/4)\n instead of expensive tiktoken encoding.\n\n Use cases for lightweight mode:\n - Large-scale indexing where speed is critical\n - Scenarios where approximate token counts are acceptable\n - Memory-constrained environments\n - Initial prototyping and development\n\n Example:",
|
|
"source": null,
|
|
"symbol": null
|
|
},
|
|
{
|
|
"path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\noop_strategy.py",
|
|
"score": 0.015496521189120809,
|
|
"excerpt": "from typing import TYPE_CHECKING, List, Optional\n\nfrom .base import BaseClusteringStrategy, ClusteringConfig\n\nif TYPE_CHECKING:",
|
|
"content": "\"\"\"No-op clustering strategy for search results.\n\nNoOpStrategy returns all results ungrouped when clustering dependencies\nare not available or clustering is disabled.\n\"\"\"\n\nfrom __future__ import annotations\n\nfrom typing import TYPE_CHECKING, List, Optional\n\nfrom .base import BaseClusteringStrategy, ClusteringConfig\n\nif TYPE_CHECKING:\n import numpy as np\n from codexlens.entities import SearchResult\n\n\nclass NoOpStrategy(BaseClusteringStrategy):\n \"\"\"No-op clustering strategy that returns all results ungrouped.\n\n This strategy is used as a final fallback when no clustering dependencies",
|
|
"source": null,
|
|
"symbol": null
|
|
},
|
|
{
|
|
"path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\dbscan_strategy.py",
|
|
"score": 0.014896214896214899,
|
|
"excerpt": "from typing import TYPE_CHECKING, List, Optional\n\nfrom .base import BaseClusteringStrategy, ClusteringConfig\n\nif TYPE_CHECKING:",
|
|
"content": "\"\"\"DBSCAN-based clustering strategy for search results.\n\nDBSCAN (Density-Based Spatial Clustering of Applications with Noise)\nis the fallback clustering strategy when HDBSCAN is not available.\n\"\"\"\n\nfrom __future__ import annotations\n\nfrom typing import TYPE_CHECKING, List, Optional\n\nfrom .base import BaseClusteringStrategy, ClusteringConfig\n\nif TYPE_CHECKING:\n import numpy as np\n from codexlens.entities import SearchResult\n\n\nclass DBSCANStrategy(BaseClusteringStrategy):\n \"\"\"DBSCAN-based clustering strategy.\n\n Uses sklearn's DBSCAN algorithm as a fallback when HDBSCAN is not available.",
|
|
"source": null,
|
|
"symbol": null
|
|
},
|
|
{
|
|
"path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\mcp\\schema.py",
|
|
"score": 0.014112903225806453,
|
|
"excerpt": " definition: Optional[str] = None\n references: List[ReferenceInfo] = field(default_factory=list)\n related_symbols: List[RelatedSymbol] = field(default_factory=list)\n metadata: dict = field...",
|
|
"content": " definition: Optional[str] = None\n references: List[ReferenceInfo] = field(default_factory=list)\n related_symbols: List[RelatedSymbol] = field(default_factory=list)\n metadata: dict = field(default_factory=dict)\n\n def to_dict(self) -> dict:\n result = {\n \"version\": self.version,\n \"context_type\": self.context_type,\n \"metadata\": self.metadata,\n }\n\n if self.symbol:\n result[\"symbol\"] = self.symbol.to_dict()\n if self.definition:\n result[\"definition\"] = self.definition\n if self.references:\n result[\"references\"] = [r.to_dict() for r in self.references]\n if self.related_symbols:\n result[\"related_symbols\"] = [s.to_dict() for s in self.related_symbols]\n\n return result\n\n def to_json(self, indent: int = 2) -> str:\n return json.dumps(self.to_dict(), indent=indent)\n\n def to_prompt_injection(self) -> str:\n parts = [\"<code_context>\"]\n\n if self.symbol:\n parts.append(f\"## Symbol: {self.symbol.name}\")\n parts.append(f\"Type: {self.symbol.kind}\")\n parts.append(f\"Location: {self.symbol.file_path}:{self.symbol.line_start}\")\n\n if self.definition:\n parts.append(\"\\n## Definition\")\n parts.append(f\"```\\n{self.definition}\\n```\")\n\n if self.references:\n parts.append(f\"\\n## References ({len(self.references)} found)\")\n for ref in self.references[:5]: \n parts.append(f\"- {ref.file_path}:{ref.line} ({ref.relationship_type})\")\n parts.append(f\" ```\\n {ref.context}\\n ```\")\n\n if self.related_symbols:\n parts.append(\"\\n## Related Symbols\")\n for sym in self.related_symbols[:10]: \n parts.append(f\"- {sym.name} ({sym.relationship})\")\n\n parts.append(\"</code_context>\")\n return \"\\n\".join(parts)\n",
|
|
"source": null,
|
|
"symbol": null
|
|
},
|
|
{
|
|
"path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\lsp\\lsp_bridge.py",
|
|
"score": 0.013999118165784833,
|
|
"excerpt": "\nclass LspBridge:\n \n DEFAULT_BRIDGE_URL = \"http://127.0.0.1:3457\"\n DEFAULT_TIMEOUT = 30.0 \n DEFAULT_CACHE_TTL = 300 \n DEFAULT_MAX_CACHE_SIZE = 1000 \n\n def __init__(\n self,\n...",
|
|
"content": "\nclass LspBridge:\n \n DEFAULT_BRIDGE_URL = \"http://127.0.0.1:3457\"\n DEFAULT_TIMEOUT = 30.0 \n DEFAULT_CACHE_TTL = 300 \n DEFAULT_MAX_CACHE_SIZE = 1000 \n\n def __init__(\n self,\n bridge_url: str = DEFAULT_BRIDGE_URL,\n timeout: float = DEFAULT_TIMEOUT,\n cache_ttl: int = DEFAULT_CACHE_TTL,\n max_cache_size: int = DEFAULT_MAX_CACHE_SIZE,\n use_vscode_bridge: bool = False,\n workspace_root: Optional[str] = None,\n config_file: Optional[str] = None,\n ):\n self.bridge_url = bridge_url\n self.timeout = timeout\n self.cache_ttl = cache_ttl\n",
|
|
"source": null,
|
|
"symbol": null
|
|
},
|
|
{
|
|
"path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\watcher\\events.py",
|
|
"score": 0.013999118165784833,
|
|
"excerpt": "\nfrom __future__ import annotations\n\nimport time\nfrom dataclasses import dataclass, field\nfrom enum import Enum\nfrom pathlib import Path\nfrom typing import List, Optional, Set\n\n\nclass ChangeType(Enum)...",
|
|
"content": "\nfrom __future__ import annotations\n\nimport time\nfrom dataclasses import dataclass, field\nfrom enum import Enum\nfrom pathlib import Path\nfrom typing import List, Optional, Set\n\n\nclass ChangeType(Enum):\n CREATED = \"created\"\n MODIFIED = \"modified\"\n DELETED = \"deleted\"\n MOVED = \"moved\"\n\n\n@dataclass\nclass FileEvent:\n path: Path\n change_type: ChangeType\n timestamp: float\n old_path: Optional[Path] = None \n\n\n@dataclass\nclass WatcherConfig:\n debounce_ms: int = 60000 \n ignored_patterns: Set[str] = field(default_factory=lambda: {\n \n \".git\", \".svn\", \".hg\",\n \n \".venv\", \"venv\", \"env\", \"__pycache__\", \".pytest_cache\", \".mypy_cache\", \".ruff_cache\",\n \n \"node_modules\", \"bower_components\", \".npm\", \".yarn\",\n \n \"dist\", \"build\", \"out\", \"target\", \"bin\", \"obj\", \"_build\", \"coverage\", \"htmlcov\",\n \n \".idea\", \".vscode\", \".vs\", \".eclipse\",\n \n \".codexlens\",\n \n \".cache\", \".parcel-cache\", \".turbo\", \".next\", \".nuxt\",\n \n \"logs\", \"tmp\", \"temp\",\n })\n languages: Optional[List[str]] = None \n\n\n@dataclass\nclass PendingQueueStatus:\n file_count: int = 0\n files: List[str] = field(default_factory=list) \n countdown_seconds: int = 0\n last_event_time: Optional[float] = None\n\n\n@dataclass\nclass IndexResult:\n files_indexed: int = 0\n files_removed: int = 0\n symbols_added: int = 0\n symbols_removed: int = 0\n",
|
|
"source": null,
|
|
"symbol": null
|
|
},
|
|
{
|
|
"path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\storage\\registry.py",
|
|
"score": 0.013902465515368743,
|
|
"excerpt": "\nfrom __future__ import annotations\n\nimport platform\nimport sqlite3\nimport threading\nimport time\nfrom dataclasses import dataclass\nfrom pathlib import Path\nfrom typing import Any, Dict, List, Optional...",
|
|
"content": "\nfrom __future__ import annotations\n\nimport platform\nimport sqlite3\nimport threading\nimport time\nfrom dataclasses import dataclass\nfrom pathlib import Path\nfrom typing import Any, Dict, List, Optional\n\nfrom codexlens.errors import StorageError\n\n\n@dataclass\nclass ProjectInfo:\n\n id: int\n source_root: Path\n index_root: Path\n created_at: float\n last_indexed: float\n total_files: int\n total_dirs: int\n status: str\n\n\n@dataclass\nclass DirMapping:\n\n id: int\n project_id: int\n source_path: Path\n index_path: Path\n depth: int\n files_count: int\n last_updated: float\n\n\nclass RegistryStore:\n\n DEFAULT_DB_PATH = Path.home() / \".codexlens\" / \"registry.db\"\n\n def __init__(self, db_path: Path | None = None) -> None:\n self.db_path = (db_path or self.DEFAULT_DB_PATH).resolve()\n self._lock = threading.RLock()\n self._local = threading.local()\n self._pool_lock = threading.Lock()\n self._pool: Dict[int, sqlite3.Connection] = {}\n self._pool_generation = 0\n\n",
|
|
"source": null,
|
|
"symbol": null
|
|
},
|
|
{
|
|
"path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\storage\\migrations\\migration_001_normalize_keywords.py",
|
|
"score": 0.013678451178451179,
|
|
"excerpt": " PRIMARY KEY (file_id, keyword_id),\n FOREIGN KEY (file_id) REFERENCES files (id) ON DELETE CASCADE,\n FOREIGN KEY (keyword_id) REFERENCES keywords (id) ON DELETE CASCAD...",
|
|
"content": " PRIMARY KEY (file_id, keyword_id),\n FOREIGN KEY (file_id) REFERENCES files (id) ON DELETE CASCADE,\n FOREIGN KEY (keyword_id) REFERENCES keywords (id) ON DELETE CASCADE\n )\n",
|
|
"source": null,
|
|
"symbol": null
|
|
},
|
|
{
|
|
"path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\api\\references.py",
|
|
"score": 0.013661202185792351,
|
|
"excerpt": " project_root: Project root directory path\n symbol_name: Name of the symbol to find references for\n symbol_kind: Optional symbol kind filter (e.g., 'function', 'class')\n in...",
|
|
"content": "def find_references(\n project_root: str,\n symbol_name: str,\n symbol_kind: Optional[str] = None,\n include_definition: bool = True,\n group_by_definition: bool = True,\n limit: int = 100,\n) -> List[GroupedReferences]:\n \"\"\"Find all reference locations for a symbol.\n\n Multi-definition case returns grouped results to resolve ambiguity.\n\n This function wraps ChainSearchEngine.search_references() and groups\n the results by definition location. Each GroupedReferences contains\n a definition and all references that point to it.\n\n Args:\n project_root: Project root directory path\n symbol_name: Name of the symbol to find references for\n symbol_kind: Optional symbol kind filter (e.g., 'function', 'class')\n include_definition: Whether to include the definition location\n in the result (default True)\n group_by_definition: Whether to group references by definition.\n If False, returns a single group with all references.\n (default True)\n limit: Maximum number of references to return (default 100)\n\n Returns:\n List of GroupedReferences. Each group contains:\n - definition: The DefinitionResult for this symbol definition\n - references: List of ReferenceResult pointing to this definition\n\n Raises:\n ValueError: If project_root does not exist or is not a directory\n\n Examples:\n >>> refs = find_references(\"/path/to/project\", \"authenticate\")\n >>> for group in refs:\n ... print(f\"Definition: {group.definition.file_path}:{group.definition.line}\")\n ... for ref in group.references:\n ... print(f\" Reference: {ref.file_path}:{ref.line} ({ref.relationship})\")\n\n Note:\n Reference relationship types are normalized:\n - 'calls' -> 'call'\n - 'imports' -> 'import'\n - 'inherits' -> 'inheritance'\n \"\"\"\n # Validate and resolve project root\n project_path = resolve_project(project_root)\n\n # Import here to avoid circular imports\n from codexlens.config import Config\n from codexlens.storage.registry import RegistryStore\n from codexlens.storage.path_mapper import PathMapper\n from codexlens.storage.global_index import GlobalSymbolIndex\n from codexlens.search.chain_search import ChainSearchEngine\n from codexlens.search.chain_search import ReferenceResult as RawReferenceResult\n from codexlens.entities import Symbol\n\n # Initialize infrastructure\n config = Config()\n registry = RegistryStore()\n mapper = PathMapper(config.index_dir)\n\n # Create chain search engine\n engine = ChainSearchEngine(registry, mapper, config=config)\n\n try:\n # Step 1: Find definitions for the symbol\n definitions: List[DefinitionResult] = []\n\n if include_definition or group_by_definition:\n # Search for symbol definitions\n symbols = engine.search_symbols(\n name=symbol_name,\n source_path=project_path,\n kind=symbol_kind,\n )\n\n # Convert Symbol to DefinitionResult\n for sym in symbols:\n # Only include exact name matches for definitions\n if sym.name != symbol_name:\n continue\n\n # Optionally filter by kind\n if symbol_kind and sym.kind != symbol_kind:\n continue\n\n definitions.append(DefinitionResult(\n name=sym.name,\n kind=sym.kind,\n file_path=sym.file or \"\",\n line=sym.range[0] if sym.range else 1,\n end_line=sym.range[1] if sym.range else 1,\n signature=None, # Not available from Symbol\n container=None, # Not available from Symbol\n score=1.0,\n ))\n\n # Step 2: Get all references using ChainSearchEngine\n raw_references = engine.search_references(\n symbol_name=symbol_name,\n source_path=project_path,\n depth=-1,\n limit=limit,\n )\n\n # Step 3: Transform raw references to API ReferenceResult\n api_references: List[ReferenceResult] = []\n for raw_ref in raw_references:\n api_ref = _transform_to_reference_result(raw_ref)\n api_references.append(api_ref)\n\n # Step 4: Group references by definition\n if group_by_definition and definitions:\n return _group_references_by_definition(\n definitions=definitions,\n references=api_references,\n include_definition=include_definition,\n )\n else:\n # Return single group with placeholder definition or first definition\n if definitions:\n definition = definitions[0]\n else:\n # Create placeholder definition when no definition found\n definition = DefinitionResult(\n name=symbol_name,\n kind=symbol_kind or \"unknown\",\n file_path=\"\",\n line=0,\n end_line=0,\n signature=None,\n container=None,\n score=0.0,\n )\n\n return [GroupedReferences(\n definition=definition,\n references=api_references,\n )]\n\n finally:\n engine.close()",
|
|
"source": null,
|
|
"symbol": null
|
|
},
|
|
{
|
|
"path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\env_config.py",
|
|
"score": 0.01359062143375869,
|
|
"excerpt": " \n return default\n\n\ndef get_api_config(\n prefix: str,\n *,\n workspace_root: Path | None = None,\n defaults: Dict[str, Any] | None = None,\n) -> Dict[str, Any]:\n defaults = defaults o...",
|
|
"content": " \n return default\n\n\ndef get_api_config(\n prefix: str,\n *,\n workspace_root: Path | None = None,\n defaults: Dict[str, Any] | None = None,\n) -> Dict[str, Any]:\n defaults = defaults or {}\n \n config: Dict[str, Any] = {}\n \n \n field_mapping = {\n \"api_key\": f\"{prefix}_API_KEY\",\n \"api_base\": f\"{prefix}_API_BASE\",\n \"model\": f\"{prefix}_MODEL\",\n \"provider\": f\"{prefix}_PROVIDER\",\n \"timeout\": f\"{prefix}_TIMEOUT\",\n }\n \n for field, env_key in field_mapping.items():\n value = get_env(env_key, workspace_root=workspace_root)\n if value is not None:\n \n if field == \"timeout\":\n try:\n config[field] = float(value)\n except ValueError:\n pass\n else:\n config[field] = value\n elif field in defaults:\n config[field] = defaults[field]\n \n return config\n\n\ndef generate_env_example() -> str:\n lines = [\n \"# CodexLens Environment Configuration\",\n \"# Copy this file to .codexlens/.env and fill in your values\",\n \"\",\n",
|
|
"source": null,
|
|
"symbol": null
|
|
},
|
|
{
|
|
"path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\api\\__init__.py",
|
|
"score": 0.01359062143375869,
|
|
"excerpt": " \"FileContextResult\",\n \"DefinitionResult\",\n \"ReferenceResult\",\n \"GroupedReferences\",\n \"SymbolInfo\",\n \"HoverInfo\",\n \"SemanticResult\",\n \n \"resolve_project\",\n \"normalize_rel...",
|
|
"content": " \"FileContextResult\",\n \"DefinitionResult\",\n \"ReferenceResult\",\n \"GroupedReferences\",\n \"SymbolInfo\",\n \"HoverInfo\",\n \"SemanticResult\",\n \n \"resolve_project\",\n \"normalize_relationship_type\",\n \"rank_by_proximity\",\n \"rank_by_score\",\n \n \"find_definition\",\n \"workspace_symbols\",\n \"get_hover\",\n \"file_context\",\n \"find_references\",\n \"semantic_search\",\n]\n",
|
|
"source": null,
|
|
"symbol": null
|
|
},
|
|
{
|
|
"path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\storage\\migrations\\migration_005_cleanup_unused_fields.py",
|
|
"score": 0.013517665130568358,
|
|
"excerpt": "\nimport logging\nfrom sqlite3 import Connection\n\nlog = logging.getLogger(__name__)\n\n\ndef upgrade(db_conn: Connection):\n cursor = db_conn.cursor()\n\n \n log.info(\"Checking semantic_metadata.keywo...",
|
|
"content": "\nimport logging\nfrom sqlite3 import Connection\n\nlog = logging.getLogger(__name__)\n\n\ndef upgrade(db_conn: Connection):\n cursor = db_conn.cursor()\n\n \n log.info(\"Checking semantic_metadata.keywords column...\")\n\n cursor.execute(\n \"SELECT name FROM sqlite_master WHERE type='table' AND name='semantic_metadata'\"\n",
|
|
"source": null,
|
|
"symbol": null
|
|
},
|
|
{
|
|
"path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\factory.py",
|
|
"score": 0.013495897553868569,
|
|
"excerpt": " >>> \n >>> from codexlens.search.clustering import FrequencyConfig\n >>> freq_config = FrequencyConfig(min_frequency=2, group_by=\"symbol\")\n >>> strategy = get_strategy(\"freq...",
|
|
"content": " >>> \n >>> from codexlens.search.clustering import FrequencyConfig\n >>> freq_config = FrequencyConfig(min_frequency=2, group_by=\"symbol\")\n >>> strategy = get_strategy(\"frequency\", freq_config)\n",
|
|
"source": null,
|
|
"symbol": null
|
|
},
|
|
{
|
|
"path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\reranker\\onnx_reranker.py",
|
|
"score": 0.013480392156862746,
|
|
"excerpt": "\n from optimum.onnxruntime import ORTModelForSequenceClassification\n from transformers import AutoTokenizer\n\n if self.providers is None:\n from ..gpu_sup...",
|
|
"content": "\n from optimum.onnxruntime import ORTModelForSequenceClassification\n from transformers import AutoTokenizer\n\n if self.providers is None:\n from ..gpu_support import get_optimal_providers\n\n \n self.providers = get_optimal_providers(\n use_gpu=self.use_gpu, with_device_options=True\n )\n\n \n \n model_kwargs: dict[str, Any] = {}\n try:\n params = signature(ORTModelForSequenceClassification.from_pretrained).parameters\n if \"providers\" in params:\n model_kwargs[\"providers\"] = self.providers\n elif \"provider\" in params:\n provider_name = \"CPUExecutionProvider\"\n if self.providers:\n first = self.providers[0]\n provider_name = first[0] if isinstance(first, tuple) else str(first)\n model_kwargs[\"provider\"] = provider_name\n except Exception:\n model_kwargs = {}\n\n try:\n self._model = ORTModelForSequenceClassification.from_pretrained(\n self.model_name,\n **model_kwargs,\n )\n except TypeError:\n \n self._model = ORTModelForSequenceClassification.from_pretrained(self.model_name)\n\n self._tokenizer = AutoTokenizer.from_pretrained(self.model_name, use_fast=True)\n\n \n input_names: set[str] | None = None\n for attr in (\"input_names\", \"model_input_names\"):\n names = getattr(self._model, attr, None)\n if isinstance(names, (list, tuple)) and names:\n input_names = {str(n) for n in names}\n break\n if input_names is None:\n try:\n session = getattr(self._model, \"model\", None)\n if session is not None and hasattr(session, \"get_inputs\"):\n input_names = {i.name for i in session.get_inputs()}\n except Exception:\n input_names = None\n self._model_input_names = input_names\n\n",
|
|
"source": null,
|
|
"symbol": null
|
|
},
|
|
{
|
|
"path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py",
|
|
"score": 0.013403880070546739,
|
|
"excerpt": " return True\n\n except Exception as e:\n raise StorageError(f\"Failed to load ANN index: {e}\")\n\n def count(self) -> int:\n with self._lock:\n r...",
|
|
"content": " return True\n\n except Exception as e:\n raise StorageError(f\"Failed to load ANN index: {e}\")\n\n def count(self) -> int:\n with self._lock:\n return self._current_count\n\n @property\n def capacity(self) -> int:\n with self._lock:\n return self._max_elements\n\n @property\n def usage_ratio(self) -> float:\n with self._lock:\n if self._max_elements == 0:\n return 0.0\n return self._current_count / self._max_elements\n\n @property\n def is_loaded(self) -> bool:\n with self._lock:\n return self._index is not None and self._current_count > 0\n\n\n\nclass BinaryANNIndex:\n",
|
|
"source": null,
|
|
"symbol": null
|
|
},
|
|
{
|
|
"path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\litellm_embedder.py",
|
|
"score": 0.01322751322751323,
|
|
"excerpt": "class LiteLLMEmbedderWrapper(BaseEmbedder):\n \"\"\"Wrapper for ccw-litellm LiteLLMEmbedder.\n",
|
|
"content": "class LiteLLMEmbedderWrapper(BaseEmbedder):\n \"\"\"Wrapper for ccw-litellm LiteLLMEmbedder.\n\n This wrapper adapts the ccw-litellm LiteLLMEmbedder to the CodexLens\n BaseEmbedder interface, enabling seamless integration with CodexLens\n semantic search functionality.\n\n Args:\n model: Model identifier for LiteLLM (default: \"default\")\n **kwargs: Additional arguments passed to LiteLLMEmbedder\n\n Raises:\n ImportError: If ccw-litellm package is not installed\n \"\"\"\n\n def __init__(self, model: str = \"default\", **kwargs) -> None:\n \"\"\"Initialize LiteLLM embedder wrapper.\n\n Args:\n model: Model identifier for LiteLLM (default: \"default\")\n **kwargs: Additional arguments passed to LiteLLMEmbedder\n\n Raises:\n ImportError: If ccw-litellm package is not installed\n \"\"\"\n try:\n from ccw_litellm import LiteLLMEmbedder\n self._embedder = LiteLLMEmbedder(model=model, **kwargs)\n except ImportError as e:\n raise ImportError(\n \"ccw-litellm not installed. Install with: pip install ccw-litellm\"\n ) from e\n\n @property\n def embedding_dim(self) -> int:\n \"\"\"Return embedding dimensions from LiteLLMEmbedder.\n\n Returns:\n int: Dimension of the embedding vectors.\n \"\"\"\n return self._embedder.dimensions\n\n @property\n def model_name(self) -> str:\n \"\"\"Return model name from LiteLLMEmbedder.\n\n Returns:\n str: Name or identifier of the underlying model.\n \"\"\"\n return self._embedder.model_name\n\n @property\n def max_tokens(self) -> int:\n \"\"\"Return maximum token limit for the embedding model.\n\n Returns:\n int: Maximum number of tokens that can be embedded at once.\n Reads from LiteLLM config's max_input_tokens property.\n \"\"\"\n # Get from LiteLLM embedder's max_input_tokens property (now exposed)\n if hasattr(self._embedder, 'max_input_tokens'):\n return self._embedder.max_input_tokens\n\n # Fallback: infer from model name\n model_name_lower = self.model_name.lower()\n\n # Large models (8B or \"large\" in name)\n if '8b' in model_name_lower or 'large' in model_name_lower:\n return 32768\n\n # OpenAI text-embedding-3-* models\n if 'text-embedding-3' in model_name_lower:\n return 8191\n\n # Default fallback\n return 8192\n\n def _sanitize_text(self, text: str) -> str:\n \"\"\"Sanitize text to work around ModelScope API routing bug.\n\n ModelScope incorrectly routes text starting with lowercase 'import'\n to an Ollama endpoint, causing failures. This adds a leading space\n to work around the issue without affecting embedding quality.\n\n Args:\n text: Text to sanitize.\n\n Returns:\n Sanitized text safe for embedding API.\n \"\"\"\n if text.startswith('import'):\n return ' ' + text\n return text\n\n def embed_to_numpy(self, texts: str | Iterable[str], **kwargs) -> np.ndarray:\n \"\"\"Embed texts to numpy array using LiteLLMEmbedder.\n\n Args:\n texts: Single text or iterable of texts to embed.\n **kwargs: Additional arguments (ignored for LiteLLM backend).\n Accepts batch_size for API compatibility with fastembed.\n\n Returns:\n numpy.ndarray: Array of shape (n_texts, embedding_dim) containing embeddings.\n \"\"\"\n if isinstance(texts, str):\n texts = [texts]\n else:\n texts = list(texts)\n\n # Sanitize texts to avoid ModelScope routing bug\n texts = [self._sanitize_text(t) for t in texts]\n\n # LiteLLM handles batching internally, ignore batch_size parameter\n return self._embedder.embed(texts)\n\n def embed_single(self, text: str) -> list[float]:\n \"\"\"Generate embedding for a single text.\n\n Args:\n text: Text to embed.\n\n Returns:\n list[float]: Embedding vector as a list of floats.\n \"\"\"\n # Sanitize text before embedding\n sanitized = self._sanitize_text(text)\n embedding = self._embedder.embed([sanitized])\n return embedding[0].tolist()",
|
|
"source": null,
|
|
"symbol": null
|
|
},
|
|
{
|
|
"path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\search\\association_tree\\builder.py",
|
|
"score": 0.013083213083213086,
|
|
"excerpt": "\nfrom __future__ import annotations\n\nimport asyncio\nimport logging\nfrom pathlib import Path\nfrom typing import Dict, List, Optional, Set\n\nfrom codexlens.hybrid_search.data_structures import CallHierar...",
|
|
"content": "\nfrom __future__ import annotations\n\nimport asyncio\nimport logging\nfrom pathlib import Path\nfrom typing import Dict, List, Optional, Set\n\nfrom codexlens.hybrid_search.data_structures import CallHierarchyItem, Range\nfrom codexlens.lsp.standalone_manager import StandaloneLspManager\nfrom .data_structures import CallTree, TreeNode\n\nlogger = logging.getLogger(__name__)\n\n\nclass AssociationTreeBuilder:\n\n def __init__(\n self,\n lsp_manager: StandaloneLspManager,\n timeout: float = 5.0,\n ):\n self.lsp_manager = lsp_manager\n self.timeout = timeout\n self.visited: Set[str] = set()\n\n async def build_tree(\n self,\n seed_file_path: str,\n seed_line: int,\n seed_character: int = 1,\n",
|
|
"source": null,
|
|
"symbol": null
|
|
},
|
|
{
|
|
"path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\indexing\\symbol_extractor.py",
|
|
"score": 0.012885154061624651,
|
|
"excerpt": "import re\nimport sqlite3\nfrom pathlib import Path\nfrom typing import Any, Dict, List, Optional, Tuple\n\ntry:\n from codexlens.parsers.treesitter_parser import TreeSitterSymbolParser\nexcept Exception:...",
|
|
"content": "import re\nimport sqlite3\nfrom pathlib import Path\nfrom typing import Any, Dict, List, Optional, Tuple\n\ntry:\n from codexlens.parsers.treesitter_parser import TreeSitterSymbolParser\nexcept Exception: \n TreeSitterSymbolParser = None \n\n\nclass SymbolExtractor:\n\n \n PATTERNS = {\n 'python': {\n 'function': r'^(?:async\\s+)?def\\s+(\\w+)\\s*\\(',\n 'class': r'^class\\s+(\\w+)\\s*[:\\(]',\n 'import': r'^(?:from\\s+([\\w.]+)\\s+)?import\\s+([\\w.,\\s]+)',\n 'call': r'(?<![.\\w])(\\w+)\\s*\\(',\n },\n 'typescript': {\n 'function': r'(?:export\\s+)?(?:async\\s+)?function\\s+(\\w+)\\s*[<\\(]',\n 'class': r'(?:export\\s+)?class\\s+(\\w+)',\n 'import': r\"import\\s+.*\\s+from\\s+['\\\"]([^'\\\"]+)['\\\"]\",\n 'call': r'(?<![.\\w])(\\w+)\\s*[<\\(]',\n },\n 'javascript': {\n 'function': r'(?:export\\s+)?(?:async\\s+)?function\\s+(\\w+)\\s*\\(',\n 'class': r'(?:export\\s+)?class\\s+(\\w+)',\n 'import': r\"(?:import|require)\\s*\\(?['\\\"]([^'\\\"]+)['\\\"]\",\n 'call': r'(?<![.\\w])(\\w+)\\s*\\(',\n }\n }\n\n LANGUAGE_MAP = {\n '.py': 'python',\n '.ts': 'typescript',\n '.tsx': 'typescript',\n '.js': 'javascript',\n '.jsx': 'javascript',\n }\n\n def __init__(self, db_path: Path):\n self.db_path = db_path\n self.db_conn: Optional[sqlite3.Connection] = None\n\n",
|
|
"source": null,
|
|
"symbol": null
|
|
},
|
|
{
|
|
"path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\search\\enrichment.py",
|
|
"score": 0.009107468123861567,
|
|
"excerpt": "from typing import List, Dict, Any, Optional\n\nfrom codexlens.config import Config\nfrom codexlens.entities import SearchResult\nfrom codexlens.search.graph_expander import GraphExpander",
|
|
"content": "# codex-lens/src/codexlens/search/enrichment.py\n\"\"\"Relationship enrichment for search results.\"\"\"\nimport sqlite3\nfrom pathlib import Path\nfrom typing import List, Dict, Any, Optional\n\nfrom codexlens.config import Config\nfrom codexlens.entities import SearchResult\nfrom codexlens.search.graph_expander import GraphExpander\nfrom codexlens.storage.path_mapper import PathMapper\n\n\nclass RelationshipEnricher:\n \"\"\"Enriches search results with code graph relationships.\"\"\"\n\n def __init__(self, index_path: Path):\n \"\"\"Initialize with path to index database.",
|
|
"source": null,
|
|
"symbol": null
|
|
},
|
|
{
|
|
"path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\search\\graph_expander.py",
|
|
"score": 0.008960573476702509,
|
|
"excerpt": "from typing import Dict, List, Optional, Sequence, Tuple\n\nfrom codexlens.config import Config\nfrom codexlens.entities import SearchResult\nfrom codexlens.storage.path_mapper import PathMapper",
|
|
"content": "precomputed N-hop neighbors stored in the per-directory index databases.\n\"\"\"\n\nfrom __future__ import annotations\n\nimport logging\nimport sqlite3\nfrom pathlib import Path\nfrom typing import Dict, List, Optional, Sequence, Tuple\n\nfrom codexlens.config import Config\nfrom codexlens.entities import SearchResult\nfrom codexlens.storage.path_mapper import PathMapper\n\nlogger = logging.getLogger(__name__)\n\n\ndef _result_key(result: SearchResult) -> Tuple[str, Optional[str], Optional[int], Optional[int]]:\n return (result.path, result.symbol_name, result.start_line, result.end_line)\n\n",
|
|
"source": null,
|
|
"symbol": null
|
|
},
|
|
{
|
|
"path": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\search\\clustering\\hdbscan_strategy.py",
|
|
"score": 0.008680555555555556,
|
|
"excerpt": "from typing import TYPE_CHECKING, List, Optional\n\nfrom .base import BaseClusteringStrategy, ClusteringConfig\n\nif TYPE_CHECKING:",
|
|
"content": "\"\"\"HDBSCAN-based clustering strategy for search results.\n\nHDBSCAN (Hierarchical Density-Based Spatial Clustering of Applications with Noise)\nis the primary clustering strategy for grouping similar search results.\n\"\"\"\n\nfrom __future__ import annotations\n\nfrom typing import TYPE_CHECKING, List, Optional\n\nfrom .base import BaseClusteringStrategy, ClusteringConfig\n\nif TYPE_CHECKING:\n import numpy as np\n from codexlens.entities import SearchResult\n\n\nclass HDBSCANStrategy(BaseClusteringStrategy):\n \"\"\"HDBSCAN-based clustering strategy.\n\n Uses HDBSCAN algorithm to cluster search results based on embedding similarity.",
|
|
"source": null,
|
|
"symbol": null
|
|
}
|
|
],
|
|
"stats": {
|
|
"dirs_searched": 17,
|
|
"files_matched": 50,
|
|
"time_ms": 7219.313144683838
|
|
}
|
|
}
|
|
}
|