Files
Claude-Code-Workflow/codex-lens/src/codexlens/config.py
catlog22 e21d801523 feat: Add multi-type embedding backends for cascade retrieval
- Implemented BinaryEmbeddingBackend for fast coarse filtering using 256-dimensional binary vectors.
- Developed DenseEmbeddingBackend for high-precision dense vectors (2048 dimensions) for reranking.
- Created CascadeEmbeddingBackend to combine binary and dense embeddings for two-stage retrieval.
- Introduced utility functions for embedding conversion and distance computation.

chore: Migration 010 - Add multi-vector storage support

- Added 'chunks' table to support multi-vector embeddings for cascade retrieval.
- Included new columns: embedding_binary (256-dim) and embedding_dense (2048-dim) for efficient storage.
- Implemented upgrade and downgrade functions to manage schema changes and data migration.
2026-01-02 10:52:43 +08:00

402 lines
15 KiB
Python

"""Configuration system for CodexLens."""
from __future__ import annotations
import json
import logging
import os
from dataclasses import dataclass, field
from functools import cached_property
from pathlib import Path
from typing import Any, Dict, List, Optional
from .errors import ConfigError
# Workspace-local directory name
WORKSPACE_DIR_NAME = ".codexlens"
# Settings file name
SETTINGS_FILE_NAME = "settings.json"
log = logging.getLogger(__name__)
def _default_global_dir() -> Path:
    """Resolve the base directory for global CodexLens data.

    Honors the ``CODEXLENS_DATA_DIR`` environment variable when set;
    otherwise falls back to ``~/.codexlens``.
    """
    override = os.getenv("CODEXLENS_DATA_DIR")
    base = Path(override) if override else Path.home() / ".codexlens"
    return base.expanduser().resolve()
def find_workspace_root(start_path: Path) -> Optional[Path]:
    """Locate the nearest ancestor directory containing a .codexlens folder.

    Walks upward from *start_path* through every parent up to and including
    the filesystem root, returning the first directory that holds an
    existing ``.codexlens`` directory. Returns ``None`` when no workspace
    marker is found anywhere on the path.
    """
    resolved = start_path.resolve()
    # resolved.parents already ends at the filesystem root, so this covers
    # the start directory, every ancestor, and the root itself.
    for candidate in (resolved, *resolved.parents):
        if (candidate / WORKSPACE_DIR_NAME).is_dir():
            return candidate
    return None
@dataclass
class Config:
    """Runtime configuration for CodexLens.

    Fields are grouped by feature area; most have in-code defaults, and a
    subset (embedding + LLM sections) can be persisted to and restored from
    a settings.json file via save_settings()/load_settings().

    - data_dir: Base directory for all persistent CodexLens data.
    - venv_path: Optional virtualenv used for language tooling.
    - supported_languages: Language IDs and their associated file extensions.
    - parsing_rules: Per-language parsing and chunking hints.
    """

    # Paths are expanded/resolved (and data_dir created) in __post_init__.
    data_dir: Path = field(default_factory=_default_global_dir)
    venv_path: Path = field(default_factory=lambda: _default_global_dir() / "venv")
    # Maps language ID -> {"extensions": [...], "tree_sitter_language": name or None}.
    # A None tree_sitter_language means no tree-sitter grammar is used for it.
    supported_languages: Dict[str, Dict[str, Any]] = field(
        default_factory=lambda: {
            "python": {"extensions": [".py"], "tree_sitter_language": "python"},
            "javascript": {"extensions": [".js", ".jsx"], "tree_sitter_language": "javascript"},
            "typescript": {"extensions": [".ts", ".tsx"], "tree_sitter_language": "typescript"},
            "java": {"extensions": [".java"], "tree_sitter_language": "java"},
            "go": {"extensions": [".go"], "tree_sitter_language": "go"},
            "zig": {"extensions": [".zig"], "tree_sitter_language": "zig"},
            "objective-c": {"extensions": [".m", ".mm"], "tree_sitter_language": "objc"},
            "markdown": {"extensions": [".md", ".mdx"], "tree_sitter_language": None},
            "text": {"extensions": [".txt"], "tree_sitter_language": None},
        }
    )
    # Per-language chunking limits; the "default" entry is the fallback and is
    # merged key-by-key with language entries (see rules_for_language).
    parsing_rules: Dict[str, Dict[str, Any]] = field(
        default_factory=lambda: {
            "default": {
                "max_chunk_chars": 4000,
                "max_chunk_lines": 200,
                "overlap_lines": 20,
            }
        }
    )
    # LLM-assisted processing (persisted via save_settings/load_settings)
    llm_enabled: bool = False
    llm_tool: str = "gemini"
    llm_timeout_ms: int = 300000
    llm_batch_size: int = 5

    # Hybrid chunker configuration
    hybrid_max_chunk_size: int = 2000  # Max characters per chunk before LLM refinement
    hybrid_llm_refinement: bool = False  # Enable LLM-based semantic boundary refinement

    # Embedding configuration
    embedding_backend: str = "fastembed"  # "fastembed" (local) or "litellm" (API)
    embedding_model: str = "code"  # For fastembed: profile (fast/code/multilingual/balanced)
    # For litellm: model name from config (e.g., "qwen3-embedding")
    embedding_use_gpu: bool = True  # For fastembed: whether to use GPU acceleration

    # SPLADE sparse retrieval configuration
    enable_splade: bool = True  # Enable SPLADE as default sparse backend
    splade_model: str = "naver/splade-cocondenser-ensembledistil"
    splade_threshold: float = 0.01  # Min weight to store in index
    splade_onnx_path: Optional[str] = None  # Custom ONNX model path

    # FTS fallback (disabled by default, available via --use-fts)
    use_fts_fallback: bool = False  # Use FTS instead of SPLADE

    # Indexing/search optimizations
    global_symbol_index_enabled: bool = True  # Enable project-wide symbol index fast path
    enable_merkle_detection: bool = True  # Enable content-hash based incremental indexing

    # Graph expansion (search-time, uses precomputed neighbors)
    enable_graph_expansion: bool = False
    graph_expansion_depth: int = 2

    # Optional search reranking (disabled by default)
    enable_reranking: bool = False
    reranking_top_k: int = 50
    symbol_boost_factor: float = 1.5

    # Optional cross-encoder reranking (second stage; requires optional reranker deps)
    enable_cross_encoder_rerank: bool = False
    reranker_backend: str = "onnx"
    reranker_model: str = "cross-encoder/ms-marco-MiniLM-L-6-v2"
    reranker_top_k: int = 50

    # Cascade search configuration (two-stage retrieval)
    enable_cascade_search: bool = False  # Enable cascade search (coarse + fine ranking)
    cascade_coarse_k: int = 100  # Number of coarse candidates from first stage
    cascade_fine_k: int = 10  # Number of final results after reranking
    cascade_strategy: str = "binary"  # "binary" (fast binary+dense) or "hybrid" (FTS+SPLADE+Vector+CrossEncoder)

    # RRF fusion configuration
    fusion_method: str = "rrf"  # "simple" (weighted sum) or "rrf" (reciprocal rank fusion)
    rrf_k: int = 60  # RRF constant (default 60)

    # Multi-endpoint configuration for litellm backend
    embedding_endpoints: List[Dict[str, Any]] = field(default_factory=list)
    # List of endpoint configs: [{"model": "...", "api_key": "...", "api_base": "...", "weight": 1.0}]
    embedding_strategy: str = "latency_aware"  # round_robin, latency_aware, weighted_random
    embedding_cooldown: float = 60.0  # Default cooldown seconds for rate-limited endpoints

    def __post_init__(self) -> None:
        """Expand and resolve paths, then ensure data_dir exists.

        Raises:
            ConfigError: If path resolution or data_dir creation fails.
        """
        try:
            self.data_dir = self.data_dir.expanduser().resolve()
            self.venv_path = self.venv_path.expanduser().resolve()
            self.data_dir.mkdir(parents=True, exist_ok=True)
        except PermissionError as exc:
            raise ConfigError(
                f"Permission denied initializing paths (data_dir={self.data_dir}, venv_path={self.venv_path}) "
                f"[{type(exc).__name__}]: {exc}"
            ) from exc
        except OSError as exc:
            raise ConfigError(
                f"Filesystem error initializing paths (data_dir={self.data_dir}, venv_path={self.venv_path}) "
                f"[{type(exc).__name__}]: {exc}"
            ) from exc
        except Exception as exc:
            # Catch-all so any unexpected failure still surfaces as ConfigError
            raise ConfigError(
                f"Unexpected error initializing paths (data_dir={self.data_dir}, venv_path={self.venv_path}) "
                f"[{type(exc).__name__}]: {exc}"
            ) from exc

    @cached_property
    def cache_dir(self) -> Path:
        """Directory for transient caches."""
        return self.data_dir / "cache"

    @cached_property
    def index_dir(self) -> Path:
        """Directory where index artifacts are stored."""
        return self.data_dir / "index"

    @cached_property
    def db_path(self) -> Path:
        """Default SQLite index path."""
        return self.index_dir / "codexlens.db"

    def ensure_runtime_dirs(self) -> None:
        """Create standard runtime directories if missing.

        Raises:
            ConfigError: If a runtime directory cannot be created.
        """
        for directory in (self.cache_dir, self.index_dir):
            try:
                directory.mkdir(parents=True, exist_ok=True)
            except PermissionError as exc:
                raise ConfigError(
                    f"Permission denied creating directory {directory} [{type(exc).__name__}]: {exc}"
                ) from exc
            except OSError as exc:
                raise ConfigError(
                    f"Filesystem error creating directory {directory} [{type(exc).__name__}]: {exc}"
                ) from exc
            except Exception as exc:
                raise ConfigError(
                    f"Unexpected error creating directory {directory} [{type(exc).__name__}]: {exc}"
                ) from exc

    def language_for_path(self, path: str | Path) -> str | None:
        """Infer a supported language ID from a file path.

        Matches the (lowercased) file extension against supported_languages;
        returns None when no configured language claims the extension.
        """
        extension = Path(path).suffix.lower()
        for language_id, spec in self.supported_languages.items():
            extensions: List[str] = spec.get("extensions", [])
            if extension in extensions:
                return language_id
        return None

    def rules_for_language(self, language_id: str) -> Dict[str, Any]:
        """Get parsing rules for a specific language, falling back to defaults.

        Language-specific keys override the "default" entry key-by-key.
        """
        return {**self.parsing_rules.get("default", {}), **self.parsing_rules.get(language_id, {})}

    @cached_property
    def settings_path(self) -> Path:
        """Path to the settings file."""
        return self.data_dir / SETTINGS_FILE_NAME

    def save_settings(self) -> None:
        """Save embedding and other settings to file.

        Only the embedding and LLM sections are persisted; all other fields
        keep their in-code defaults on the next load.
        """
        embedding_config = {
            "backend": self.embedding_backend,
            "model": self.embedding_model,
            "use_gpu": self.embedding_use_gpu,
        }
        # Include multi-endpoint config if present
        if self.embedding_endpoints:
            embedding_config["endpoints"] = self.embedding_endpoints
            embedding_config["strategy"] = self.embedding_strategy
            embedding_config["cooldown"] = self.embedding_cooldown
        settings = {
            "embedding": embedding_config,
            "llm": {
                "enabled": self.llm_enabled,
                "tool": self.llm_tool,
                "timeout_ms": self.llm_timeout_ms,
                "batch_size": self.llm_batch_size,
            },
        }
        with open(self.settings_path, "w", encoding="utf-8") as f:
            json.dump(settings, f, indent=2)

    def load_settings(self) -> None:
        """Load settings from file if exists.

        Unknown keys are ignored, an invalid backend value is logged and
        skipped, and any I/O or JSON failure is logged (not raised) so that
        a corrupt settings file never prevents startup.
        """
        if not self.settings_path.exists():
            return
        try:
            with open(self.settings_path, "r", encoding="utf-8") as f:
                settings = json.load(f)
            # Load embedding settings
            embedding = settings.get("embedding", {})
            if "backend" in embedding:
                backend = embedding["backend"]
                if backend in {"fastembed", "litellm"}:
                    self.embedding_backend = backend
                else:
                    log.warning(
                        "Invalid embedding backend in %s: %r (expected 'fastembed' or 'litellm')",
                        self.settings_path,
                        backend,
                    )
            if "model" in embedding:
                self.embedding_model = embedding["model"]
            if "use_gpu" in embedding:
                self.embedding_use_gpu = embedding["use_gpu"]
            # Load multi-endpoint configuration
            if "endpoints" in embedding:
                self.embedding_endpoints = embedding["endpoints"]
            if "strategy" in embedding:
                self.embedding_strategy = embedding["strategy"]
            if "cooldown" in embedding:
                self.embedding_cooldown = embedding["cooldown"]
            # Load LLM settings
            llm = settings.get("llm", {})
            if "enabled" in llm:
                self.llm_enabled = llm["enabled"]
            if "tool" in llm:
                self.llm_tool = llm["tool"]
            if "timeout_ms" in llm:
                self.llm_timeout_ms = llm["timeout_ms"]
            if "batch_size" in llm:
                self.llm_batch_size = llm["batch_size"]
        except Exception as exc:
            # Best-effort load: never let a bad settings file break startup
            log.warning(
                "Failed to load settings from %s (%s): %s",
                self.settings_path,
                type(exc).__name__,
                exc,
            )

    @classmethod
    def load(cls) -> "Config":
        """Load config with settings from file."""
        config = cls()
        config.load_settings()
        return config
@dataclass
class WorkspaceConfig:
    """Workspace-local configuration for CodexLens.

    Stores index data in project/.codexlens/ directory, in contrast to
    Config, which uses a global per-user data directory.
    """

    # Project root directory; normalized to an absolute path in __post_init__.
    workspace_root: Path

    def __post_init__(self) -> None:
        # Resolve early so all derived paths below are stable absolutes.
        self.workspace_root = Path(self.workspace_root).resolve()

    @property
    def codexlens_dir(self) -> Path:
        """The .codexlens directory in workspace root."""
        return self.workspace_root / WORKSPACE_DIR_NAME

    @property
    def db_path(self) -> Path:
        """SQLite index path for this workspace."""
        return self.codexlens_dir / "index.db"

    @property
    def cache_dir(self) -> Path:
        """Cache directory for this workspace."""
        return self.codexlens_dir / "cache"

    @property
    def env_path(self) -> Path:
        """Path to workspace .env file."""
        return self.codexlens_dir / ".env"

    def load_env(self, *, override: bool = False) -> int:
        """Load .env file and apply to os.environ.

        Args:
            override: If True, override existing environment variables.

        Returns:
            Number of variables applied.
        """
        from .env_config import apply_workspace_env
        return apply_workspace_env(self.workspace_root, override=override)

    def get_api_config(self, prefix: str) -> dict:
        """Get API configuration from environment.

        Args:
            prefix: Environment variable prefix (e.g., "RERANKER", "EMBEDDING").

        Returns:
            Dictionary with api_key, api_base, model, etc.
        """
        from .env_config import get_api_config
        return get_api_config(prefix, workspace_root=self.workspace_root)

    def initialize(self) -> None:
        """Create the .codexlens directory structure.

        Idempotent: existing directories and an existing .gitignore are
        left untouched.

        Raises:
            ConfigError: If the directories or .gitignore cannot be created.
        """
        try:
            self.codexlens_dir.mkdir(parents=True, exist_ok=True)
            self.cache_dir.mkdir(parents=True, exist_ok=True)
            # Create .gitignore to exclude cache but keep index
            gitignore_path = self.codexlens_dir / ".gitignore"
            if not gitignore_path.exists():
                gitignore_path.write_text(
                    "# CodexLens workspace data\n"
                    "cache/\n"
                    "*.log\n"
                    ".env\n",  # Exclude .env from git
                    encoding="utf-8",  # explicit encoding, consistent with other file I/O here
                )
        # Distinguish filesystem failures, mirroring Config's error handling.
        except PermissionError as exc:
            raise ConfigError(
                f"Permission denied initializing workspace at {self.codexlens_dir} "
                f"[{type(exc).__name__}]: {exc}"
            ) from exc
        except OSError as exc:
            raise ConfigError(f"Failed to initialize workspace at {self.codexlens_dir}: {exc}") from exc
        except Exception as exc:
            # Catch-all kept for backward compatibility: any failure surfaces as ConfigError
            raise ConfigError(f"Failed to initialize workspace at {self.codexlens_dir}: {exc}") from exc

    def exists(self) -> bool:
        """Check if workspace is already initialized (directory AND index present)."""
        return self.codexlens_dir.is_dir() and self.db_path.exists()

    @classmethod
    def from_path(cls, path: Path) -> Optional["WorkspaceConfig"]:
        """Create WorkspaceConfig from a path by finding workspace root.

        Returns None if no workspace found.
        """
        root = find_workspace_root(path)
        if root is None:
            return None
        return cls(workspace_root=root)

    @classmethod
    def create_at(cls, path: Path) -> "WorkspaceConfig":
        """Create a new workspace at the given path."""
        config = cls(workspace_root=path)
        config.initialize()
        return config