From 1998f3ae8a9db9ab0c017c50db7ecec7d3480ed3 Mon Sep 17 00:00:00 2001 From: rhyme Date: Tue, 23 Dec 2025 14:51:08 +0800 Subject: [PATCH] fix(codexlens): correct fastembed 0.7.4 cache path and download trigger - Update cache path to ~/.cache/huggingface (HuggingFace Hub default) - Fix model path format: models--{org}--{model} - Add .embed() call to trigger actual download in download_model() - Ensure cross-platform compatibility (Linux/Windows) --- codex-lens/src/codexlens/cli/model_manager.py | 48 ++++++++++++------- 1 file changed, 30 insertions(+), 18 deletions(-) diff --git a/codex-lens/src/codexlens/cli/model_manager.py b/codex-lens/src/codexlens/cli/model_manager.py index 5369a2d5..7e526b0d 100644 --- a/codex-lens/src/codexlens/cli/model_manager.py +++ b/codex-lens/src/codexlens/cli/model_manager.py @@ -79,36 +79,37 @@ def get_cache_dir() -> Path: """Get fastembed cache directory. Returns: - Path to cache directory (usually ~/.cache/fastembed or %LOCALAPPDATA%\\Temp\\fastembed_cache) + Path to cache directory (~/.cache/huggingface or custom path) """ # Check HF_HOME environment variable first if "HF_HOME" in os.environ: return Path(os.environ["HF_HOME"]) - # Default cache locations - if os.name == "nt": # Windows - cache_dir = Path(os.environ.get("LOCALAPPDATA", Path.home() / "AppData" / "Local")) / "Temp" / "fastembed_cache" - else: # Unix-like - cache_dir = Path.home() / ".cache" / "fastembed" - - return cache_dir + # fastembed 0.7.4+ uses HuggingFace cache when cache_dir is specified + # Models are stored directly under the cache directory + return Path.home() / ".cache" / "huggingface" def _get_model_cache_path(cache_dir: Path, info: Dict) -> Path: """Get the actual cache path for a model. - fastembed uses ONNX versions of models with different names than the original. - This function returns the correct path based on the cache_name field. + fastembed 0.7.4+ uses HuggingFace Hub's naming convention: + - Prefix: 'models--' + - Replace '/' with '--' in model name + Example: jinaai/jina-embeddings-v2-base-code + -> models--jinaai--jina-embeddings-v2-base-code Args: - cache_dir: The fastembed cache directory + cache_dir: The fastembed cache directory (HuggingFace hub path) info: Model profile info dictionary Returns: Path to the model cache directory """ - cache_name = info.get("cache_name", info["model_name"]) - return cache_dir / f"models--{cache_name.replace('/', '--')}" + # HuggingFace Hub naming: models--{org}--{model} + model_name = info["model_name"] + sanitized_name = f"models--{model_name.replace('/', '--')}" + return cache_dir / sanitized_name def list_models() -> Dict[str, any]: @@ -194,18 +195,29 @@ def download_model(profile: str, progress_callback: Optional[callable] = None) - model_name = info["model_name"] try: - # Download model by instantiating TextEmbedding - # This will automatically download to cache if not present + # Get cache directory + cache_dir = get_cache_dir() + + # Download model by instantiating TextEmbedding with explicit cache_dir + # This ensures fastembed uses the correct HuggingFace Hub cache location if progress_callback: progress_callback(f"Downloading {model_name}...") - embedder = TextEmbedding(model_name=model_name) + # CRITICAL: Must specify cache_dir to use HuggingFace cache + # and call embed() to trigger actual download + embedder = TextEmbedding(model_name=model_name, cache_dir=str(cache_dir)) + + # Trigger actual download by calling embed + # TextEmbedding.__init__ alone doesn't download files + if progress_callback: + progress_callback(f"Initializing {model_name}...") + + list(embedder.embed(["test"])) # Trigger download if progress_callback: progress_callback(f"Model {model_name} downloaded successfully") - # Get cache info using correct cache_name - cache_dir = get_cache_dir() + # Get cache info using correct HuggingFace Hub path model_cache_path = _get_model_cache_path(cache_dir, info) cache_size = 0