feat: Add API indexer and enhance embedding management

- Add new API indexer script for document processing
- Update embedding manager with improved functionality
- Remove old cache files and update dependencies
- Modify workflow execute documentation

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2026-02-08 02:14:08 +08:00 · 2025-09-23 19:40:22 +08:00
parent 984fa3a4f3
commit 410d0efd7b
8 changed files with 506 additions and 337 deletions
--- a/.claude/python_script/core/pycache/embedding_manager.cpython-313.pyc
+++ b/.claude/python_script/core/pycache/embedding_manager.cpython-313.pyc
--- a/.claude/python_script/core/embedding_manager.py
+++ b/.claude/python_script/core/embedding_manager.py
@@ -75,6 +75,7 @@ class EmbeddingManager:
        self.similarity_threshold = config.get('embedding', {}).get('similarity_threshold', 0.6)
        self.max_context_length = config.get('embedding', {}).get('max_context_length', 512)
        self.batch_size = config.get('embedding', {}).get('batch_size', 32)
+        self.trust_remote_code = config.get('embedding', {}).get('trust_remote_code', False)

        # Setup cache directories
        self.cache_dir.mkdir(parents=True, exist_ok=True)
@@ -95,7 +96,11 @@ class EmbeddingManager:
        if self._model is None:
            try:
                self.logger.info(f"Loading embedding model: {self.model_name}")
-                self._model = SentenceTransformer(self.model_name)
+                # Initialize with trust_remote_code for CodeSage V2
+                if self.trust_remote_code:
+                    self._model = SentenceTransformer(self.model_name, trust_remote_code=True)
+                else:
+                    self._model = SentenceTransformer(self.model_name)
                self.logger.info(f"Model loaded successfully")
            except Exception as e:
                self.logger.error(f"Failed to load embedding model: {e}")
@@ -203,7 +208,7 @@ class EmbeddingManager:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()

-            # Truncate content if too long
+            # Truncate content if too long (CodeSage V2 supports longer contexts)
            if len(content) > self.max_context_length * 4:  # Approximate token limit
                content = content[:self.max_context_length * 4]