feat: Add API indexer and enhance embedding management

- Add new API indexer script for document processing
- Update embedding manager with improved functionality
- Remove old cache files and update dependencies
- Modify workflow execute documentation

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
catlog22
2025-09-23 19:40:22 +08:00
parent 984fa3a4f3
commit 410d0efd7b
8 changed files with 506 additions and 337 deletions

View File

@@ -0,0 +1,141 @@
#!/usr/bin/env python3
"""
API Documentation Indexer
Parses Markdown documentation to create a searchable index of classes and methods.
"""
import os
import re
import json
import logging
from pathlib import Path
from typing import Dict, Any
from core.file_indexer import FileIndexer
class ApiIndexer:
    """Builds and queries a JSON index of classes/methods parsed from Markdown docs."""

    def __init__(self, config: Dict, root_path: str = "."):
        """
        Args:
            config: Project configuration dict (passed through to FileIndexer).
            root_path: Root directory the file index is built from.
        """
        self.config = config
        self.root_path = Path(root_path).resolve()
        self.file_indexer = FileIndexer(config, root_path)
        # The API index lives alongside the FileIndexer cache.
        self.api_index_file = self.file_indexer.cache_dir / "api_index.json"
        self.logger = logging.getLogger(__name__)

    def build_index(self):
        """Builds the API index from Markdown files and saves it to disk."""
        self.logger.info("Building API index...")
        file_index = self.file_indexer.load_index()
        if not file_index:
            self.logger.info("File index not found, building it first.")
            self.file_indexer.build_index()
            file_index = self.file_indexer.load_index()
        api_index = {}
        for file_info in file_index.values():
            if file_info.extension != ".md":
                continue
            self.logger.debug(f"Parsing {file_info.path}")
            try:
                with open(file_info.path, "r", encoding="utf-8") as f:
                    content = f.read()
                self._parse_markdown(content, file_info.relative_path, api_index)
            except Exception as e:
                # Best-effort: one unreadable file must not abort the whole build.
                self.logger.error(f"Error parsing {file_info.path}: {e}")
        self._save_index(api_index)
        self.logger.info(f"API index built with {len(api_index)} classes.")

    def _parse_markdown(self, content: str, file_path: str, api_index: Dict):
        """Parses a single Markdown file for class and method info.

        Expected layout: one top-level ``# ClassName`` heading, optional
        ``**Description:** ...`` lines, and one ``### `signature` `` section
        per method. Results are written into *api_index* keyed by class name.
        """
        class_name_match = re.search(r"^#\s+([A-Za-z0-9_]+)", content)
        if not class_name_match:
            return
        class_name = class_name_match.group(1)
        api_index[class_name] = {
            "file_path": file_path,
            "description": "",
            "methods": {}
        }
        # Simple description extraction (first **Description:** line in the file).
        desc_match = re.search(r"\*\*Description:\*\*\s*(.+)", content)
        if desc_match:
            api_index[class_name]["description"] = desc_match.group(1).strip()
        # Each "### " heading starts a method section.
        for section in re.split(r"###\s+", content)[1:]:
            method_signature_match = re.search(r"`(.+?)`", section)
            if not method_signature_match:
                continue
            signature = method_signature_match.group(1)
            # FIX: the regex was closed with a garbled curly quote (r"...\(“),
            # which made this file a SyntaxError.
            method_name_match = re.search(r"([A-Za-z0-9_]+)\(", signature)
            if not method_name_match:
                continue
            method_name = method_name_match.group(1)
            method_description = ""
            method_desc_match = re.search(r"\*\*Description:\*\*\s*(.+)", section)
            if method_desc_match:
                method_description = method_desc_match.group(1).strip()
            # Approximate line number of the method heading. Guard against
            # find() returning -1 (heading formatted differently than expected),
            # which would otherwise yield a bogus count over content[:-1].
            pos = content.find(f"### `{signature}`")
            line_number = content.count("\n", 0, pos) + 1 if pos >= 0 else 1
            # FIX: the subscript was corrupted by pasted UI text
            # ('["methods"Показать больше]'); index by method name.
            api_index[class_name]["methods"][method_name] = {
                "signature": signature,
                "description": method_description,
                "line_number": line_number
            }

    def _save_index(self, api_index: Dict):
        """Saves the API index to ``self.api_index_file`` as pretty-printed JSON."""
        try:
            with open(self.api_index_file, "w", encoding="utf-8") as f:
                json.dump(api_index, f, indent=2)
        except IOError as e:
            self.logger.error(f"Could not save API index: {e}")

    def search(self, class_name: str, method_name: str = None) -> Any:
        """Searches the API index for a class or method.

        Builds the index on first use. Returns the class entry dict, a single
        method entry dict, or None when the class (or method) is not indexed.
        """
        if not self.api_index_file.exists():
            self.build_index()
        with open(self.api_index_file, "r", encoding="utf-8") as f:
            api_index = json.load(f)
        if class_name not in api_index:
            return None
        if method_name:
            return api_index[class_name]["methods"].get(method_name)
        return api_index[class_name]
if __name__ == "__main__":
from core.config import get_config
import argparse
logging.basicConfig(level=logging.INFO)
parser = argparse.ArgumentParser(description="API Documentation Indexer.")
parser.add_argument("--build", action="store_true", help="Build the API index.")
parser.add_argument("--search_class", help="Search for a class.")
parser.add_argument("--search_method", help="Search for a method within a class (requires --search_class).")
args = parser.parse_args()
config = get_config()
api_indexer = ApiIndexer(config.to_dict())
if args.build:
api_indexer.build_index()
if args.search_class:
result = api_indexer.search(args.search_class, args.search_method)
if result:
print(json.dumps(result, indent=2))
else:
print("Not found.")

View File

@@ -1,156 +0,0 @@
{
"analyzer.py": {
"file_path": "analyzer.py",
"content_hash": "9a7665c34d5ac84634342f8b1425bb13",
"embedding_hash": "fb5b5a58ec8e070620747c7313b0b2b6",
"created_time": 1758175163.6748724,
"vector_size": 384
},
"config.yaml": {
"file_path": "config.yaml",
"content_hash": "fc0526eea28cf37d15425035d2dd17d9",
"embedding_hash": "4866d8bd2b14c16c448c34c0251d199e",
"created_time": 1758175163.6748896,
"vector_size": 384
},
"install.sh": {
"file_path": "install.sh",
"content_hash": "6649df913eadef34fa2f253aed541dfd",
"embedding_hash": "54af072da7c1139108c79b64bd1ee291",
"created_time": 1758175163.6748998,
"vector_size": 384
},
"requirements.txt": {
"file_path": "requirements.txt",
"content_hash": "e981a0aa103bdec4a99b75831967766d",
"embedding_hash": "37bc877ea041ad606234262423cf578a",
"created_time": 1758175163.6749053,
"vector_size": 384
},
"setup.py": {
"file_path": "setup.py",
"content_hash": "7b93af473bfe37284c6cf493458bc421",
"embedding_hash": "bdda9a6e8d3bd34465436b119a17e263",
"created_time": 1758175163.6749127,
"vector_size": 384
},
"__init__.py": {
"file_path": "__init__.py",
"content_hash": "c981c4ffc664bbd3c253d0dc82f48ac6",
"embedding_hash": "3ab1a0c5d0d4bd832108b7a6ade0ad9c",
"created_time": 1758175163.6749194,
"vector_size": 384
},
"cache\\file_index.json": {
"file_path": "cache\\file_index.json",
"content_hash": "6534fef14d12e39aff1dc0dcf5b91d1d",
"embedding_hash": "d76efa530f0d21e52f9d5b3a9ccc358c",
"created_time": 1758175163.6749268,
"vector_size": 384
},
"core\\config.py": {
"file_path": "core\\config.py",
"content_hash": "ee72a95cea7397db8dd25b10a4436eaa",
"embedding_hash": "65d1fca1cf59bcd36409c3b11f50aab1",
"created_time": 1758175163.6749349,
"vector_size": 384
},
"core\\context_analyzer.py": {
"file_path": "core\\context_analyzer.py",
"content_hash": "2e9ac2050e463c9d3f94bad23e65d4e5",
"embedding_hash": "dfb51c8eaafd96ac544b3d9c8dcd3f51",
"created_time": 1758175163.674943,
"vector_size": 384
},
"core\\embedding_manager.py": {
"file_path": "core\\embedding_manager.py",
"content_hash": "cafa24b0431c6463266dde8b37fc3ab7",
"embedding_hash": "531c3206f0caf9789873719cdd644e99",
"created_time": 1758175163.6749508,
"vector_size": 384
},
"core\\file_indexer.py": {
"file_path": "core\\file_indexer.py",
"content_hash": "0626c89c060d6022261ca094aed47093",
"embedding_hash": "93d5fc6e84334d3bd9be0f07f9823b20",
"created_time": 1758175163.6749592,
"vector_size": 384
},
"core\\gitignore_parser.py": {
"file_path": "core\\gitignore_parser.py",
"content_hash": "5f1d87fb03bc3b19833406be0fa5125f",
"embedding_hash": "784be673b6b428cce60ab5390bfc7f08",
"created_time": 1758175163.6749675,
"vector_size": 384
},
"core\\path_matcher.py": {
"file_path": "core\\path_matcher.py",
"content_hash": "89132273951a091610c1579ccc44f3a7",
"embedding_hash": "e01ca0180c2834a514ad6d8e62315ce0",
"created_time": 1758175163.6749754,
"vector_size": 384
},
"core\\__init__.py": {
"file_path": "core\\__init__.py",
"content_hash": "3a323be141f1ce6b9d9047aa444029b0",
"embedding_hash": "3fc5a5427067e59b054428083a5899ca",
"created_time": 1758175163.6749818,
"vector_size": 384
},
"tools\\module_analyzer.py": {
"file_path": "tools\\module_analyzer.py",
"content_hash": "926289c2fd8d681ed20c445d2ac34fa1",
"embedding_hash": "3378fcde062914859b765d8dfce1207f",
"created_time": 1758175163.67499,
"vector_size": 384
},
"tools\\tech_stack.py": {
"file_path": "tools\\tech_stack.py",
"content_hash": "eef6eabcbc8ba0ece0dfacb9314f3585",
"embedding_hash": "bc3aa5334ef17328490bc5a8162d776a",
"created_time": 1758175163.674997,
"vector_size": 384
},
"tools\\workflow_updater.py": {
"file_path": "tools\\workflow_updater.py",
"content_hash": "40d7d884e0db24eb45aa27739fef8210",
"embedding_hash": "00488f4acdb7fe1b5126da4da3bb9869",
"created_time": 1758175163.6750047,
"vector_size": 384
},
"tools\\__init__.py": {
"file_path": "tools\\__init__.py",
"content_hash": "41bf583571f4355e4af90842d0674b1f",
"embedding_hash": "fccd7745f9e1e242df3bace7cee9759c",
"created_time": 1758175163.6750097,
"vector_size": 384
},
"utils\\cache.py": {
"file_path": "utils\\cache.py",
"content_hash": "dc7c08bcd9af9ae465020997e4b9127e",
"embedding_hash": "68394bc0f57a0f66b83a57249b39957d",
"created_time": 1758175163.6750169,
"vector_size": 384
},
"utils\\colors.py": {
"file_path": "utils\\colors.py",
"content_hash": "8ce555a2dcf4057ee7adfb3286d47da2",
"embedding_hash": "1b18e22acb095e83ed291b6c5dc7a2ce",
"created_time": 1758175163.6750243,
"vector_size": 384
},
"utils\\io_helpers.py": {
"file_path": "utils\\io_helpers.py",
"content_hash": "fb276a0e46b28f80d5684368a8b15e57",
"embedding_hash": "f6ff8333b1afc5b98d4644f334c18cda",
"created_time": 1758175163.6750326,
"vector_size": 384
},
"utils\\__init__.py": {
"file_path": "utils\\__init__.py",
"content_hash": "f305ede9cbdec2f2e0189a4b89558b7e",
"embedding_hash": "7d3f10fe4210d40eafd3c065b8e0c8b7",
"created_time": 1758175163.6750393,
"vector_size": 384
}
}

Binary file not shown.

View File

@@ -66,11 +66,12 @@ file_extensions:
# Embedding/RAG configuration
embedding:
enabled: true # Set to true to enable RAG features
model: "all-MiniLM-L6-v2" # Lightweight sentence transformer
model: "codesage/codesage-large-v2" # CodeSage V2 for code embeddings
cache_dir: "cache"
similarity_threshold: 0.3
max_context_length: 512
batch_size: 32
similarity_threshold: 0.6 # Higher threshold for better code similarity
max_context_length: 2048 # Increased for CodeSage V2 capabilities
batch_size: 8 # Reduced for larger model
trust_remote_code: true # Required for CodeSage V2
# Context analysis settings
context_analysis:

View File

@@ -75,6 +75,7 @@ class EmbeddingManager:
self.similarity_threshold = config.get('embedding', {}).get('similarity_threshold', 0.6)
self.max_context_length = config.get('embedding', {}).get('max_context_length', 512)
self.batch_size = config.get('embedding', {}).get('batch_size', 32)
self.trust_remote_code = config.get('embedding', {}).get('trust_remote_code', False)
# Setup cache directories
self.cache_dir.mkdir(parents=True, exist_ok=True)
@@ -95,7 +96,11 @@ class EmbeddingManager:
if self._model is None:
try:
self.logger.info(f"Loading embedding model: {self.model_name}")
self._model = SentenceTransformer(self.model_name)
# Initialize with trust_remote_code for CodeSage V2
if self.trust_remote_code:
self._model = SentenceTransformer(self.model_name, trust_remote_code=True)
else:
self._model = SentenceTransformer(self.model_name)
self.logger.info(f"Model loaded successfully")
except Exception as e:
self.logger.error(f"Failed to load embedding model: {e}")
@@ -203,7 +208,7 @@ class EmbeddingManager:
with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
content = f.read()
# Truncate content if too long
# Truncate content if too long (CodeSage V2 supports longer contexts)
if len(content) > self.max_context_length * 4: # Approximate token limit
content = content[:self.max_context_length * 4]

View File

@@ -2,14 +2,18 @@
numpy>=1.21.0
scikit-learn>=1.0.0
# Sentence Transformers for advanced embeddings
sentence-transformers>=2.2.0
# Sentence Transformers for advanced embeddings (CodeSage V2 compatible)
sentence-transformers>=3.0.0
transformers>=4.40.0
# Optional: For better performance and additional models
torch>=1.9.0
# PyTorch for model execution (required for CodeSage V2)
torch>=2.0.0
# Development and testing
pytest>=6.0.0
# Data handling
pandas>=1.3.0
# Additional dependencies for CodeSage V2
accelerate>=0.26.0