feat: Add API indexer and enhance embedding management

- Add new API indexer script for document processing
- Update embedding manager with improved functionality
- Remove old cache files and update dependencies
- Modify workflow execute documentation

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
catlog22
2025-09-23 19:40:22 +08:00
parent 984fa3a4f3
commit 410d0efd7b
8 changed files with 506 additions and 337 deletions

View File

@@ -0,0 +1,141 @@
#!/usr/bin/env python3
"""
API Documentation Indexer
Parses Markdown documentation to create a searchable index of classes and methods.
"""
import os
import re
import json
import logging
from pathlib import Path
from typing import Dict, Any
from core.file_indexer import FileIndexer
class ApiIndexer:
    """Builds and queries a JSON index of classes/methods parsed from Markdown docs."""

    def __init__(self, config: Dict, root_path: str = "."):
        """
        Args:
            config: Project configuration dict (passed through to FileIndexer).
            root_path: Root directory the file index is built from.
        """
        self.config = config
        self.root_path = Path(root_path).resolve()
        self.file_indexer = FileIndexer(config, root_path)
        # The API index lives alongside the FileIndexer cache.
        self.api_index_file = self.file_indexer.cache_dir / "api_index.json"
        self.logger = logging.getLogger(__name__)

    def build_index(self):
        """Builds the API index from Markdown files and saves it to disk."""
        self.logger.info("Building API index...")
        file_index = self.file_indexer.load_index()
        if not file_index:
            self.logger.info("File index not found, building it first.")
            self.file_indexer.build_index()
            file_index = self.file_indexer.load_index()
        api_index = {}
        for file_info in file_index.values():
            if file_info.extension != ".md":
                continue
            self.logger.debug(f"Parsing {file_info.path}")
            try:
                with open(file_info.path, "r", encoding="utf-8") as f:
                    content = f.read()
                self._parse_markdown(content, file_info.relative_path, api_index)
            except Exception as e:
                # Best-effort: one unreadable file must not abort the whole build.
                self.logger.error(f"Error parsing {file_info.path}: {e}")
        self._save_index(api_index)
        self.logger.info(f"API index built with {len(api_index)} classes.")

    def _parse_markdown(self, content: str, file_path: str, api_index: Dict):
        """Parses a single Markdown file for class and method info.

        Expected layout: one top-level ``# ClassName`` heading, optional
        ``**Description:** ...`` lines, and one ``### `signature` `` section
        per method. Results are written into *api_index* keyed by class name.
        """
        class_name_match = re.search(r"^#\s+([A-Za-z0-9_]+)", content)
        if not class_name_match:
            return
        class_name = class_name_match.group(1)
        api_index[class_name] = {
            "file_path": file_path,
            "description": "",
            "methods": {}
        }
        # Simple description extraction (first **Description:** line in the file).
        desc_match = re.search(r"\*\*Description:\*\*\s*(.+)", content)
        if desc_match:
            api_index[class_name]["description"] = desc_match.group(1).strip()
        # Each "### " heading starts a method section.
        for section in re.split(r"###\s+", content)[1:]:
            method_signature_match = re.search(r"`(.+?)`", section)
            if not method_signature_match:
                continue
            signature = method_signature_match.group(1)
            # FIX: the regex was closed with a garbled curly quote (r"...\(“),
            # which made this file a SyntaxError.
            method_name_match = re.search(r"([A-Za-z0-9_]+)\(", signature)
            if not method_name_match:
                continue
            method_name = method_name_match.group(1)
            method_description = ""
            method_desc_match = re.search(r"\*\*Description:\*\*\s*(.+)", section)
            if method_desc_match:
                method_description = method_desc_match.group(1).strip()
            # Approximate line number of the method heading. Guard against
            # find() returning -1 (heading formatted differently than expected),
            # which would otherwise yield a bogus count over content[:-1].
            pos = content.find(f"### `{signature}`")
            line_number = content.count("\n", 0, pos) + 1 if pos >= 0 else 1
            # FIX: the subscript was corrupted by pasted UI text
            # ('["methods"Показать больше]'); index by method name.
            api_index[class_name]["methods"][method_name] = {
                "signature": signature,
                "description": method_description,
                "line_number": line_number
            }

    def _save_index(self, api_index: Dict):
        """Saves the API index to ``self.api_index_file`` as pretty-printed JSON."""
        try:
            with open(self.api_index_file, "w", encoding="utf-8") as f:
                json.dump(api_index, f, indent=2)
        except IOError as e:
            self.logger.error(f"Could not save API index: {e}")

    def search(self, class_name: str, method_name: str = None) -> Any:
        """Searches the API index for a class or method.

        Builds the index on first use. Returns the class entry dict, a single
        method entry dict, or None when the class (or method) is not indexed.
        """
        if not self.api_index_file.exists():
            self.build_index()
        with open(self.api_index_file, "r", encoding="utf-8") as f:
            api_index = json.load(f)
        if class_name not in api_index:
            return None
        if method_name:
            return api_index[class_name]["methods"].get(method_name)
        return api_index[class_name]
if __name__ == "__main__":
from core.config import get_config
import argparse
logging.basicConfig(level=logging.INFO)
parser = argparse.ArgumentParser(description="API Documentation Indexer.")
parser.add_argument("--build", action="store_true", help="Build the API index.")
parser.add_argument("--search_class", help="Search for a class.")
parser.add_argument("--search_method", help="Search for a method within a class (requires --search_class).")
args = parser.parse_args()
config = get_config()
api_indexer = ApiIndexer(config.to_dict())
if args.build:
api_indexer.build_index()
if args.search_class:
result = api_indexer.search(args.search_class, args.search_method)
if result:
print(json.dumps(result, indent=2))
else:
print("Not found.")

View File

@@ -1,156 +0,0 @@
{
"analyzer.py": {
"file_path": "analyzer.py",
"content_hash": "9a7665c34d5ac84634342f8b1425bb13",
"embedding_hash": "fb5b5a58ec8e070620747c7313b0b2b6",
"created_time": 1758175163.6748724,
"vector_size": 384
},
"config.yaml": {
"file_path": "config.yaml",
"content_hash": "fc0526eea28cf37d15425035d2dd17d9",
"embedding_hash": "4866d8bd2b14c16c448c34c0251d199e",
"created_time": 1758175163.6748896,
"vector_size": 384
},
"install.sh": {
"file_path": "install.sh",
"content_hash": "6649df913eadef34fa2f253aed541dfd",
"embedding_hash": "54af072da7c1139108c79b64bd1ee291",
"created_time": 1758175163.6748998,
"vector_size": 384
},
"requirements.txt": {
"file_path": "requirements.txt",
"content_hash": "e981a0aa103bdec4a99b75831967766d",
"embedding_hash": "37bc877ea041ad606234262423cf578a",
"created_time": 1758175163.6749053,
"vector_size": 384
},
"setup.py": {
"file_path": "setup.py",
"content_hash": "7b93af473bfe37284c6cf493458bc421",
"embedding_hash": "bdda9a6e8d3bd34465436b119a17e263",
"created_time": 1758175163.6749127,
"vector_size": 384
},
"__init__.py": {
"file_path": "__init__.py",
"content_hash": "c981c4ffc664bbd3c253d0dc82f48ac6",
"embedding_hash": "3ab1a0c5d0d4bd832108b7a6ade0ad9c",
"created_time": 1758175163.6749194,
"vector_size": 384
},
"cache\\file_index.json": {
"file_path": "cache\\file_index.json",
"content_hash": "6534fef14d12e39aff1dc0dcf5b91d1d",
"embedding_hash": "d76efa530f0d21e52f9d5b3a9ccc358c",
"created_time": 1758175163.6749268,
"vector_size": 384
},
"core\\config.py": {
"file_path": "core\\config.py",
"content_hash": "ee72a95cea7397db8dd25b10a4436eaa",
"embedding_hash": "65d1fca1cf59bcd36409c3b11f50aab1",
"created_time": 1758175163.6749349,
"vector_size": 384
},
"core\\context_analyzer.py": {
"file_path": "core\\context_analyzer.py",
"content_hash": "2e9ac2050e463c9d3f94bad23e65d4e5",
"embedding_hash": "dfb51c8eaafd96ac544b3d9c8dcd3f51",
"created_time": 1758175163.674943,
"vector_size": 384
},
"core\\embedding_manager.py": {
"file_path": "core\\embedding_manager.py",
"content_hash": "cafa24b0431c6463266dde8b37fc3ab7",
"embedding_hash": "531c3206f0caf9789873719cdd644e99",
"created_time": 1758175163.6749508,
"vector_size": 384
},
"core\\file_indexer.py": {
"file_path": "core\\file_indexer.py",
"content_hash": "0626c89c060d6022261ca094aed47093",
"embedding_hash": "93d5fc6e84334d3bd9be0f07f9823b20",
"created_time": 1758175163.6749592,
"vector_size": 384
},
"core\\gitignore_parser.py": {
"file_path": "core\\gitignore_parser.py",
"content_hash": "5f1d87fb03bc3b19833406be0fa5125f",
"embedding_hash": "784be673b6b428cce60ab5390bfc7f08",
"created_time": 1758175163.6749675,
"vector_size": 384
},
"core\\path_matcher.py": {
"file_path": "core\\path_matcher.py",
"content_hash": "89132273951a091610c1579ccc44f3a7",
"embedding_hash": "e01ca0180c2834a514ad6d8e62315ce0",
"created_time": 1758175163.6749754,
"vector_size": 384
},
"core\\__init__.py": {
"file_path": "core\\__init__.py",
"content_hash": "3a323be141f1ce6b9d9047aa444029b0",
"embedding_hash": "3fc5a5427067e59b054428083a5899ca",
"created_time": 1758175163.6749818,
"vector_size": 384
},
"tools\\module_analyzer.py": {
"file_path": "tools\\module_analyzer.py",
"content_hash": "926289c2fd8d681ed20c445d2ac34fa1",
"embedding_hash": "3378fcde062914859b765d8dfce1207f",
"created_time": 1758175163.67499,
"vector_size": 384
},
"tools\\tech_stack.py": {
"file_path": "tools\\tech_stack.py",
"content_hash": "eef6eabcbc8ba0ece0dfacb9314f3585",
"embedding_hash": "bc3aa5334ef17328490bc5a8162d776a",
"created_time": 1758175163.674997,
"vector_size": 384
},
"tools\\workflow_updater.py": {
"file_path": "tools\\workflow_updater.py",
"content_hash": "40d7d884e0db24eb45aa27739fef8210",
"embedding_hash": "00488f4acdb7fe1b5126da4da3bb9869",
"created_time": 1758175163.6750047,
"vector_size": 384
},
"tools\\__init__.py": {
"file_path": "tools\\__init__.py",
"content_hash": "41bf583571f4355e4af90842d0674b1f",
"embedding_hash": "fccd7745f9e1e242df3bace7cee9759c",
"created_time": 1758175163.6750097,
"vector_size": 384
},
"utils\\cache.py": {
"file_path": "utils\\cache.py",
"content_hash": "dc7c08bcd9af9ae465020997e4b9127e",
"embedding_hash": "68394bc0f57a0f66b83a57249b39957d",
"created_time": 1758175163.6750169,
"vector_size": 384
},
"utils\\colors.py": {
"file_path": "utils\\colors.py",
"content_hash": "8ce555a2dcf4057ee7adfb3286d47da2",
"embedding_hash": "1b18e22acb095e83ed291b6c5dc7a2ce",
"created_time": 1758175163.6750243,
"vector_size": 384
},
"utils\\io_helpers.py": {
"file_path": "utils\\io_helpers.py",
"content_hash": "fb276a0e46b28f80d5684368a8b15e57",
"embedding_hash": "f6ff8333b1afc5b98d4644f334c18cda",
"created_time": 1758175163.6750326,
"vector_size": 384
},
"utils\\__init__.py": {
"file_path": "utils\\__init__.py",
"content_hash": "f305ede9cbdec2f2e0189a4b89558b7e",
"embedding_hash": "7d3f10fe4210d40eafd3c065b8e0c8b7",
"created_time": 1758175163.6750393,
"vector_size": 384
}
}

Binary file not shown.

View File

@@ -66,11 +66,12 @@ file_extensions:
# Embedding/RAG configuration
embedding:
enabled: true # Set to true to enable RAG features
model: "all-MiniLM-L6-v2" # Lightweight sentence transformer
model: "codesage/codesage-large-v2" # CodeSage V2 for code embeddings
cache_dir: "cache"
similarity_threshold: 0.3
max_context_length: 512
batch_size: 32
similarity_threshold: 0.6 # Higher threshold for better code similarity
max_context_length: 2048 # Increased for CodeSage V2 capabilities
batch_size: 8 # Reduced for larger model
trust_remote_code: true # Required for CodeSage V2
# Context analysis settings
context_analysis:

View File

@@ -75,6 +75,7 @@ class EmbeddingManager:
self.similarity_threshold = config.get('embedding', {}).get('similarity_threshold', 0.6)
self.max_context_length = config.get('embedding', {}).get('max_context_length', 512)
self.batch_size = config.get('embedding', {}).get('batch_size', 32)
self.trust_remote_code = config.get('embedding', {}).get('trust_remote_code', False)
# Setup cache directories
self.cache_dir.mkdir(parents=True, exist_ok=True)
@@ -95,7 +96,11 @@ class EmbeddingManager:
if self._model is None:
try:
self.logger.info(f"Loading embedding model: {self.model_name}")
self._model = SentenceTransformer(self.model_name)
# Initialize with trust_remote_code for CodeSage V2
if self.trust_remote_code:
self._model = SentenceTransformer(self.model_name, trust_remote_code=True)
else:
self._model = SentenceTransformer(self.model_name)
self.logger.info(f"Model loaded successfully")
except Exception as e:
self.logger.error(f"Failed to load embedding model: {e}")
@@ -203,7 +208,7 @@ class EmbeddingManager:
with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
content = f.read()
# Truncate content if too long
# Truncate content if too long (CodeSage V2 supports longer contexts)
if len(content) > self.max_context_length * 4: # Approximate token limit
content = content[:self.max_context_length * 4]

View File

@@ -2,14 +2,18 @@
numpy>=1.21.0
scikit-learn>=1.0.0
# Sentence Transformers for advanced embeddings
sentence-transformers>=2.2.0
# Sentence Transformers for advanced embeddings (CodeSage V2 compatible)
sentence-transformers>=3.0.0
transformers>=4.40.0
# Optional: For better performance and additional models
torch>=1.9.0
# PyTorch for model execution (required for CodeSage V2)
torch>=2.0.0
# Development and testing
pytest>=6.0.0
# Data handling
pandas>=1.3.0
# Additional dependencies for CodeSage V2
accelerate>=0.26.0