feat: Add pycli bash wrapper with hierarchical vector database support

- Create unified bash wrapper (pycli) for Python CLI tools
- Implement hierarchical vector database with smart parent discovery
- Add comprehensive installation script with auto-configuration
- Remove redundant analyzer.py and api_indexer.py files
- Enhance Python scripts with environment variable support
- Update documentation to focus on pycli unified interface

Key Features:
- Automatic parent directory vector DB discovery (sketched below)
- No redundant vectorization in subdirectories
- Central vector database storage in ~/.claude/vector_db
- Configurable Python interpreter paths
- One-command installation and setup
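
The parent-discovery behaviour amounts to walking up from the working directory until an existing vector DB is found, falling back to the central store under ~/.claude/vector_db. A minimal Python sketch of that idea; the function name find_vector_db, the ".vector_db" directory name, and the CLAUDE_VECTOR_DB_ROOT variable are illustrative assumptions, not names taken from this commit:

import os
from pathlib import Path

def find_vector_db(start: str = ".") -> Path:
    """Return the nearest ancestor vector DB, else the central store."""
    # Central fallback location; overridable via env var for configurability.
    central = Path(os.environ.get("CLAUDE_VECTOR_DB_ROOT",
                                  str(Path.home() / ".claude" / "vector_db")))
    current = Path(start).resolve()
    for directory in (current, *current.parents):
        candidate = directory / ".vector_db"
        if candidate.is_dir():
            return candidate  # reuse the closest ancestor's DB, no re-vectorization
    return central            # nothing found: use the central store

if __name__ == "__main__":
    print(f"Using vector DB at: {find_vector_db()}")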

🤖 Generated with Claude Code

Co-Authored-By: Claude <noreply@anthropic.com>
catlog22
2025-09-23 22:09:55 +08:00
parent 194d2722a3
commit c337204242
9 changed files with 1353 additions and 726 deletions

View File

@@ -1,305 +0,0 @@
#!/usr/bin/env python3
"""
Unified Path-Aware Analyzer
Main entry point for the refactored analyzer system.
Provides a clean, simple API for intelligent file analysis.
"""
import os
import sys
import argparse
import logging
import json
import time
from pathlib import Path
from typing import Dict, List, Optional, Any
# Add current directory to path for imports
sys.path.insert(0, str(Path(__file__).parent))
from core.config import get_config
from core.file_indexer import FileIndexer, IndexStats
from core.context_analyzer import ContextAnalyzer, AnalysisResult
from core.path_matcher import PathMatcher, PathMatchingResult
from core.embedding_manager import EmbeddingManager
from utils.colors import Colors
class Analyzer:
    """Main analyzer class with simplified API."""

    def __init__(self, config_path: Optional[str] = None, root_path: str = "."):
        self.root_path = Path(root_path).resolve()
        self.config = get_config(config_path)

        # Setup logging
        logging.basicConfig(
            level=getattr(logging, self.config.get('logging.level', 'INFO')),
            format=self.config.get('logging.format', '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        )
        self.logger = logging.getLogger(__name__)

        # Initialize core components
        self.indexer = FileIndexer(self.config, str(self.root_path))
        self.context_analyzer = ContextAnalyzer(self.config)
        self.path_matcher = PathMatcher(self.config)

        # Initialize embedding manager if enabled
        self.embedding_manager = None
        if self.config.is_embedding_enabled():
            try:
                self.embedding_manager = EmbeddingManager(self.config)
            except ImportError:
                self.logger.warning("Embedding dependencies not available. Install sentence-transformers for enhanced functionality.")
    def build_index(self) -> IndexStats:
        """Build or update the file index."""
        print(Colors.yellow("Building file index..."))
        start_time = time.time()
        self.indexer.build_index()
        stats = self.indexer.get_stats()
        elapsed = time.time() - start_time
        if stats:
            print(Colors.green(f"Index built: {stats.total_files} files, ~{stats.total_tokens:,} tokens ({elapsed:.2f}s)"))
        else:
            print(Colors.green(f"Index built successfully ({elapsed:.2f}s)"))
        return stats
    def analyze(self, prompt: str, mode: str = "auto", patterns: Optional[List[str]] = None,
                token_limit: Optional[int] = None, use_embeddings: Optional[bool] = None) -> Dict[str, Any]:
        """Analyze and return relevant file paths for a given prompt."""
        print(Colors.yellow("Analyzing project and prompt..."))
        start_time = time.time()

        # Load or build index
        index = self.indexer.load_index()
        if not index:
            self.build_index()
            index = self.indexer.load_index()
        stats = self.indexer.get_stats()
        print(Colors.cyan(f"Project stats: ~{stats.total_tokens:,} tokens across {stats.total_files} files"))
        print(Colors.cyan(f"Categories: {', '.join(f'{k}: {v}' for k, v in stats.categories.items())}"))

        # Determine project size
        project_size = self._classify_project_size(stats.total_tokens)
        print(Colors.cyan(f"Project size: {project_size}"))

        # Analyze prompt context
        print(Colors.yellow("Analyzing prompt context..."))
        context_result = self.context_analyzer.analyze(prompt)
        print(Colors.cyan(f"Identified: {len(context_result.domains)} domains, {len(context_result.languages)} languages"))
        if context_result.domains:
            print(Colors.cyan(f"Top domains: {', '.join(context_result.domains[:3])}"))

        # Determine if we should use embeddings
        should_use_embeddings = use_embeddings
        if should_use_embeddings is None:
            should_use_embeddings = (
                self.embedding_manager is not None and
                self.config.is_embedding_enabled() and
                len(context_result.keywords) < 5  # Use embeddings for vague queries
            )
        similar_files = []
        if should_use_embeddings and self.embedding_manager:
            print(Colors.yellow("Using semantic similarity search..."))
            # Update embeddings if needed
            if not self.embedding_manager.embeddings_exist():
                print(Colors.yellow("Building embeddings (first run)..."))
                self.embedding_manager.update_embeddings(index)
            similar_files = self.embedding_manager.find_similar_files(prompt, index)
            print(Colors.cyan(f"Found {len(similar_files)} semantically similar files"))

        # Match files to context
        print(Colors.yellow("Matching files to context..."))
        matching_result = self.path_matcher.match_files(
            index,
            context_result,
            token_limit=token_limit,
            explicit_patterns=patterns
        )
        elapsed = time.time() - start_time
        print(Colors.green(f"Analysis complete: {len(matching_result.matched_files)} files, ~{matching_result.total_tokens:,} tokens"))
        print(Colors.cyan(f"Confidence: {matching_result.confidence_score:.2f}"))
        print(Colors.cyan(f"Execution time: {elapsed:.2f}s"))
        return {
            'files': [match.file_info.relative_path for match in matching_result.matched_files],
            'total_tokens': matching_result.total_tokens,
            'confidence': matching_result.confidence_score,
            'context': {
                'domains': context_result.domains,
                'languages': context_result.languages,
                'keywords': context_result.keywords
            },
            'stats': {
                'project_size': project_size,
                'total_files': stats.total_files,
                'analysis_time': elapsed,
                'embeddings_used': should_use_embeddings
            }
        }
    def generate_command(self, prompt: str, tool: str = "gemini", **kwargs) -> str:
        """Generate a command for external tools (gemini/codex)."""
        analysis_result = self.analyze(prompt, **kwargs)
        # Format file patterns
        file_patterns = " ".join(f"@{{{file}}}" for file in analysis_result['files'])
        if tool == "gemini":
            if len(analysis_result['files']) > 50:  # Too many files for individual patterns
                return f'gemini --all-files -p "{prompt}"'
            else:
                return f'gemini -p "{file_patterns} {prompt}"'
        elif tool == "codex":
            workspace_flag = "-s workspace-write" if analysis_result['total_tokens'] > 100000 else "-s danger-full-access"
            return f'codex {workspace_flag} --full-auto exec "{file_patterns} {prompt}"'
        else:
            raise ValueError(f"Unsupported tool: {tool}")

    def _classify_project_size(self, tokens: int) -> str:
        """Classify project size based on token count."""
        small_limit = self.config.get('token_limits.small_project', 500000)
        medium_limit = self.config.get('token_limits.medium_project', 2000000)
        if tokens < small_limit:
            return "small"
        elif tokens < medium_limit:
            return "medium"
        else:
            return "large"

    def get_project_stats(self) -> Dict[str, Any]:
        """Get comprehensive project statistics."""
        stats = self.indexer.get_stats()
        embedding_stats = {}
        if self.embedding_manager:
            embedding_stats = {
                'embeddings_exist': self.embedding_manager.embeddings_exist(),
                'embedding_count': len(self.embedding_manager.load_embeddings()) if self.embedding_manager.embeddings_exist() else 0
            }
        return {
            'files': stats.total_files,
            'tokens': stats.total_tokens,
            'size_bytes': stats.total_size,
            'categories': stats.categories,
            'project_size': self._classify_project_size(stats.total_tokens),
            'last_updated': stats.last_updated,
            'embeddings': embedding_stats,
            'config': {
                'cache_dir': self.config.get_cache_dir(),
                'embedding_enabled': self.config.is_embedding_enabled(),
                'exclude_patterns_count': len(self.config.get_exclude_patterns())
            }
        }
def main():
    """CLI entry point."""
    parser = argparse.ArgumentParser(
        description="Path-Aware Analyzer - Intelligent file pattern detection",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python analyzer.py "analyze authentication flow"
  python analyzer.py "fix database connection" --patterns "src/**/*.py"
  python analyzer.py "review API endpoints" --tool gemini
  python analyzer.py --stats
        """
    )
    parser.add_argument('prompt', nargs='?', help='Analysis prompt or task description')
    parser.add_argument('--patterns', nargs='*', help='Explicit file patterns to include')
    parser.add_argument('--tool', choices=['gemini', 'codex'], help='Generate command for specific tool')
    parser.add_argument('--output', choices=['patterns', 'json'], default='patterns', help='Output format')
    parser.add_argument('--verbose', '-v', action='store_true', help='Verbose output')
    parser.add_argument('--stats', action='store_true', help='Show project statistics and exit')
    parser.add_argument('--build-index', action='store_true', help='Build file index and exit')
    args = parser.parse_args()

    # Create analyzer with default values
    analyzer = Analyzer(config_path=None, root_path=".")

    # Handle special commands
    if args.build_index:
        analyzer.build_index()
        return
    if args.stats:
        stats = analyzer.get_project_stats()
        if args.output == 'json':
            print(json.dumps(stats, indent=2, default=str))
        else:
            print(f"Total files: {stats['files']}")
            print(f"Total tokens: {stats['tokens']:,}")
            print(f"Categories: {stats['categories']}")
            if 'embeddings' in stats:
                print(f"Embeddings: {stats['embeddings']['embedding_count']}")
        return

    # Require prompt for analysis
    if not args.prompt:
        parser.error("Analysis prompt is required unless using --build-index or --stats")

    # Perform analysis
    try:
        result = analyzer.analyze(
            args.prompt,
            patterns=args.patterns,
            use_embeddings=False  # Disable embeddings by default for simplicity
        )
        # Generate output
        if args.tool:
            # Generate command using already computed result
            file_patterns = " ".join(f"@{{{file}}}" for file in result['files'])
            if args.tool == "gemini":
                if len(result['files']) > 50:
                    command = f'gemini --all-files -p "{args.prompt}"'
                else:
                    command = f'gemini -p "{file_patterns} {args.prompt}"'
            elif args.tool == "codex":
                workspace_flag = "-s workspace-write" if result['total_tokens'] > 100000 else "-s danger-full-access"
                command = f'codex {workspace_flag} --full-auto exec "{file_patterns} {args.prompt}"'
            print(command)
        elif args.output == 'json':
            print(json.dumps(result, indent=2, default=str))
        else:  # patterns output (default)
            for file_path in result['files']:
                print(f"@{{{file_path}}}")
            # Show verbose details
            if args.verbose:
                print(f"\n# Analysis Details:")
                print(f"# Matched files: {len(result['files'])}")
                print(f"# Total tokens: {result['total_tokens']:,}")
                print(f"# Confidence: {result['confidence']:.2f}")
    except KeyboardInterrupt:
        print(Colors.warning("\nAnalysis interrupted by user"))
        sys.exit(1)
    except Exception as e:
        print(Colors.error(f"Analysis failed: {e}"))
        if args.verbose:
            import traceback
            traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()

View File

@@ -1,141 +0,0 @@
#!/usr/bin/env python3
"""
API Documentation Indexer
Parses Markdown documentation to create a searchable index of classes and methods.
"""
import os
import re
import json
import logging
from pathlib import Path
from typing import Dict, Any
from core.file_indexer import FileIndexer
class ApiIndexer:
    def __init__(self, config: Dict, root_path: str = "."):
        self.config = config
        self.root_path = Path(root_path).resolve()
        self.file_indexer = FileIndexer(config, root_path)
        self.api_index_file = self.file_indexer.cache_dir / "api_index.json"
        self.logger = logging.getLogger(__name__)

    def build_index(self):
        """Builds the API index from Markdown files."""
        self.logger.info("Building API index...")
        file_index = self.file_indexer.load_index()
        if not file_index:
            self.logger.info("File index not found, building it first.")
            self.file_indexer.build_index()
            file_index = self.file_indexer.load_index()
        api_index = {}
        for file_info in file_index.values():
            if file_info.extension == ".md":
                self.logger.debug(f"Parsing {file_info.path}")
                try:
                    with open(file_info.path, "r", encoding="utf-8") as f:
                        content = f.read()
                    self._parse_markdown(content, file_info.relative_path, api_index)
                except Exception as e:
                    self.logger.error(f"Error parsing {file_info.path}: {e}")
        self._save_index(api_index)
        self.logger.info(f"API index built with {len(api_index)} classes.")
    def _parse_markdown(self, content: str, file_path: str, api_index: Dict):
        """Parses a single Markdown file for class and method info."""
        class_name_match = re.search(r"^#\s+([A-Za-z0-9_]+)", content)
        if not class_name_match:
            return
        class_name = class_name_match.group(1)
        api_index[class_name] = {
            "file_path": file_path,
            "description": "",
            "methods": {}
        }
        # Simple description extraction
        desc_match = re.search(r"\*\*Description:\*\*\s*(.+)", content)
        if desc_match:
            api_index[class_name]["description"] = desc_match.group(1).strip()
        # Method extraction
        method_sections = re.split(r"###\s+", content)[1:]
        for i, section in enumerate(method_sections):
            method_signature_match = re.search(r"`(.+?)`", section)
            if not method_signature_match:
                continue
            signature = method_signature_match.group(1)
            method_name_match = re.search(r"([A-Za-z0-9_]+)\(", signature)
            if not method_name_match:
                continue
            method_name = method_name_match.group(1)
            method_description = ""
            method_desc_match = re.search(r"\*\*Description:\*\*\s*(.+)", section)
            if method_desc_match:
                method_description = method_desc_match.group(1).strip()
            # A simple way to get a line number approximation
            line_number = content.count("\n", 0, content.find(f"### `{signature}`")) + 1
            api_index[class_name]["methods"][method_name] = {
                "signature": signature,
                "description": method_description,
                "line_number": line_number
            }
    def _save_index(self, api_index: Dict):
        """Saves the API index to a file."""
        try:
            with open(self.api_index_file, "w", encoding="utf-8") as f:
                json.dump(api_index, f, indent=2)
        except IOError as e:
            self.logger.error(f"Could not save API index: {e}")

    def search(self, class_name: str, method_name: str = None) -> Any:
        """Searches the API index for a class or method."""
        if not self.api_index_file.exists():
            self.build_index()
        with open(self.api_index_file, "r", encoding="utf-8") as f:
            api_index = json.load(f)
        if class_name not in api_index:
            return None
        if method_name:
            return api_index[class_name]["methods"].get(method_name)
        else:
            return api_index[class_name]


if __name__ == "__main__":
    from core.config import get_config
    import argparse

    logging.basicConfig(level=logging.INFO)
    parser = argparse.ArgumentParser(description="API Documentation Indexer.")
    parser.add_argument("--build", action="store_true", help="Build the API index.")
    parser.add_argument("--search_class", help="Search for a class.")
    parser.add_argument("--search_method", help="Search for a method within a class (requires --search_class).")
    args = parser.parse_args()

    config = get_config()
    api_indexer = ApiIndexer(config.to_dict())
    if args.build:
        api_indexer.build_index()
    if args.search_class:
        result = api_indexer.search(args.search_class, args.search_method)
        if result:
            print(json.dumps(result, indent=2))
        else:
            print("Not found.")

View File

@@ -1,276 +0,0 @@
{
"stats": {
"total_files": 26,
"total_tokens": 56126,
"total_size": 246519,
"categories": {
"code": 21,
"config": 3,
"docs": 1,
"other": 1
},
"last_updated": 1758177270.9103189
},
"files": {
"analyzer.py": {
"path": "D:\\Claude_dms3\\.claude\\python_script\\analyzer.py",
"relative_path": "analyzer.py",
"size": 12595,
"modified_time": 1758175179.730658,
"extension": ".py",
"category": "code",
"estimated_tokens": 3072,
"content_hash": "3fb090745b5080e0731e7ef3fc94029d"
},
"cli.py": {
"path": "D:\\Claude_dms3\\.claude\\python_script\\cli.py",
"relative_path": "cli.py",
"size": 8329,
"modified_time": 1758177193.3710027,
"extension": ".py",
"category": "code",
"estimated_tokens": 2030,
"content_hash": "b9f0b5d6a154cf51c8665b2344c9faf8"
},
"config.yaml": {
"path": "D:\\Claude_dms3\\.claude\\python_script\\config.yaml",
"relative_path": "config.yaml",
"size": 4317,
"modified_time": 1758163450.6223683,
"extension": ".yaml",
"category": "config",
"estimated_tokens": 1040,
"content_hash": "b431b73dfa86ff83145468bbf4422a79"
},
"indexer.py": {
"path": "D:\\Claude_dms3\\.claude\\python_script\\indexer.py",
"relative_path": "indexer.py",
"size": 7776,
"modified_time": 1758177151.2160237,
"extension": ".py",
"category": "code",
"estimated_tokens": 1893,
"content_hash": "f88b5e5bffce26f3170974df2906aac3"
},
"install.sh": {
"path": "D:\\Claude_dms3\\.claude\\python_script\\install.sh",
"relative_path": "install.sh",
"size": 5236,
"modified_time": 1758161898.317552,
"extension": ".sh",
"category": "code",
"estimated_tokens": 1262,
"content_hash": "cc3a9121a0b8281457270f30ad76f5f6"
},
"requirements.txt": {
"path": "D:\\Claude_dms3\\.claude\\python_script\\requirements.txt",
"relative_path": "requirements.txt",
"size": 495,
"modified_time": 1758164967.7707567,
"extension": ".txt",
"category": "docs",
"estimated_tokens": 118,
"content_hash": "aea2ba14dfa7b37b1dde5518de87d956"
},
"setup.py": {
"path": "D:\\Claude_dms3\\.claude\\python_script\\setup.py",
"relative_path": "setup.py",
"size": 2860,
"modified_time": 1758177212.9095325,
"extension": ".py",
"category": "code",
"estimated_tokens": 692,
"content_hash": "609abf8b9c84a09f6a59d5815eb90bc5"
},
"__init__.py": {
"path": "D:\\Claude_dms3\\.claude\\python_script\\__init__.py",
"relative_path": "__init__.py",
"size": 1065,
"modified_time": 1758177224.8017242,
"extension": ".py",
"category": "code",
"estimated_tokens": 257,
"content_hash": "47368b235086fc0c75ba34a824c58506"
},
"cache\\embeddings.pkl": {
"path": "D:\\Claude_dms3\\.claude\\python_script\\cache\\embeddings.pkl",
"relative_path": "cache\\embeddings.pkl",
"size": 35109,
"modified_time": 1758175163.6754165,
"extension": ".pkl",
"category": "other",
"estimated_tokens": 4713,
"content_hash": "b8ed5c068acd5ed52ba10839701a5a24"
},
"cache\\embedding_index.json": {
"path": "D:\\Claude_dms3\\.claude\\python_script\\cache\\embedding_index.json",
"relative_path": "cache\\embedding_index.json",
"size": 5589,
"modified_time": 1758175163.6764157,
"extension": ".json",
"category": "config",
"estimated_tokens": 1358,
"content_hash": "5c2ba41b1b69ce19d2fc3b5854f6ee53"
},
"cache\\file_index.json": {
"path": "D:\\Claude_dms3\\.claude\\python_script\\cache\\file_index.json",
"relative_path": "cache\\file_index.json",
"size": 12164,
"modified_time": 1758165699.0883024,
"extension": ".json",
"category": "config",
"estimated_tokens": 2957,
"content_hash": "73563db28a2808aa28544c0275b97f94"
},
"core\\config.py": {
"path": "D:\\Claude_dms3\\.claude\\python_script\\core\\config.py",
"relative_path": "core\\config.py",
"size": 12266,
"modified_time": 1758164531.5934324,
"extension": ".py",
"category": "code",
"estimated_tokens": 2985,
"content_hash": "d85aedc01a528b486d41acbd823181d7"
},
"core\\context_analyzer.py": {
"path": "D:\\Claude_dms3\\.claude\\python_script\\core\\context_analyzer.py",
"relative_path": "core\\context_analyzer.py",
"size": 15002,
"modified_time": 1758164846.7665854,
"extension": ".py",
"category": "code",
"estimated_tokens": 3661,
"content_hash": "677903b5aaf3db13575ca1ca99ec7c16"
},
"core\\embedding_manager.py": {
"path": "D:\\Claude_dms3\\.claude\\python_script\\core\\embedding_manager.py",
"relative_path": "core\\embedding_manager.py",
"size": 17271,
"modified_time": 1758166063.1635072,
"extension": ".py",
"category": "code",
"estimated_tokens": 4204,
"content_hash": "d8f52cb93140a46fe3d22d465ec01b22"
},
"core\\file_indexer.py": {
"path": "D:\\Claude_dms3\\.claude\\python_script\\core\\file_indexer.py",
"relative_path": "core\\file_indexer.py",
"size": 14484,
"modified_time": 1758164612.5888917,
"extension": ".py",
"category": "code",
"estimated_tokens": 3525,
"content_hash": "1518d309108f3300417b65f6234241d1"
},
"core\\gitignore_parser.py": {
"path": "D:\\Claude_dms3\\.claude\\python_script\\core\\gitignore_parser.py",
"relative_path": "core\\gitignore_parser.py",
"size": 6757,
"modified_time": 1758164472.643646,
"extension": ".py",
"category": "code",
"estimated_tokens": 1644,
"content_hash": "9cd97725576727080aaafd329d9ce2c4"
},
"core\\path_matcher.py": {
"path": "D:\\Claude_dms3\\.claude\\python_script\\core\\path_matcher.py",
"relative_path": "core\\path_matcher.py",
"size": 19568,
"modified_time": 1758166045.8395746,
"extension": ".py",
"category": "code",
"estimated_tokens": 4767,
"content_hash": "f1dc44dc3ed67f100770aea40197623f"
},
"core\\__init__.py": {
"path": "D:\\Claude_dms3\\.claude\\python_script\\core\\__init__.py",
"relative_path": "core\\__init__.py",
"size": 712,
"modified_time": 1758164419.4437866,
"extension": ".py",
"category": "code",
"estimated_tokens": 172,
"content_hash": "b25991cb8d977021362f45e121e89de7"
},
"tools\\module_analyzer.py": {
"path": "D:\\Claude_dms3\\.claude\\python_script\\tools\\module_analyzer.py",
"relative_path": "tools\\module_analyzer.py",
"size": 14273,
"modified_time": 1758164687.488236,
"extension": ".py",
"category": "code",
"estimated_tokens": 3476,
"content_hash": "b958ec7ed264242f2bb30b1cca66b144"
},
"tools\\tech_stack.py": {
"path": "D:\\Claude_dms3\\.claude\\python_script\\tools\\tech_stack.py",
"relative_path": "tools\\tech_stack.py",
"size": 7576,
"modified_time": 1758164695.643722,
"extension": ".py",
"category": "code",
"estimated_tokens": 1843,
"content_hash": "f391a45d8254f0c4f4f789027dd69afc"
},
"tools\\workflow_updater.py": {
"path": "D:\\Claude_dms3\\.claude\\python_script\\tools\\workflow_updater.py",
"relative_path": "tools\\workflow_updater.py",
"size": 9577,
"modified_time": 1758164703.2230499,
"extension": ".py",
"category": "code",
"estimated_tokens": 2334,
"content_hash": "526edf0cfbe3c2041135eace9f89ef13"
},
"tools\\__init__.py": {
"path": "D:\\Claude_dms3\\.claude\\python_script\\tools\\__init__.py",
"relative_path": "tools\\__init__.py",
"size": 329,
"modified_time": 1758165927.9923615,
"extension": ".py",
"category": "code",
"estimated_tokens": 79,
"content_hash": "139aa450d7511347cc6799c471eac745"
},
"utils\\cache.py": {
"path": "D:\\Claude_dms3\\.claude\\python_script\\utils\\cache.py",
"relative_path": "utils\\cache.py",
"size": 12067,
"modified_time": 1758164781.2914226,
"extension": ".py",
"category": "code",
"estimated_tokens": 2929,
"content_hash": "39e49b731d601fafac74e96ed074e654"
},
"utils\\colors.py": {
"path": "D:\\Claude_dms3\\.claude\\python_script\\utils\\colors.py",
"relative_path": "utils\\colors.py",
"size": 6959,
"modified_time": 1758165650.9865932,
"extension": ".py",
"category": "code",
"estimated_tokens": 1678,
"content_hash": "8bb57134555d8fb07d2e351d4e100f0f"
},
"utils\\io_helpers.py": {
"path": "D:\\Claude_dms3\\.claude\\python_script\\utils\\io_helpers.py",
"relative_path": "utils\\io_helpers.py",
"size": 13773,
"modified_time": 1758164823.513003,
"extension": ".py",
"category": "code",
"estimated_tokens": 3349,
"content_hash": "aa54747c49319cc2c90c0544c668009a"
},
"utils\\__init__.py": {
"path": "D:\\Claude_dms3\\.claude\\python_script\\utils\\__init__.py",
"relative_path": "utils\\__init__.py",
"size": 370,
"modified_time": 1758164433.7142198,
"extension": ".py",
"category": "code",
"estimated_tokens": 88,
"content_hash": "62ec4a34f1643a23c79207061bdb8d49"
}
}
}

View File

@@ -66,12 +66,12 @@ file_extensions:
 # Embedding/RAG configuration
 embedding:
   enabled: true  # Set to true to enable RAG features
-  model: "codesage/codesage-large-v2"  # CodeSage V2 for code embeddings
+  model: "all-MiniLM-L6-v2"  # Stable general-purpose embedding model
   cache_dir: "cache"
   similarity_threshold: 0.6  # Higher threshold for better code similarity
-  max_context_length: 2048  # Increased for CodeSage V2 capabilities
-  batch_size: 8  # Reduced for larger model
-  trust_remote_code: true  # Required for CodeSage V2
+  max_context_length: 512  # Standard context length
+  batch_size: 32  # Standard batch size
+  trust_remote_code: false  # Not required for standard models

 # Context analysis settings
 context_analysis:
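
The new values map onto the standard sentence-transformers workflow. A minimal sketch of how an embedding manager might consume them; the config dict below simply mirrors the YAML above and is not the repository's EmbeddingManager:

from sentence_transformers import SentenceTransformer, util

embedding_cfg = {                      # mirrors the updated config.yaml
    "model": "all-MiniLM-L6-v2",
    "similarity_threshold": 0.6,
    "max_context_length": 512,
    "batch_size": 32,
    "trust_remote_code": False,        # kwarg requires a recent sentence-transformers release
}

model = SentenceTransformer(embedding_cfg["model"],
                            trust_remote_code=embedding_cfg["trust_remote_code"])
model.max_seq_length = embedding_cfg["max_context_length"]

docs = ["def build_index(self): ...", "class PathMatcher: ..."]
doc_vecs = model.encode(docs, batch_size=embedding_cfg["batch_size"], convert_to_tensor=True)
query_vec = model.encode("build the file index", convert_to_tensor=True)

scores = util.cos_sim(query_vec, doc_vecs)[0]
matches = [d for d, s in zip(docs, scores) if float(s) >= embedding_cfg["similarity_threshold"]]
print(matches)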