mirror of
https://github.com/catlog22/Claude-Code-Workflow.git
synced 2026-02-08 02:14:08 +08:00
refactor: Update workflow plan system and template organization
- Remove --analyze|--deep parameters from plan.md, use default analysis
- Change .analysis to .process directory structure for better organization
- Create ANALYSIS_RESULTS.md template focused on verified results
- Add .process folder to workflow-architecture.md file structure
- Template emphasizes verification of files, methods, and commands
- Prevent execution errors from non-existent references

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
25
.claude/python_script/core/__init__.py
Normal file
@@ -0,0 +1,25 @@
"""
Core modules for the Python script analyzer.
Provides unified interfaces for file indexing, context analysis, and path matching.
"""

from .config import Config
from .file_indexer import FileIndexer, FileInfo, IndexStats
from .context_analyzer import ContextAnalyzer, AnalysisResult
from .path_matcher import PathMatcher, MatchResult, PathMatchingResult
from .embedding_manager import EmbeddingManager
from .gitignore_parser import GitignoreParser

__all__ = [
    'Config',
    'FileIndexer',
    'FileInfo',
    'IndexStats',
    'ContextAnalyzer',
    'AnalysisResult',
    'PathMatcher',
    'MatchResult',
    'PathMatchingResult',
    'EmbeddingManager',
    'GitignoreParser'
]
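For reference, a minimal sketch of how these package exports are meant to be consumed (assuming the `.claude/python_script` directory is on `sys.path`; the object wiring is only illustrative):

from core import Config, FileIndexer, ContextAnalyzer

config = Config()                        # singleton, loads config.yaml or falls back to defaults
indexer = FileIndexer(config.to_dict())  # also accepts the Config object directly
analyzer = ContextAnalyzer(config.to_dict())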
BIN
.claude/python_script/core/__pycache__/__init__.cpython-313.pyc
Normal file
Binary file not shown.
BIN
.claude/python_script/core/__pycache__/config.cpython-313.pyc
Normal file
Binary file not shown.
327
.claude/python_script/core/config.py
Normal file
@@ -0,0 +1,327 @@
#!/usr/bin/env python3
"""
Configuration Management Module
Provides unified configuration management with gitignore integration.
"""

import os
import yaml
import logging
from pathlib import Path
from typing import Dict, Any, Optional, List
from .gitignore_parser import get_all_gitignore_patterns


class Config:
    """Singleton configuration manager with hierarchical loading."""

    _instance = None
    _initialized = False

    def __new__(cls, config_path: Optional[str] = None):
        if cls._instance is None:
            cls._instance = super(Config, cls).__new__(cls)
        return cls._instance

    def __init__(self, config_path: Optional[str] = None):
        if self._initialized:
            return

        self.config_path = config_path
        self.config = {}
        self.logger = logging.getLogger(__name__)

        self._load_config()
        self._add_gitignore_patterns()
        self._apply_env_overrides()
        self._validate_config()

        self._initialized = True

    def _load_config(self):
        """Load configuration from file with fallback hierarchy."""
        config_paths = self._get_config_paths()

        for config_file in config_paths:
            if config_file.exists():
                try:
                    with open(config_file, 'r', encoding='utf-8') as f:
                        loaded_config = yaml.safe_load(f)
                        if loaded_config:
                            self.config = self._merge_configs(self.config, loaded_config)
                            self.logger.info(f"Loaded config from {config_file}")
                except Exception as e:
                    self.logger.warning(f"Failed to load config from {config_file}: {e}")

        # Apply default config if no config loaded
        if not self.config:
            self.config = self._get_default_config()
            self.logger.info("Using default configuration")

    def _get_config_paths(self) -> List[Path]:
        """Get ordered list of config file paths to check."""
        paths = []

        # 1. Explicitly provided config path
        if self.config_path:
            paths.append(Path(self.config_path))

        # 2. Current directory config.yaml
        paths.append(Path('config.yaml'))

        # 3. Script directory config.yaml
        script_dir = Path(__file__).parent.parent
        paths.append(script_dir / 'config.yaml')

        # 4. Default config in script directory
        paths.append(script_dir / 'default_config.yaml')

        return paths

    def _get_default_config(self) -> Dict[str, Any]:
        """Get default configuration."""
        return {
            'token_limits': {
                'small_project': 500000,
                'medium_project': 2000000,
                'large_project': 10000000,
                'max_files': 1000
            },
            'exclude_patterns': [
                "*/node_modules/*",
                "*/.git/*",
                "*/build/*",
                "*/dist/*",
                "*/.next/*",
                "*/.nuxt/*",
                "*/target/*",
                "*/vendor/*",
                "*/__pycache__/*",
                "*.pyc",
                "*.pyo",
                "*.log",
                "*.tmp",
                "*.temp",
                "*.history"
            ],
            'file_extensions': {
                'code': ['.py', '.js', '.ts', '.tsx', '.jsx', '.java', '.cpp', '.c', '.h', '.rs', '.go', '.php', '.rb', '.sh', '.bash'],
                'docs': ['.md', '.txt', '.rst', '.adoc'],
                'config': ['.json', '.yaml', '.yml', '.toml', '.ini', '.env'],
                'web': ['.html', '.css', '.scss', '.sass', '.xml']
            },
            'embedding': {
                'enabled': True,
                'model': 'all-MiniLM-L6-v2',
                'cache_dir': 'cache',
                'similarity_threshold': 0.3,
                'max_context_length': 512,
                'batch_size': 32
            },
            'context_analysis': {
                'domain_keywords': {
                    'auth': ['auth', 'login', 'user', 'password', 'jwt', 'token', 'session'],
                    'database': ['db', 'database', 'sql', 'query', 'model', 'schema', 'migration'],
                    'api': ['api', 'endpoint', 'route', 'controller', 'service', 'handler'],
                    'frontend': ['ui', 'component', 'view', 'template', 'style', 'css'],
                    'backend': ['server', 'service', 'logic', 'business', 'core'],
                    'test': ['test', 'spec', 'unit', 'integration', 'mock'],
                    'config': ['config', 'setting', 'environment', 'env'],
                    'util': ['util', 'helper', 'common', 'shared', 'lib']
                },
                'language_indicators': {
                    'python': ['.py', 'python', 'pip', 'requirements.txt', 'setup.py'],
                    'javascript': ['.js', '.ts', 'npm', 'package.json', 'node'],
                    'java': ['.java', 'maven', 'gradle', 'pom.xml'],
                    'go': ['.go', 'go.mod', 'go.sum'],
                    'rust': ['.rs', 'cargo', 'Cargo.toml']
                }
            },
            'path_matching': {
                'weights': {
                    'keyword_match': 0.4,
                    'extension_match': 0.2,
                    'directory_context': 0.2,
                    'file_size_penalty': 0.1,
                    'recency_bonus': 0.1
                },
                'max_files_per_category': 20,
                'min_relevance_score': 0.1
            },
            'output': {
                'pattern_format': '@{{{path}}}',
                'always_include': [
                    'CLAUDE.md',
                    '**/CLAUDE.md',
                    'README.md',
                    'docs/**/*.md'
                ],
                'max_total_files': 50
            },
            'performance': {
                'cache_enabled': True,
                'cache_ttl': 3600,
                'max_file_size': 10485760,
                'max_workers': 4
            },
            'logging': {
                'level': 'INFO',
                'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
            }
        }

    def _merge_configs(self, base: Dict, override: Dict) -> Dict:
        """Recursively merge configuration dictionaries."""
        result = base.copy()

        for key, value in override.items():
            if key in result and isinstance(result[key], dict) and isinstance(value, dict):
                result[key] = self._merge_configs(result[key], value)
            else:
                result[key] = value

        return result

    def _add_gitignore_patterns(self):
        """Add patterns from .gitignore files to exclude_patterns."""
        try:
            # Find root directory (current working directory or script parent)
            root_dir = Path.cwd()

            gitignore_patterns = get_all_gitignore_patterns(str(root_dir))

            if gitignore_patterns:
                # Ensure exclude_patterns exists
                if 'exclude_patterns' not in self.config:
                    self.config['exclude_patterns'] = []

                # Add gitignore patterns, avoiding duplicates
                existing_patterns = set(self.config['exclude_patterns'])
                new_patterns = [p for p in gitignore_patterns if p not in existing_patterns]

                self.config['exclude_patterns'].extend(new_patterns)

                self.logger.info(f"Added {len(new_patterns)} patterns from .gitignore files")

        except Exception as e:
            self.logger.warning(f"Failed to load .gitignore patterns: {e}")

    def _apply_env_overrides(self):
        """Apply environment variable overrides."""
        env_mappings = {
            'ANALYZER_CACHE_DIR': ('embedding', 'cache_dir'),
            'ANALYZER_LOG_LEVEL': ('logging', 'level'),
            'ANALYZER_MAX_FILES': ('token_limits', 'max_files'),
            'ANALYZER_EMBEDDING_MODEL': ('embedding', 'model')
        }

        for env_var, config_path in env_mappings.items():
            env_value = os.getenv(env_var)
            if env_value:
                self._set_nested_value(config_path, env_value)
                self.logger.info(f"Applied environment override: {env_var} = {env_value}")

    def _set_nested_value(self, path: tuple, value: str):
        """Set a nested configuration value."""
        current = self.config
        for key in path[:-1]:
            if key not in current:
                current[key] = {}
            current = current[key]

        # Try to convert value to appropriate type
        if isinstance(current.get(path[-1]), int):
            try:
                value = int(value)
            except ValueError:
                pass
        elif isinstance(current.get(path[-1]), bool):
            value = value.lower() in ('true', '1', 'yes', 'on')

        current[path[-1]] = value

    def _validate_config(self):
        """Validate configuration values."""
        required_sections = ['exclude_patterns', 'file_extensions', 'token_limits']

        for section in required_sections:
            if section not in self.config:
                self.logger.warning(f"Missing required config section: {section}")

        # Validate token limits
        if 'token_limits' in self.config:
            limits = self.config['token_limits']
            if limits.get('small_project', 0) >= limits.get('medium_project', 0):
                self.logger.warning("Token limit configuration may be incorrect")

    def get(self, path: str, default: Any = None) -> Any:
        """Get configuration value using dot notation."""
        keys = path.split('.')
        current = self.config

        try:
            for key in keys:
                current = current[key]
            return current
        except (KeyError, TypeError):
            return default

    def set(self, path: str, value: Any):
        """Set configuration value using dot notation."""
        keys = path.split('.')
        current = self.config

        for key in keys[:-1]:
            if key not in current:
                current[key] = {}
            current = current[key]

        current[keys[-1]] = value

    def get_exclude_patterns(self) -> List[str]:
        """Get all exclude patterns including gitignore patterns."""
        return self.config.get('exclude_patterns', [])

    def get_file_extensions(self) -> Dict[str, List[str]]:
        """Get file extension mappings."""
        return self.config.get('file_extensions', {})

    def is_embedding_enabled(self) -> bool:
        """Check if embedding functionality is enabled."""
        return self.config.get('embedding', {}).get('enabled', False)

    def get_cache_dir(self) -> str:
        """Get cache directory path."""
        return self.config.get('embedding', {}).get('cache_dir', 'cache')

    def to_dict(self) -> Dict[str, Any]:
        """Return configuration as dictionary."""
        return self.config.copy()

    def reload(self, config_path: Optional[str] = None):
        """Reload configuration from file."""
        self._initialized = False
        if config_path:
            self.config_path = config_path
        self.__init__(self.config_path)


# Global configuration instance
_global_config = None


def get_config(config_path: Optional[str] = None) -> Config:
    """Get global configuration instance."""
    global _global_config
    if _global_config is None:
        _global_config = Config(config_path)
    return _global_config


if __name__ == "__main__":
    # Test configuration loading
    config = Config()
    print("Configuration loaded successfully!")
    print(f"Cache dir: {config.get_cache_dir()}")
    print(f"Exclude patterns: {len(config.get_exclude_patterns())}")
    print(f"Embedding enabled: {config.is_embedding_enabled()}")
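A short usage sketch for the Config class above. The dot-notation keys and the ANALYZER_* variables are the ones defined in this file; the DEBUG value and the import path (core.config, i.e. running from inside .claude/python_script) are illustrative assumptions:

import os
from core.config import get_config

os.environ["ANALYZER_LOG_LEVEL"] = "DEBUG"  # picked up by _apply_env_overrides() on first construction

config = get_config()                              # shared singleton instance
print(config.get("embedding.model"))               # dot-notation lookup, e.g. 'all-MiniLM-L6-v2'
print(config.get("token_limits.max_files", 1000))  # returns the supplied default if the key is missing
print(config.is_embedding_enabled())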
359
.claude/python_script/core/context_analyzer.py
Normal file
@@ -0,0 +1,359 @@
#!/usr/bin/env python3
"""
Context Analyzer Module for UltraThink Path-Aware Analyzer
Analyzes user prompts to extract relevant context and keywords.
"""

import re
import logging
from typing import Dict, List, Set, Tuple, Optional
from dataclasses import dataclass
from collections import Counter
import string


@dataclass
class AnalysisResult:
    """Results of context analysis."""
    keywords: List[str]
    domains: List[str]
    languages: List[str]
    file_patterns: List[str]
    confidence_scores: Dict[str, float]
    extracted_entities: Dict[str, List[str]]


class ContextAnalyzer:
    """Analyzes user prompts to understand context and intent."""

    def __init__(self, config: Dict):
        self.config = config
        self.logger = logging.getLogger(__name__)

        # Load domain and language mappings from config
        self.domain_keywords = config.get('context_analysis', {}).get('domain_keywords', {})
        self.language_indicators = config.get('context_analysis', {}).get('language_indicators', {})

        # Common programming terms and patterns
        self.technical_terms = self._build_technical_terms()
        self.file_pattern_indicators = self._build_pattern_indicators()

        # Stop words to filter out
        self.stop_words = {
            'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with',
            'by', 'from', 'up', 'about', 'into', 'through', 'during', 'before', 'after',
            'above', 'below', 'between', 'among', 'as', 'is', 'are', 'was', 'were', 'be',
            'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would',
            'could', 'should', 'may', 'might', 'must', 'can', 'this', 'that', 'these',
            'those', 'i', 'you', 'he', 'she', 'it', 'we', 'they', 'me', 'him', 'her',
            'us', 'them', 'my', 'your', 'his', 'its', 'our', 'their'
        }

    def _build_technical_terms(self) -> Dict[str, List[str]]:
        """Build comprehensive list of technical terms grouped by category."""
        return {
            'authentication': [
                'auth', 'authentication', 'login', 'logout', 'signin', 'signout',
                'user', 'password', 'token', 'jwt', 'oauth', 'session', 'cookie',
                'credential', 'authorize', 'permission', 'role', 'access'
            ],
            'database': [
                'database', 'db', 'sql', 'query', 'table', 'schema', 'migration',
                'model', 'orm', 'entity', 'relation', 'index', 'transaction',
                'crud', 'select', 'insert', 'update', 'delete', 'join'
            ],
            'api': [
                'api', 'rest', 'graphql', 'endpoint', 'route', 'controller',
                'handler', 'middleware', 'service', 'request', 'response',
                'http', 'get', 'post', 'put', 'delete', 'patch'
            ],
            'frontend': [
                'ui', 'component', 'view', 'template', 'page', 'layout',
                'style', 'css', 'html', 'javascript', 'react', 'vue',
                'angular', 'dom', 'event', 'state', 'props'
            ],
            'backend': [
                'server', 'service', 'business', 'logic', 'core', 'engine',
                'worker', 'job', 'queue', 'cache', 'redis', 'memcache'
            ],
            'testing': [
                'test', 'testing', 'spec', 'unit', 'integration', 'e2e',
                'mock', 'stub', 'fixture', 'assert', 'expect', 'should'
            ],
            'configuration': [
                'config', 'configuration', 'setting', 'environment', 'env',
                'variable', 'constant', 'parameter', 'option'
            ],
            'utility': [
                'util', 'utility', 'helper', 'common', 'shared', 'lib',
                'library', 'tool', 'function', 'method'
            ]
        }

    def _build_pattern_indicators(self) -> Dict[str, List[str]]:
        """Build indicators that suggest specific file patterns."""
        return {
            'source_code': ['implement', 'code', 'function', 'class', 'method'],
            'tests': ['test', 'testing', 'spec', 'unittest', 'pytest'],
            'documentation': ['doc', 'readme', 'guide', 'documentation', 'manual'],
            'configuration': ['config', 'setting', 'env', 'environment'],
            'build': ['build', 'compile', 'package', 'deploy', 'release'],
            'scripts': ['script', 'automation', 'tool', 'utility']
        }

    def extract_keywords(self, text: str) -> List[str]:
        """Extract meaningful keywords from text."""
        # Clean and normalize text
        text = text.lower()
        text = re.sub(r'[^\w\s-]', ' ', text)  # Remove punctuation except hyphens
        words = text.split()

        # Filter stop words and short words
        keywords = []
        for word in words:
            word = word.strip('-')  # Remove leading/trailing hyphens
            if (len(word) >= 2 and
                    word not in self.stop_words and
                    not word.isdigit()):
                keywords.append(word)

        # Count frequency and return top keywords
        word_counts = Counter(keywords)
        return [word for word, count in word_counts.most_common(20)]

    def identify_domains(self, keywords: List[str]) -> List[Tuple[str, float]]:
        """Identify relevant domains based on keywords."""
        domain_scores = {}

        for domain, domain_keywords in self.domain_keywords.items():
            score = 0.0
            matched_keywords = []

            for keyword in keywords:
                for domain_keyword in domain_keywords:
                    if keyword in domain_keyword or domain_keyword in keyword:
                        score += 1.0
                        matched_keywords.append(keyword)
                        break

            if score > 0:
                # Normalize score by number of domain keywords
                normalized_score = score / len(domain_keywords)
                domain_scores[domain] = normalized_score

        # Also check technical terms
        for category, terms in self.technical_terms.items():
            score = 0.0
            for keyword in keywords:
                for term in terms:
                    if keyword in term or term in keyword:
                        score += 1.0
                        break

            if score > 0:
                normalized_score = score / len(terms)
                if category not in domain_scores:
                    domain_scores[category] = normalized_score
                else:
                    domain_scores[category] = max(domain_scores[category], normalized_score)

        # Sort by score and return top domains
        sorted_domains = sorted(domain_scores.items(), key=lambda x: x[1], reverse=True)
        return sorted_domains[:5]

    def identify_languages(self, keywords: List[str]) -> List[Tuple[str, float]]:
        """Identify programming languages based on keywords."""
        language_scores = {}

        for language, indicators in self.language_indicators.items():
            score = 0.0
            for keyword in keywords:
                for indicator in indicators:
                    if keyword in indicator or indicator in keyword:
                        score += 1.0
                        break

            if score > 0:
                normalized_score = score / len(indicators)
                language_scores[language] = normalized_score

        sorted_languages = sorted(language_scores.items(), key=lambda x: x[1], reverse=True)
        return sorted_languages[:3]

    def extract_file_patterns(self, text: str) -> List[str]:
        """Extract explicit file patterns from text."""
        patterns = []

        # Look for @{pattern} syntax
        at_patterns = re.findall(r'@\{([^}]+)\}', text)
        patterns.extend(at_patterns)

        # Look for file extensions
        extensions = re.findall(r'\*\.(\w+)', text)
        for ext in extensions:
            patterns.append(f"*.{ext}")

        # Look for directory patterns
        dir_patterns = re.findall(r'(\w+)/\*\*?', text)
        for dir_pattern in dir_patterns:
            patterns.append(f"{dir_pattern}/**/*")

        # Look for specific file names
        file_patterns = re.findall(r'\b(\w+\.\w+)\b', text)
        for file_pattern in file_patterns:
            if '.' in file_pattern:
                patterns.append(file_pattern)

        return list(set(patterns))  # Remove duplicates

    def suggest_patterns_from_domains(self, domains: List[str]) -> List[str]:
        """Suggest file patterns based on identified domains."""
        patterns = []

        domain_to_patterns = {
            'auth': ['**/auth/**/*', '**/login/**/*', '**/user/**/*'],
            'authentication': ['**/auth/**/*', '**/login/**/*', '**/user/**/*'],
            'database': ['**/db/**/*', '**/model/**/*', '**/migration/**/*', '**/*model*'],
            'api': ['**/api/**/*', '**/route/**/*', '**/controller/**/*', '**/handler/**/*'],
            'frontend': ['**/ui/**/*', '**/component/**/*', '**/view/**/*', '**/template/**/*'],
            'backend': ['**/service/**/*', '**/core/**/*', '**/server/**/*'],
            'test': ['**/test/**/*', '**/spec/**/*', '**/*test*', '**/*spec*'],
            'testing': ['**/test/**/*', '**/spec/**/*', '**/*test*', '**/*spec*'],
            'config': ['**/config/**/*', '**/*.config.*', '**/env/**/*'],
            'configuration': ['**/config/**/*', '**/*.config.*', '**/env/**/*'],
            'util': ['**/util/**/*', '**/helper/**/*', '**/common/**/*'],
            'utility': ['**/util/**/*', '**/helper/**/*', '**/common/**/*']
        }

        for domain in domains:
            if domain in domain_to_patterns:
                patterns.extend(domain_to_patterns[domain])

        return list(set(patterns))  # Remove duplicates

    def extract_entities(self, text: str) -> Dict[str, List[str]]:
        """Extract named entities from text."""
        entities = {
            'files': [],
            'functions': [],
            'classes': [],
            'variables': [],
            'technologies': []
        }

        # File patterns
        file_patterns = re.findall(r'\b(\w+\.\w+)\b', text)
        entities['files'] = list(set(file_patterns))

        # Function patterns (camelCase or snake_case followed by parentheses)
        function_patterns = re.findall(r'\b([a-z][a-zA-Z0-9_]*)\s*\(', text)
        entities['functions'] = list(set(function_patterns))

        # Class patterns (PascalCase)
        class_patterns = re.findall(r'\b([A-Z][a-zA-Z0-9]*)\b', text)
        entities['classes'] = list(set(class_patterns))

        # Technology mentions
        tech_keywords = [
            'react', 'vue', 'angular', 'node', 'express', 'django', 'flask',
            'spring', 'rails', 'laravel', 'docker', 'kubernetes', 'aws',
            'azure', 'gcp', 'postgresql', 'mysql', 'mongodb', 'redis'
        ]
        text_lower = text.lower()
        for tech in tech_keywords:
            if tech in text_lower:
                entities['technologies'].append(tech)

        return entities

    def analyze(self, prompt: str) -> AnalysisResult:
        """Perform comprehensive analysis of the user prompt."""
        self.logger.debug(f"Analyzing prompt: {prompt[:100]}...")

        # Extract keywords
        keywords = self.extract_keywords(prompt)

        # Identify domains and languages
        domains_with_scores = self.identify_domains(keywords)
        languages_with_scores = self.identify_languages(keywords)

        # Extract patterns and entities
        explicit_patterns = self.extract_file_patterns(prompt)
        entities = self.extract_entities(prompt)

        # Get top domains and languages
        domains = [domain for domain, score in domains_with_scores]
        languages = [lang for lang, score in languages_with_scores]

        # Suggest additional patterns based on domains
        suggested_patterns = self.suggest_patterns_from_domains(domains)

        # Combine explicit and suggested patterns
        all_patterns = list(set(explicit_patterns + suggested_patterns))

        # Build confidence scores
        confidence_scores = {
            'keywords': len(keywords) / 20,  # Normalize to 0-1
            'domain_match': max([score for _, score in domains_with_scores[:1]], default=0),
            'language_match': max([score for _, score in languages_with_scores[:1]], default=0),
            'pattern_extraction': len(explicit_patterns) / 5,  # Normalize to 0-1
        }

        result = AnalysisResult(
            keywords=keywords,
            domains=domains,
            languages=languages,
            file_patterns=all_patterns,
            confidence_scores=confidence_scores,
            extracted_entities=entities
        )

        self.logger.info(f"Analysis complete: {len(domains)} domains, {len(languages)} languages, {len(all_patterns)} patterns")
        return result


def main():
    """Command-line interface for context analyzer."""
    import yaml
    import argparse
    import json

    parser = argparse.ArgumentParser(description="Context Analyzer for UltraThink")
    parser.add_argument("prompt", help="Prompt to analyze")
    parser.add_argument("--config", default="config.yaml", help="Configuration file path")
    parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")

    args = parser.parse_args()

    # Setup logging
    level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(level=level, format='%(levelname)s: %(message)s')

    # Load configuration
    from pathlib import Path
    config_path = Path(__file__).parent / args.config
    with open(config_path, 'r', encoding='utf-8') as f:
        config = yaml.safe_load(f)

    # Create analyzer
    analyzer = ContextAnalyzer(config)

    # Analyze prompt
    result = analyzer.analyze(args.prompt)

    # Output results
    print(f"Keywords: {', '.join(result.keywords[:10])}")
    print(f"Domains: {', '.join(result.domains[:5])}")
    print(f"Languages: {', '.join(result.languages[:3])}")
    print(f"Patterns: {', '.join(result.file_patterns[:10])}")

    if args.verbose:
        print("\nDetailed Results:")
        print(json.dumps({
            'keywords': result.keywords,
            'domains': result.domains,
            'languages': result.languages,
            'file_patterns': result.file_patterns,
            'confidence_scores': result.confidence_scores,
            'extracted_entities': result.extracted_entities
        }, indent=2))


if __name__ == "__main__":
    main()
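A sketch of driving the analyzer above directly from Python rather than through main(); the config dict carries only the two sections ContextAnalyzer reads, and the prompt text is made up:

from core.context_analyzer import ContextAnalyzer

# Minimal config: only the keys the analyzer looks up.
config = {
    "context_analysis": {
        "domain_keywords": {"auth": ["auth", "login", "jwt", "token"]},
        "language_indicators": {"python": [".py", "pip", "requirements.txt"]},
    }
}

analyzer = ContextAnalyzer(config)
result = analyzer.analyze("Fix the JWT login flow in auth_service.py")
print(result.domains)        # e.g. ['auth', 'authentication', ...]
print(result.file_patterns)  # includes 'auth_service.py' plus suggested globs such as '**/auth/**/*'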
453
.claude/python_script/core/embedding_manager.py
Normal file
@@ -0,0 +1,453 @@
#!/usr/bin/env python3
"""
Embedding Manager Module for UltraThink Path-Aware Analyzer
Manages embeddings for semantic similarity search (RAG functionality).
"""

import os
import json
import hashlib
import logging
import pickle
from pathlib import Path
from typing import Dict, List, Tuple, Optional, Any
from dataclasses import dataclass
import time

# Optional imports for embedding functionality
try:
    import numpy as np
    NUMPY_AVAILABLE = True
except ImportError:
    NUMPY_AVAILABLE = False

try:
    from sentence_transformers import SentenceTransformer
    SENTENCE_TRANSFORMERS_AVAILABLE = True
except ImportError:
    SENTENCE_TRANSFORMERS_AVAILABLE = False

from .file_indexer import FileInfo


@dataclass
class EmbeddingInfo:
    """Information about a file's embedding."""
    file_path: str
    content_hash: str
    embedding_hash: str
    created_time: float
    vector_size: int


@dataclass
class SimilarityResult:
    """Result of similarity search."""
    file_info: FileInfo
    similarity_score: float
    matching_content: str


class EmbeddingManager:
    """Manages embeddings for semantic file matching."""

    def __init__(self, config: Dict):
        self.config = config
        self.logger = logging.getLogger(__name__)

        # Check if embeddings are enabled
        self.enabled = config.get('embedding', {}).get('enabled', False)
        if not self.enabled:
            self.logger.info("Embeddings disabled in configuration")
            return

        # Check dependencies
        if not NUMPY_AVAILABLE:
            self.logger.warning("NumPy not available, disabling embeddings")
            self.enabled = False
            return

        if not SENTENCE_TRANSFORMERS_AVAILABLE:
            self.logger.warning("sentence-transformers not available, disabling embeddings")
            self.enabled = False
            return

        # Load configuration
        self.model_name = config.get('embedding', {}).get('model', 'all-MiniLM-L6-v2')
        self.cache_dir = Path(config.get('embedding', {}).get('cache_dir', '.claude/cache/embeddings'))
        self.similarity_threshold = config.get('embedding', {}).get('similarity_threshold', 0.6)
        self.max_context_length = config.get('embedding', {}).get('max_context_length', 512)
        self.batch_size = config.get('embedding', {}).get('batch_size', 32)

        # Setup cache directories
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        self.embeddings_file = self.cache_dir / "embeddings.pkl"
        self.index_file = self.cache_dir / "embedding_index.json"

        # Initialize model lazily
        self._model = None
        self._embeddings_cache = None
        self._embedding_index = None

    @property
    def model(self):
        """Lazy load the embedding model."""
        if not self.enabled:
            return None

        if self._model is None:
            try:
                self.logger.info(f"Loading embedding model: {self.model_name}")
                self._model = SentenceTransformer(self.model_name)
                self.logger.info("Model loaded successfully")
            except Exception as e:
                self.logger.error(f"Failed to load embedding model: {e}")
                self.enabled = False
                return None

        return self._model

    def embeddings_exist(self) -> bool:
        """Check if embeddings cache exists."""
        return self.embeddings_file.exists() and self.index_file.exists()

    def _load_embedding_cache(self) -> Dict[str, 'np.ndarray']:
        # Note: np.ndarray annotations are quoted so the module still imports when numpy is missing.
        """Load embeddings from cache."""
        if self._embeddings_cache is not None:
            return self._embeddings_cache

        if not self.embeddings_file.exists():
            self._embeddings_cache = {}
            return self._embeddings_cache

        try:
            with open(self.embeddings_file, 'rb') as f:
                self._embeddings_cache = pickle.load(f)
            self.logger.debug(f"Loaded {len(self._embeddings_cache)} embeddings from cache")
        except Exception as e:
            self.logger.warning(f"Failed to load embeddings cache: {e}")
            self._embeddings_cache = {}

        return self._embeddings_cache

    def _save_embedding_cache(self):
        """Save embeddings to cache."""
        if self._embeddings_cache is None:
            return

        try:
            with open(self.embeddings_file, 'wb') as f:
                pickle.dump(self._embeddings_cache, f)
            self.logger.debug(f"Saved {len(self._embeddings_cache)} embeddings to cache")
        except Exception as e:
            self.logger.error(f"Failed to save embeddings cache: {e}")

    def _load_embedding_index(self) -> Dict[str, EmbeddingInfo]:
        """Load embedding index."""
        if self._embedding_index is not None:
            return self._embedding_index

        if not self.index_file.exists():
            self._embedding_index = {}
            return self._embedding_index

        try:
            with open(self.index_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
            self._embedding_index = {}
            for path, info_dict in data.items():
                self._embedding_index[path] = EmbeddingInfo(**info_dict)
            self.logger.debug(f"Loaded embedding index with {len(self._embedding_index)} entries")
        except Exception as e:
            self.logger.warning(f"Failed to load embedding index: {e}")
            self._embedding_index = {}

        return self._embedding_index

    def _save_embedding_index(self):
        """Save embedding index."""
        if self._embedding_index is None:
            return

        try:
            data = {}
            for path, info in self._embedding_index.items():
                data[path] = {
                    'file_path': info.file_path,
                    'content_hash': info.content_hash,
                    'embedding_hash': info.embedding_hash,
                    'created_time': info.created_time,
                    'vector_size': info.vector_size
                }

            with open(self.index_file, 'w', encoding='utf-8') as f:
                json.dump(data, f, indent=2)
            self.logger.debug(f"Saved embedding index with {len(self._embedding_index)} entries")
        except Exception as e:
            self.logger.error(f"Failed to save embedding index: {e}")

    def _extract_text_content(self, file_info: FileInfo) -> Optional[str]:
        """Extract text content from a file for embedding."""
        try:
            file_path = Path(file_info.path)

            # Skip binary files and very large files
            if file_info.size > self.config.get('performance', {}).get('max_file_size', 10485760):
                return None

            # Only process text-based files
            text_extensions = {'.py', '.js', '.ts', '.tsx', '.jsx', '.java', '.cpp', '.c', '.h',
                               '.rs', '.go', '.php', '.rb', '.sh', '.bash', '.md', '.txt', '.json',
                               '.yaml', '.yml', '.xml', '.html', '.css', '.scss', '.sass'}

            if file_info.extension.lower() not in text_extensions:
                return None

            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()

            # Truncate content if too long
            if len(content) > self.max_context_length * 4:  # Approximate token limit
                content = content[:self.max_context_length * 4]

            return content

        except Exception as e:
            self.logger.debug(f"Could not extract content from {file_info.path}: {e}")
            return None

    def _create_embedding(self, text: str) -> Optional['np.ndarray']:
        """Create embedding for text content."""
        if not self.enabled or self.model is None:
            return None

        try:
            # Truncate text if needed
            if len(text) > self.max_context_length * 4:
                text = text[:self.max_context_length * 4]

            embedding = self.model.encode([text])[0]
            return embedding

        except Exception as e:
            self.logger.warning(f"Failed to create embedding: {e}")
            return None

    def _get_content_hash(self, content: str) -> str:
        """Get hash of content for caching."""
        return hashlib.md5(content.encode('utf-8')).hexdigest()

    def _get_embedding_hash(self, embedding: 'np.ndarray') -> str:
        """Get hash of embedding for verification."""
        return hashlib.md5(embedding.tobytes()).hexdigest()

    def update_embeddings(self, file_index: Dict[str, FileInfo], force_rebuild: bool = False) -> int:
        """Update embeddings for files in the index."""
        if not self.enabled:
            self.logger.info("Embeddings disabled, skipping update")
            return 0

        self.logger.info("Updating embeddings...")

        # Load caches
        embeddings_cache = self._load_embedding_cache()
        embedding_index = self._load_embedding_index()

        new_embeddings = 0
        batch_texts = []
        batch_paths = []

        for file_path, file_info in file_index.items():
            # Check if embedding exists and is current
            if not force_rebuild and file_path in embedding_index:
                cached_info = embedding_index[file_path]
                if cached_info.content_hash == file_info.content_hash:
                    continue  # Embedding is current

            # Extract content
            content = self._extract_text_content(file_info)
            if content is None:
                continue

            # Prepare for batch processing
            batch_texts.append(content)
            batch_paths.append(file_path)

            # Process batch when full
            if len(batch_texts) >= self.batch_size:
                self._process_batch(batch_texts, batch_paths, file_index, embeddings_cache, embedding_index)
                new_embeddings += len(batch_texts)
                batch_texts = []
                batch_paths = []

        # Process remaining batch
        if batch_texts:
            self._process_batch(batch_texts, batch_paths, file_index, embeddings_cache, embedding_index)
            new_embeddings += len(batch_texts)

        # Save caches
        self._save_embedding_cache()
        self._save_embedding_index()

        self.logger.info(f"Updated {new_embeddings} embeddings")
        return new_embeddings

    def _process_batch(self, texts: List[str], paths: List[str], file_index: Dict[str, FileInfo],
                       embeddings_cache: Dict[str, 'np.ndarray'], embedding_index: Dict[str, EmbeddingInfo]):
        """Process a batch of texts for embedding."""
        try:
            # Create embeddings for batch
            embeddings = self.model.encode(texts)

            for i, (text, path) in enumerate(zip(texts, paths)):
                embedding = embeddings[i]
                file_info = file_index[path]

                # Store embedding
                content_hash = self._get_content_hash(text)
                embedding_hash = self._get_embedding_hash(embedding)

                embeddings_cache[path] = embedding
                embedding_index[path] = EmbeddingInfo(
                    file_path=path,
                    content_hash=content_hash,
                    embedding_hash=embedding_hash,
                    created_time=time.time(),
                    vector_size=len(embedding)
                )

        except Exception as e:
            self.logger.error(f"Failed to process embedding batch: {e}")

    def find_similar_files(self, query: str, file_index: Dict[str, FileInfo],
                           top_k: int = 20) -> List[SimilarityResult]:
        """Find files similar to the query using embeddings."""
        if not self.enabled:
            return []

        # Create query embedding
        query_embedding = self._create_embedding(query)
        if query_embedding is None:
            return []

        # Load embeddings
        embeddings_cache = self._load_embedding_cache()
        if not embeddings_cache:
            self.logger.warning("No embeddings available for similarity search")
            return []

        # Calculate similarities
        similarities = []
        for file_path, file_embedding in embeddings_cache.items():
            if file_path not in file_index:
                continue

            try:
                # Calculate cosine similarity
                similarity = np.dot(query_embedding, file_embedding) / (
                    np.linalg.norm(query_embedding) * np.linalg.norm(file_embedding)
                )

                if similarity >= self.similarity_threshold:
                    similarities.append((file_path, similarity))

            except Exception as e:
                self.logger.debug(f"Failed to calculate similarity for {file_path}: {e}")
                continue

        # Sort by similarity
        similarities.sort(key=lambda x: x[1], reverse=True)

        # Create results
        results = []
        for file_path, similarity in similarities[:top_k]:
            file_info = file_index[file_path]

            # Extract a snippet of matching content
            content = self._extract_text_content(file_info)
            snippet = content[:200] + "..." if content and len(content) > 200 else content or ""

            result = SimilarityResult(
                file_info=file_info,
                similarity_score=similarity,
                matching_content=snippet
            )
            results.append(result)

        self.logger.info(f"Found {len(results)} similar files for query")
        return results

    def get_stats(self) -> Dict[str, Any]:
        """Get statistics about the embedding cache."""
        if not self.enabled:
            return {'enabled': False}

        embedding_index = self._load_embedding_index()
        embeddings_cache = self._load_embedding_cache()

        return {
            'enabled': True,
            'model_name': self.model_name,
            'total_embeddings': len(embedding_index),
            'cache_size_mb': os.path.getsize(self.embeddings_file) / 1024 / 1024 if self.embeddings_file.exists() else 0,
            'similarity_threshold': self.similarity_threshold,
            'vector_size': list(embedding_index.values())[0].vector_size if embedding_index else 0
        }


def main():
    """Command-line interface for embedding manager."""
    import yaml
    import argparse
    from .file_indexer import FileIndexer

    parser = argparse.ArgumentParser(description="Embedding Manager for UltraThink")
    parser.add_argument("--config", default="config.yaml", help="Configuration file path")
    parser.add_argument("--update", action="store_true", help="Update embeddings")
    parser.add_argument("--rebuild", action="store_true", help="Force rebuild all embeddings")
    parser.add_argument("--query", help="Search for similar files")
    parser.add_argument("--stats", action="store_true", help="Show embedding statistics")
    parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")

    args = parser.parse_args()

    # Setup logging
    level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(level=level, format='%(levelname)s: %(message)s')

    # Load configuration
    config_path = Path(__file__).parent / args.config
    with open(config_path, 'r', encoding='utf-8') as f:
        config = yaml.safe_load(f)

    # Create components
    indexer = FileIndexer(config)
    embedding_manager = EmbeddingManager(config)

    if not embedding_manager.enabled:
        print("Embeddings are disabled. Enable in config.yaml or install required dependencies.")
        return

    # Load file index
    file_index = indexer.load_index()
    if not file_index:
        print("Building file index...")
        file_index = indexer.build_index()

    if args.stats:
        stats = embedding_manager.get_stats()
        print("Embedding Statistics:")
        for key, value in stats.items():
            print(f"  {key}: {value}")
        return

    if args.update or args.rebuild:
        count = embedding_manager.update_embeddings(file_index, force_rebuild=args.rebuild)
        print(f"Updated {count} embeddings")

    if args.query:
        results = embedding_manager.find_similar_files(args.query, file_index)
        print(f"Found {len(results)} similar files:")
        for result in results:
            print(f"  {result.file_info.relative_path} (similarity: {result.similarity_score:.3f})")
            if args.verbose and result.matching_content:
                print(f"    Content: {result.matching_content[:100]}...")


if __name__ == "__main__":
    main()
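The intended flow is index, embed, then query, which main() wraps as a CLI. A direct sketch, assuming numpy and sentence-transformers are installed and the import paths resolve as in the other modules:

from core.config import get_config
from core.file_indexer import FileIndexer
from core.embedding_manager import EmbeddingManager

config = get_config().to_dict()
indexer = FileIndexer(config)
manager = EmbeddingManager(config)

file_index = indexer.build_index()
manager.update_embeddings(file_index)  # embeds new/changed text files in batches

for hit in manager.find_similar_files("jwt token validation", file_index, top_k=5):
    print(f"{hit.file_info.relative_path}  {hit.similarity_score:.3f}")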
383
.claude/python_script/core/file_indexer.py
Normal file
@@ -0,0 +1,383 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
File Indexer Module for UltraThink Path-Aware Analyzer
|
||||
Builds and maintains an index of repository files with metadata.
|
||||
Enhanced with gitignore support and unified configuration.
|
||||
"""
|
||||
|
||||
import os
|
||||
import hashlib
|
||||
import json
|
||||
import time
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Set, Tuple, Union
|
||||
from dataclasses import dataclass, asdict
|
||||
from datetime import datetime
|
||||
import fnmatch
|
||||
|
||||
from .gitignore_parser import GitignoreParser
|
||||
|
||||
@dataclass
|
||||
class FileInfo:
|
||||
"""Information about a single file in the repository."""
|
||||
path: str
|
||||
relative_path: str
|
||||
size: int
|
||||
modified_time: float
|
||||
extension: str
|
||||
category: str # code, docs, config, web
|
||||
estimated_tokens: int
|
||||
content_hash: str
|
||||
|
||||
def to_dict(self) -> Dict:
|
||||
return asdict(self)
|
||||
|
||||
@classmethod
|
||||
def from_dict(cls, data: Dict) -> 'FileInfo':
|
||||
return cls(**data)
|
||||
|
||||
@dataclass
|
||||
class IndexStats:
|
||||
"""Statistics about the file index."""
|
||||
total_files: int
|
||||
total_tokens: int
|
||||
total_size: int
|
||||
categories: Dict[str, int]
|
||||
last_updated: float
|
||||
|
||||
def to_dict(self) -> Dict:
|
||||
return asdict(self)
|
||||
|
||||
class FileIndexer:
|
||||
"""Builds and maintains an efficient index of repository files."""
|
||||
|
||||
def __init__(self, config: Union['Config', Dict], root_path: str = "."):
|
||||
# Support both Config object and Dict for backward compatibility
|
||||
if hasattr(config, 'to_dict'):
|
||||
self.config_obj = config
|
||||
self.config = config.to_dict()
|
||||
else:
|
||||
self.config_obj = None
|
||||
self.config = config
|
||||
|
||||
self.root_path = Path(root_path).resolve()
|
||||
self.cache_dir = Path(self.config.get('embedding', {}).get('cache_dir', '.claude/cache'))
|
||||
self.cache_dir.mkdir(parents=True, exist_ok=True)
|
||||
self.index_file = self.cache_dir / "file_index.json"
|
||||
|
||||
# Setup logging
|
||||
self.logger = logging.getLogger(__name__)
|
||||
|
||||
# File extension mappings
|
||||
self.extension_categories = self._build_extension_map()
|
||||
|
||||
# Exclude patterns from config
|
||||
self.exclude_patterns = list(self.config.get('exclude_patterns', []))
|
||||
|
||||
# Initialize gitignore parser and add patterns
|
||||
self.gitignore_parser = GitignoreParser(str(self.root_path))
|
||||
self._load_gitignore_patterns()
|
||||
|
||||
# Performance settings
|
||||
self.max_file_size = self.config.get('performance', {}).get('max_file_size', 10485760)
|
||||
|
||||
def _build_extension_map(self) -> Dict[str, str]:
|
||||
"""Build mapping from file extensions to categories."""
|
||||
ext_map = {}
|
||||
for category, extensions in self.config.get('file_extensions', {}).items():
|
||||
for ext in extensions:
|
||||
ext_map[ext.lower()] = category
|
||||
return ext_map
|
||||
|
||||
def _load_gitignore_patterns(self):
|
||||
"""Load patterns from .gitignore files and add to exclude_patterns."""
|
||||
try:
|
||||
gitignore_patterns = self.gitignore_parser.parse_all_gitignores()
|
||||
|
||||
if gitignore_patterns:
|
||||
# Avoid duplicates
|
||||
existing_patterns = set(self.exclude_patterns)
|
||||
new_patterns = [p for p in gitignore_patterns if p not in existing_patterns]
|
||||
|
||||
self.exclude_patterns.extend(new_patterns)
|
||||
self.logger.info(f"Added {len(new_patterns)} patterns from .gitignore files")
|
||||
|
||||
except Exception as e:
|
||||
self.logger.warning(f"Failed to load .gitignore patterns: {e}")
|
||||
|
||||
def _should_exclude_file(self, file_path: Path) -> bool:
|
||||
"""Check if file should be excluded based on patterns and gitignore rules."""
|
||||
relative_path = str(file_path.relative_to(self.root_path))
|
||||
|
||||
# Check against exclude patterns from config
|
||||
for pattern in self.exclude_patterns:
|
||||
# Convert pattern to work with fnmatch
|
||||
if fnmatch.fnmatch(relative_path, pattern) or fnmatch.fnmatch(str(file_path), pattern):
|
||||
return True
|
||||
|
||||
# Check if any parent directory matches
|
||||
parts = relative_path.split(os.sep)
|
||||
for i in range(len(parts)):
|
||||
partial_path = "/".join(parts[:i+1])
|
||||
if fnmatch.fnmatch(partial_path, pattern):
|
||||
return True
|
||||
|
||||
# Also check gitignore rules using dedicated parser
|
||||
# Note: gitignore patterns are already included in self.exclude_patterns
|
||||
# but we can add additional gitignore-specific checking here if needed
|
||||
try:
|
||||
# The gitignore patterns are already loaded into exclude_patterns,
|
||||
# but we can do additional gitignore-specific checks if needed
|
||||
pass
|
||||
except Exception as e:
|
||||
self.logger.debug(f"Error in gitignore checking for {file_path}: {e}")
|
||||
|
||||
return False
|
||||
|
||||
def _estimate_tokens(self, file_path: Path) -> int:
|
||||
"""Estimate token count for a file (chars/4 approximation)."""
|
||||
try:
|
||||
if file_path.stat().st_size > self.max_file_size:
|
||||
return file_path.stat().st_size // 8 # Penalty for large files
|
||||
|
||||
with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
|
||||
content = f.read()
|
||||
return len(content) // 4 # Rough approximation
|
||||
except (UnicodeDecodeError, OSError):
|
||||
# Binary files or unreadable files
|
||||
return file_path.stat().st_size // 8
|
||||
|
||||
def _get_file_hash(self, file_path: Path) -> str:
|
||||
"""Get a hash of file metadata for change detection."""
|
||||
stat = file_path.stat()
|
||||
return hashlib.md5(f"{file_path}:{stat.st_size}:{stat.st_mtime}".encode()).hexdigest()
|
||||
|
||||
def _categorize_file(self, file_path: Path) -> str:
|
||||
"""Categorize file based on extension."""
|
||||
extension = file_path.suffix.lower()
|
||||
return self.extension_categories.get(extension, 'other')
|
||||
|
||||
def _scan_file(self, file_path: Path) -> Optional[FileInfo]:
|
||||
"""Scan a single file and create FileInfo."""
|
||||
try:
|
||||
if not file_path.is_file() or self._should_exclude_file(file_path):
|
||||
return None
|
||||
|
||||
stat = file_path.stat()
|
||||
relative_path = str(file_path.relative_to(self.root_path))
|
||||
|
||||
file_info = FileInfo(
|
||||
path=str(file_path),
|
||||
relative_path=relative_path,
|
||||
size=stat.st_size,
|
||||
modified_time=stat.st_mtime,
|
||||
extension=file_path.suffix.lower(),
|
||||
category=self._categorize_file(file_path),
|
||||
estimated_tokens=self._estimate_tokens(file_path),
|
||||
content_hash=self._get_file_hash(file_path)
|
||||
)
|
||||
|
||||
return file_info
|
||||
|
||||
except (OSError, PermissionError) as e:
|
||||
self.logger.warning(f"Could not scan file {file_path}: {e}")
|
||||
return None
|
||||
|
||||
def build_index(self, force_rebuild: bool = False) -> Dict[str, FileInfo]:
|
||||
"""Build or update the file index."""
|
||||
self.logger.info(f"Building file index for {self.root_path}")
|
||||
|
||||
# Load existing index if available
|
||||
existing_index = {}
|
||||
if not force_rebuild and self.index_file.exists():
|
||||
existing_index = self.load_index()
|
||||
|
||||
new_index = {}
|
||||
changed_files = 0
|
||||
|
||||
# Walk through all files
|
||||
for file_path in self.root_path.rglob('*'):
|
||||
if not file_path.is_file():
|
||||
continue
|
||||
|
||||
file_info = self._scan_file(file_path)
|
||||
if file_info is None:
|
||||
continue
|
||||
|
||||
# Check if file has changed
|
||||
relative_path = file_info.relative_path
|
||||
if relative_path in existing_index:
|
||||
old_info = existing_index[relative_path]
|
||||
if old_info.content_hash == file_info.content_hash:
|
||||
# File unchanged, keep old info
|
||||
new_index[relative_path] = old_info
|
||||
continue
|
||||
|
||||
# File is new or changed
|
||||
new_index[relative_path] = file_info
|
||||
changed_files += 1
|
||||
|
||||
self.logger.info(f"Indexed {len(new_index)} files ({changed_files} new/changed)")
|
||||
|
||||
# Save index
|
||||
self.save_index(new_index)
|
||||
|
||||
return new_index
|
||||
|
||||
def load_index(self) -> Dict[str, FileInfo]:
|
||||
"""Load file index from cache."""
|
||||
if not self.index_file.exists():
|
||||
return {}
|
||||
|
||||
try:
|
||||
with open(self.index_file, 'r', encoding='utf-8') as f:
|
||||
data = json.load(f)
|
||||
index = {}
|
||||
for path, info_dict in data.get('files', {}).items():
|
||||
index[path] = FileInfo.from_dict(info_dict)
|
||||
return index
|
||||
except (json.JSONDecodeError, KeyError) as e:
|
||||
self.logger.warning(f"Could not load index: {e}")
|
||||
return {}
|
||||
|
||||
def save_index(self, index: Dict[str, FileInfo]) -> None:
|
||||
"""Save file index to cache."""
|
||||
try:
|
||||
            # Calculate stats
            stats = self._calculate_stats(index)

            data = {
                'stats': stats.to_dict(),
                'files': {path: info.to_dict() for path, info in index.items()}
            }

            with open(self.index_file, 'w', encoding='utf-8') as f:
                json.dump(data, f, indent=2)

        except OSError as e:
            self.logger.error(f"Could not save index: {e}")

    def _calculate_stats(self, index: Dict[str, FileInfo]) -> IndexStats:
        """Calculate statistics for the index."""
        total_files = len(index)
        total_tokens = sum(info.estimated_tokens for info in index.values())
        total_size = sum(info.size for info in index.values())

        categories = {}
        for info in index.values():
            categories[info.category] = categories.get(info.category, 0) + 1

        return IndexStats(
            total_files=total_files,
            total_tokens=total_tokens,
            total_size=total_size,
            categories=categories,
            last_updated=time.time()
        )

    def get_stats(self) -> Optional[IndexStats]:
        """Get statistics about the current index."""
        if not self.index_file.exists():
            return None

        try:
            with open(self.index_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
                return IndexStats(**data.get('stats', {}))
        except (json.JSONDecodeError, KeyError):
            return None

    def find_files_by_pattern(self, pattern: str, index: Optional[Dict[str, FileInfo]] = None) -> List[FileInfo]:
        """Find files matching a glob pattern."""
        if index is None:
            index = self.load_index()

        matching_files = []
        for path, info in index.items():
            if fnmatch.fnmatch(path, pattern) or fnmatch.fnmatch(info.path, pattern):
                matching_files.append(info)

        return matching_files

    def find_files_by_category(self, category: str, index: Optional[Dict[str, FileInfo]] = None) -> List[FileInfo]:
        """Find files by category (code, docs, config, etc.)."""
        if index is None:
            index = self.load_index()

        return [info for info in index.values() if info.category == category]

    def find_files_by_keywords(self, keywords: List[str], index: Optional[Dict[str, FileInfo]] = None) -> List[FileInfo]:
        """Find files whose paths contain any of the specified keywords."""
        if index is None:
            index = self.load_index()

        matching_files = []
        keywords_lower = [kw.lower() for kw in keywords]

        for info in index.values():
            path_lower = info.relative_path.lower()
            if any(keyword in path_lower for keyword in keywords_lower):
                matching_files.append(info)

        return matching_files

    def get_recent_files(self, limit: int = 20, index: Optional[Dict[str, FileInfo]] = None) -> List[FileInfo]:
        """Get most recently modified files."""
        if index is None:
            index = self.load_index()

        files = list(index.values())
        files.sort(key=lambda f: f.modified_time, reverse=True)
        return files[:limit]


def main():
    """Command-line interface for file indexer."""
    import yaml
    import argparse

    parser = argparse.ArgumentParser(description="File Indexer for UltraThink")
    parser.add_argument("--config", default="config.yaml", help="Configuration file path")
    parser.add_argument("--rebuild", action="store_true", help="Force rebuild index")
    parser.add_argument("--stats", action="store_true", help="Show index statistics")
    parser.add_argument("--pattern", help="Find files matching pattern")

    args = parser.parse_args()

    # Load configuration
    config_path = Path(__file__).parent / args.config
    with open(config_path, 'r', encoding='utf-8') as f:
        config = yaml.safe_load(f)

    # Setup logging
    logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')

    # Create indexer
    indexer = FileIndexer(config)

    if args.stats:
        stats = indexer.get_stats()
        if stats:
            print(f"Total files: {stats.total_files}")
            print(f"Total tokens: {stats.total_tokens:,}")
            print(f"Total size: {stats.total_size:,} bytes")
            print(f"Categories: {stats.categories}")
            print(f"Last updated: {datetime.fromtimestamp(stats.last_updated)}")
        else:
            print("No index found. Run without --stats to build index.")
        return

    # Build index
    index = indexer.build_index(force_rebuild=args.rebuild)

    if args.pattern:
        files = indexer.find_files_by_pattern(args.pattern, index)
        print(f"Found {len(files)} files matching pattern '{args.pattern}':")
        for file_info in files[:20]:  # Limit output
            print(f"  {file_info.relative_path}")
    else:
        stats = indexer._calculate_stats(index)
        print(f"Index built: {stats.total_files} files, ~{stats.total_tokens:,} tokens")


if __name__ == "__main__":
    main()
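A minimal usage sketch of the indexer API above (illustrative only, not part of the commit). The `core.file_indexer` import path and the presence of a `config.yaml` next to the caller are assumptions; the calls themselves (`build_index`, `get_recent_files`) are the ones defined in this file.

import yaml
from core.file_indexer import FileIndexer  # import path is an assumption

with open("config.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

indexer = FileIndexer(config)
index = indexer.build_index()  # scan the project and cache the index to disk
for info in indexer.get_recent_files(limit=5, index=index):
    print(info.relative_path, info.estimated_tokens)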
182
.claude/python_script/core/gitignore_parser.py
Normal file
@@ -0,0 +1,182 @@
#!/usr/bin/env python3
"""
GitIgnore Parser Module
Parses .gitignore files and converts rules to fnmatch patterns for file exclusion.
"""

import os
import fnmatch
from pathlib import Path
from typing import List, Set, Optional


class GitignoreParser:
    """Parser for .gitignore files that converts rules to fnmatch patterns."""

    def __init__(self, root_path: str = "."):
        self.root_path = Path(root_path).resolve()
        self.patterns: List[str] = []
        self.negation_patterns: List[str] = []

    def parse_file(self, gitignore_path: str) -> List[str]:
        """Parse a .gitignore file and return exclude patterns."""
        gitignore_file = Path(gitignore_path)
        if not gitignore_file.exists():
            return []

        patterns = []
        try:
            with open(gitignore_file, 'r', encoding='utf-8') as f:
                for line_num, line in enumerate(f, 1):
                    pattern = self._parse_line(line.strip())
                    if pattern:
                        patterns.append(pattern)
        except (UnicodeDecodeError, IOError):
            # Fallback to system encoding if UTF-8 fails
            try:
                with open(gitignore_file, 'r') as f:
                    for line_num, line in enumerate(f, 1):
                        pattern = self._parse_line(line.strip())
                        if pattern:
                            patterns.append(pattern)
            except IOError:
                # If file can't be read, return empty list
                return []

        return patterns

    def _parse_line(self, line: str) -> Optional[str]:
        """Parse a single line from .gitignore file."""
        # Skip empty lines and comments
        if not line or line.startswith('#'):
            return None

        # Handle negation patterns (starting with !)
        if line.startswith('!'):
            # For now, we'll skip negation patterns as they require
            # more complex logic to implement correctly
            return None

        # Convert gitignore pattern to fnmatch pattern
        return self._convert_to_fnmatch(line)

    def _convert_to_fnmatch(self, pattern: str) -> str:
        """Convert gitignore pattern to fnmatch pattern."""
        # Remove trailing slash (directory indicator)
        if pattern.endswith('/'):
            pattern = pattern[:-1]

        # Handle absolute paths (starting with /)
        if pattern.startswith('/'):
            pattern = pattern[1:]
            # Make it match from root
            return pattern

        # Handle patterns that should match anywhere in the tree
        # If pattern doesn't contain '/', it matches files/dirs at any level
        if '/' not in pattern:
            return f"*/{pattern}"

        # Pattern contains '/', so it's relative to the gitignore location
        return pattern

    def parse_all_gitignores(self, root_path: Optional[str] = None) -> List[str]:
        """Parse all .gitignore files in the repository hierarchy."""
        if root_path:
            self.root_path = Path(root_path).resolve()

        all_patterns = []

        # Find all .gitignore files in the repository
        gitignore_files = self._find_gitignore_files()

        for gitignore_file in gitignore_files:
            patterns = self.parse_file(gitignore_file)
            all_patterns.extend(patterns)

        return all_patterns

    def _find_gitignore_files(self) -> List[Path]:
        """Find all .gitignore files in the repository."""
        gitignore_files = []

        # Start with root .gitignore
        root_gitignore = self.root_path / '.gitignore'
        if root_gitignore.exists():
            gitignore_files.append(root_gitignore)

        # Find .gitignore files in subdirectories
        try:
            for gitignore_file in self.root_path.rglob('.gitignore'):
                if gitignore_file != root_gitignore:
                    gitignore_files.append(gitignore_file)
        except (PermissionError, OSError):
            # Skip directories we can't access
            pass

        return gitignore_files

    def should_exclude(self, file_path: str, gitignore_patterns: List[str]) -> bool:
        """Check if a file should be excluded based on gitignore patterns."""
        # Convert to relative path from root
        try:
            rel_path = str(Path(file_path).relative_to(self.root_path))
        except ValueError:
            # File is not under root path
            return False

        # Normalize path separators for consistent matching
        rel_path = rel_path.replace(os.sep, '/')

        for pattern in gitignore_patterns:
            if self._matches_pattern(rel_path, pattern):
                return True

        return False

    def _matches_pattern(self, file_path: str, pattern: str) -> bool:
        """Check if a file path matches a gitignore pattern."""
        # Normalize pattern separators
        pattern = pattern.replace(os.sep, '/')

        # Handle different pattern types
        if pattern.startswith('*/'):
            # Pattern like */pattern - matches at any level
            sub_pattern = pattern[2:]
            return fnmatch.fnmatch(file_path, f"*/{sub_pattern}") or fnmatch.fnmatch(file_path, sub_pattern)
        elif '/' in pattern:
            # Pattern contains slash - match exact path
            return fnmatch.fnmatch(file_path, pattern)
        else:
            # Simple pattern - match filename or directory at any level
            parts = file_path.split('/')
            return any(fnmatch.fnmatch(part, pattern) for part in parts)


def parse_gitignore(gitignore_path: str) -> List[str]:
    """Convenience function to parse a single .gitignore file."""
    parser = GitignoreParser()
    return parser.parse_file(gitignore_path)


def get_all_gitignore_patterns(root_path: str = ".") -> List[str]:
    """Convenience function to get all gitignore patterns in a repository."""
    parser = GitignoreParser(root_path)
    return parser.parse_all_gitignores()


if __name__ == "__main__":
    import sys

    if len(sys.argv) > 1:
        gitignore_path = sys.argv[1]
        patterns = parse_gitignore(gitignore_path)
        print(f"Parsed {len(patterns)} patterns from {gitignore_path}:")
        for pattern in patterns:
            print(f"  {pattern}")
    else:
        # Parse all .gitignore files in current directory
        patterns = get_all_gitignore_patterns()
        print(f"Found {len(patterns)} gitignore patterns:")
        for pattern in patterns:
            print(f"  {pattern}")
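A quick illustration of the conversion rules implemented by _parse_line and _convert_to_fnmatch above (illustrative only, not part of the commit; the import path is an assumption, and the private _parse_line is called directly purely for demonstration).

from core.gitignore_parser import GitignoreParser  # import path is an assumption

parser = GitignoreParser(root_path=".")
print(parser._parse_line("__pycache__/"))  # -> '*/__pycache__' (directory name, matched at any depth)
print(parser._parse_line("/build"))        # -> 'build' (anchored to the repository root)
print(parser._parse_line("docs/*.tmp"))    # -> 'docs/*.tmp' (contains '/', kept relative as-is)
print(parser._parse_line("# comment"))     # -> None (comments and blank lines are skipped)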
500
.claude/python_script/core/path_matcher.py
Normal file
@@ -0,0 +1,500 @@
#!/usr/bin/env python3
"""
Path Matcher Module for UltraThink Path-Aware Analyzer
Matches files to analysis context and ranks them by relevance.
"""

import re
import logging
import fnmatch
from typing import Dict, List, Tuple, Optional, Set
from dataclasses import dataclass
from pathlib import Path
import math

from .file_indexer import FileInfo
from .context_analyzer import AnalysisResult


@dataclass
class MatchResult:
    """Result of path matching with relevance score."""
    file_info: FileInfo
    relevance_score: float
    match_reasons: List[str]
    category_bonus: float


@dataclass
class PathMatchingResult:
    """Complete result of path matching operation."""
    matched_files: List[MatchResult]
    total_tokens: int
    categories: Dict[str, int]
    patterns_used: List[str]
    confidence_score: float


class PathMatcher:
    """Matches files to analysis context using various algorithms."""

    def __init__(self, config: Dict):
        self.config = config
        self.logger = logging.getLogger(__name__)

        # Load scoring weights
        self.weights = config.get('path_matching', {}).get('weights', {
            'keyword_match': 0.4,
            'extension_match': 0.2,
            'directory_context': 0.2,
            'file_size_penalty': 0.1,
            'recency_bonus': 0.1
        })

        # Load limits
        self.max_files_per_category = config.get('path_matching', {}).get('max_files_per_category', 20)
        self.min_relevance_score = config.get('path_matching', {}).get('min_relevance_score', 0.1)
        self.max_total_files = config.get('output', {}).get('max_total_files', 50)

        # Load always-include patterns
        self.always_include = config.get('output', {}).get('always_include', [])

        # Category priorities
        self.category_priorities = {
            'code': 1.0,
            'config': 0.8,
            'docs': 0.6,
            'web': 0.4,
            'other': 0.2
        }

    def _calculate_keyword_score(self, file_info: FileInfo, keywords: List[str]) -> Tuple[float, List[str]]:
        """Calculate score based on keyword matches in file path."""
        if not keywords:
            return 0.0, []

        path_lower = file_info.relative_path.lower()
        filename_lower = Path(file_info.relative_path).name.lower()

        matches = []
        score = 0.0

        for keyword in keywords:
            keyword_lower = keyword.lower()

            # Exact filename match (highest weight)
            if keyword_lower in filename_lower:
                score += 2.0
                matches.append(f"filename:{keyword}")
                continue

            # Directory name match
            if keyword_lower in path_lower:
                score += 1.0
                matches.append(f"path:{keyword}")
                continue

            # Partial match in path components
            path_parts = path_lower.split('/')
            for part in path_parts:
                if keyword_lower in part:
                    score += 0.5
                    matches.append(f"partial:{keyword}")
                    break

        # Normalize by number of keywords
        normalized_score = score / len(keywords) if keywords else 0.0
        return min(normalized_score, 1.0), matches

    def _calculate_extension_score(self, file_info: FileInfo, languages: List[str]) -> float:
        """Calculate score based on file extension relevance."""
        if not languages:
            return 0.5  # Neutral score

        extension = file_info.extension.lower()

        # Language-specific extension mapping
        lang_extensions = {
            'python': ['.py', '.pyx', '.pyi'],
            'javascript': ['.js', '.jsx', '.mjs'],
            'typescript': ['.ts', '.tsx'],
            'java': ['.java'],
            'go': ['.go'],
            'rust': ['.rs'],
            'cpp': ['.cpp', '.cc', '.cxx', '.c', '.h', '.hpp'],
            'csharp': ['.cs'],
            'php': ['.php'],
            'ruby': ['.rb'],
            'shell': ['.sh', '.bash', '.zsh']
        }

        score = 0.0
        for language in languages:
            if language in lang_extensions:
                if extension in lang_extensions[language]:
                    score = 1.0
                    break

        # Fallback to category-based scoring
        if score == 0.0:
            category_scores = {
                'code': 1.0,
                'config': 0.8,
                'docs': 0.6,
                'web': 0.4,
                'other': 0.2
            }
            score = category_scores.get(file_info.category, 0.2)

        return score

    def _calculate_directory_score(self, file_info: FileInfo, domains: List[str]) -> Tuple[float, List[str]]:
        """Calculate score based on directory context."""
        if not domains:
            return 0.0, []

        path_parts = file_info.relative_path.lower().split('/')
        matches = []
        score = 0.0

        # Domain-specific directory patterns
        domain_patterns = {
            'auth': ['auth', 'authentication', 'login', 'user', 'account'],
            'authentication': ['auth', 'authentication', 'login', 'user', 'account'],
            'database': ['db', 'database', 'model', 'entity', 'migration', 'schema'],
            'api': ['api', 'rest', 'graphql', 'route', 'controller', 'handler'],
            'frontend': ['ui', 'component', 'view', 'template', 'client', 'web'],
            'backend': ['service', 'server', 'core', 'business', 'logic'],
            'test': ['test', 'spec', 'tests', '__tests__', 'testing'],
            'testing': ['test', 'spec', 'tests', '__tests__', 'testing'],
            'config': ['config', 'configuration', 'env', 'settings'],
            'configuration': ['config', 'configuration', 'env', 'settings'],
            'util': ['util', 'utils', 'helper', 'common', 'shared', 'lib'],
            'utility': ['util', 'utils', 'helper', 'common', 'shared', 'lib']
        }

        for domain in domains:
            if domain in domain_patterns:
                patterns = domain_patterns[domain]
                for pattern in patterns:
                    for part in path_parts:
                        if pattern in part:
                            score += 1.0
                            matches.append(f"dir:{domain}->{pattern}")
                            break

        # Normalize by number of domains
        normalized_score = score / len(domains) if domains else 0.0
        return min(normalized_score, 1.0), matches

    def _calculate_size_penalty(self, file_info: FileInfo) -> float:
        """Calculate penalty for very large files."""
        max_size = self.config.get('performance', {}).get('max_file_size', 10485760)  # 10MB

        if file_info.size > max_size:
            # Heavy penalty for oversized files
            return -0.5
        elif file_info.size > max_size * 0.5:
            # Light penalty for large files
            return -0.2
        else:
            return 0.0

    def _calculate_recency_bonus(self, file_info: FileInfo) -> float:
        """Calculate bonus for recently modified files."""
        import time

        current_time = time.time()
        file_age = current_time - file_info.modified_time

        # Files modified in last day get bonus
        if file_age < 86400:  # 1 day
            return 0.3
        elif file_age < 604800:  # 1 week
            return 0.1
        else:
            return 0.0

    def calculate_relevance_score(self, file_info: FileInfo, analysis: AnalysisResult) -> MatchResult:
        """Calculate overall relevance score for a file."""
        # Calculate individual scores
        keyword_score, keyword_matches = self._calculate_keyword_score(file_info, analysis.keywords)
        extension_score = self._calculate_extension_score(file_info, analysis.languages)
        directory_score, dir_matches = self._calculate_directory_score(file_info, analysis.domains)
        size_penalty = self._calculate_size_penalty(file_info)
        recency_bonus = self._calculate_recency_bonus(file_info)

        # Apply weights
        weighted_score = (
            keyword_score * self.weights.get('keyword_match', 0.4) +
            extension_score * self.weights.get('extension_match', 0.2) +
            directory_score * self.weights.get('directory_context', 0.2) +
            size_penalty * self.weights.get('file_size_penalty', 0.1) +
            recency_bonus * self.weights.get('recency_bonus', 0.1)
        )

        # Category bonus
        category_bonus = self.category_priorities.get(file_info.category, 0.2)

        # Final score with category bonus
        final_score = weighted_score + (category_bonus * 0.1)

        # Collect match reasons
        match_reasons = keyword_matches + dir_matches
        if extension_score > 0.5:
            match_reasons.append(f"extension:{file_info.extension}")
        if recency_bonus > 0:
            match_reasons.append("recent")

        return MatchResult(
            file_info=file_info,
            relevance_score=max(0.0, final_score),
            match_reasons=match_reasons,
            category_bonus=category_bonus
        )

    def match_by_patterns(self, file_index: Dict[str, FileInfo], patterns: List[str]) -> List[FileInfo]:
        """Match files using explicit glob patterns."""
        matched_files = []

        for pattern in patterns:
            for path, file_info in file_index.items():
                # Try matching the relative path, the full path, and the bare filename
                if (fnmatch.fnmatch(path, pattern) or
                        fnmatch.fnmatch(file_info.path, pattern) or
                        fnmatch.fnmatch(Path(path).name, pattern)):
                    matched_files.append(file_info)

        # Remove duplicates based on path
        seen_paths = set()
        unique_files = []
        for file_info in matched_files:
            if file_info.relative_path not in seen_paths:
                seen_paths.add(file_info.relative_path)
                unique_files.append(file_info)
        return unique_files

    def match_always_include(self, file_index: Dict[str, FileInfo]) -> List[FileInfo]:
        """Match files that should always be included."""
        return self.match_by_patterns(file_index, self.always_include)

    def rank_files(self, files: List[FileInfo], analysis: AnalysisResult) -> List[MatchResult]:
        """Rank files by relevance score."""
        match_results = []

        for file_info in files:
            match_result = self.calculate_relevance_score(file_info, analysis)
            if match_result.relevance_score >= self.min_relevance_score:
                match_results.append(match_result)

        # Sort by relevance score (descending)
        match_results.sort(key=lambda x: x.relevance_score, reverse=True)

        return match_results

    def select_best_files(self, ranked_files: List[MatchResult], token_limit: Optional[int] = None) -> List[MatchResult]:
        """Select the best files within token limits and category constraints."""
        if not ranked_files:
            return []

        selected_files = []
        total_tokens = 0
        category_counts = {}

        for match_result in ranked_files:
            file_info = match_result.file_info
            category = file_info.category

            # Check category limit
            if category_counts.get(category, 0) >= self.max_files_per_category:
                continue

            # Check token limit
            if token_limit and total_tokens + file_info.estimated_tokens > token_limit:
                continue

            # Check total file limit
            if len(selected_files) >= self.max_total_files:
                break

            # Add file
            selected_files.append(match_result)
            total_tokens += file_info.estimated_tokens
            category_counts[category] = category_counts.get(category, 0) + 1

        return selected_files

    def match_files(self, file_index: Dict[str, FileInfo], analysis: AnalysisResult,
                    token_limit: Optional[int] = None, explicit_patterns: Optional[List[str]] = None) -> PathMatchingResult:
        """Main file matching function."""
        self.logger.info(f"Matching files for analysis with {len(analysis.keywords)} keywords and {len(analysis.domains)} domains")

        # Start with always-include files
        always_include_files = self.match_always_include(file_index)
        self.logger.debug(f"Always include: {len(always_include_files)} files")

        # Add explicit pattern matches
        pattern_files = []
        patterns_used = []
        if explicit_patterns:
            pattern_files = self.match_by_patterns(file_index, explicit_patterns)
            patterns_used.extend(explicit_patterns)
            self.logger.debug(f"Explicit patterns: {len(pattern_files)} files")

        # Add suggested pattern matches
        if analysis.file_patterns:
            suggested_files = self.match_by_patterns(file_index, analysis.file_patterns)
            pattern_files.extend(suggested_files)
            patterns_used.extend(analysis.file_patterns)
            self.logger.debug(f"Suggested patterns: {len(suggested_files)} files")

        # Combine all candidate files and remove duplicates
        all_files = always_include_files + pattern_files + list(file_index.values())
        seen_paths = set()
        all_candidates = []
        for file_info in all_files:
            if file_info.relative_path not in seen_paths:
                seen_paths.add(file_info.relative_path)
                all_candidates.append(file_info)
        self.logger.debug(f"Total candidates: {len(all_candidates)} files")

        # Rank all candidates
        ranked_files = self.rank_files(all_candidates, analysis)
        self.logger.debug(f"Files above threshold: {len(ranked_files)}")

        # Select best files within limits
        selected_files = self.select_best_files(ranked_files, token_limit)
        self.logger.info(f"Selected {len(selected_files)} files")

        # Calculate statistics
        total_tokens = sum(match.file_info.estimated_tokens for match in selected_files)
        categories = {}
        for match in selected_files:
            category = match.file_info.category
            categories[category] = categories.get(category, 0) + 1

        # Calculate confidence score
        confidence_score = self._calculate_confidence(selected_files, analysis)

        return PathMatchingResult(
            matched_files=selected_files,
            total_tokens=total_tokens,
            categories=categories,
            patterns_used=patterns_used,
            confidence_score=confidence_score
        )

    def _calculate_confidence(self, selected_files: List[MatchResult], analysis: AnalysisResult) -> float:
        """Calculate confidence score for the matching result."""
        if not selected_files:
            return 0.0

        # Average relevance score
        avg_relevance = sum(match.relevance_score for match in selected_files) / len(selected_files)

        # Keyword coverage (how many keywords are represented)
        keyword_coverage = 0.0
        if analysis.keywords:
            covered_keywords = set()
            for match in selected_files:
                for reason in match.match_reasons:
                    if reason.startswith('filename:') or reason.startswith('path:'):
                        keyword = reason.split(':', 1)[1]
                        covered_keywords.add(keyword)
            keyword_coverage = len(covered_keywords) / len(analysis.keywords)

        # Domain coverage
        domain_coverage = 0.0
        if analysis.domains:
            covered_domains = set()
            for match in selected_files:
                for reason in match.match_reasons:
                    if reason.startswith('dir:'):
                        domain = reason.split('->', 1)[0].split(':', 1)[1]
                        covered_domains.add(domain)
            domain_coverage = len(covered_domains) / len(analysis.domains)

        # Weighted confidence score
        confidence = (
            avg_relevance * 0.5 +
            keyword_coverage * 0.3 +
            domain_coverage * 0.2
        )

        return min(confidence, 1.0)

    def format_patterns(self, selected_files: List[MatchResult]) -> List[str]:
        """Format selected files as @{pattern} strings."""
        pattern_format = self.config.get('output', {}).get('pattern_format', '@{{{path}}}')

        patterns = []
        for match in selected_files:
            pattern = pattern_format.format(path=match.file_info.relative_path)
            patterns.append(pattern)

        return patterns


def main():
    """Command-line interface for path matcher."""
    import yaml
    import argparse
    import json
    from .file_indexer import FileIndexer
    from .context_analyzer import ContextAnalyzer

    parser = argparse.ArgumentParser(description="Path Matcher for UltraThink")
    parser.add_argument("prompt", help="Prompt to analyze and match")
    parser.add_argument("--config", default="config.yaml", help="Configuration file path")
    parser.add_argument("--token-limit", type=int, help="Token limit for selection")
    parser.add_argument("--patterns", nargs="*", help="Explicit patterns to include")
    parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")

    args = parser.parse_args()

    # Setup logging
    level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(level=level, format='%(levelname)s: %(message)s')

    # Load configuration
    config_path = Path(__file__).parent / args.config
    with open(config_path, 'r', encoding='utf-8') as f:
        config = yaml.safe_load(f)

    # Create components
    indexer = FileIndexer(config)
    analyzer = ContextAnalyzer(config)
    matcher = PathMatcher(config)

    # Build file index
    file_index = indexer.load_index()
    if not file_index:
        print("Building file index...")
        file_index = indexer.build_index()

    # Analyze prompt
    analysis = analyzer.analyze(args.prompt)

    # Match files
    result = matcher.match_files(
        file_index=file_index,
        analysis=analysis,
        token_limit=args.token_limit,
        explicit_patterns=args.patterns
    )

    # Output results
    print(f"Matched {len(result.matched_files)} files (~{result.total_tokens:,} tokens)")
    print(f"Categories: {result.categories}")
    print(f"Confidence: {result.confidence_score:.2f}")
    print()

    patterns = matcher.format_patterns(result.matched_files)
    print("Patterns:")
    for pattern in patterns[:20]:  # Limit output
        print(f"  {pattern}")

    if args.verbose:
        print("\nDetailed matches:")
        for match in result.matched_files[:10]:
            print(f"  {match.file_info.relative_path} (score: {match.relevance_score:.3f})")
            print(f"    Reasons: {', '.join(match.match_reasons)}")


if __name__ == "__main__":
    main()
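A worked example of the default relevance formula in calculate_relevance_score above (illustrative only, not part of the commit; the weights are the defaults set in __init__, and the individual scores are made-up inputs).

# A file whose name, extension and directory all match the analysis, small, edited today:
keyword_score, extension_score, directory_score = 1.0, 1.0, 1.0
size_penalty, recency_bonus = 0.0, 0.3

weighted = (keyword_score * 0.4 + extension_score * 0.2 + directory_score * 0.2
            + size_penalty * 0.1 + recency_bonus * 0.1)  # 0.83
final_score = weighted + 1.0 * 0.1                       # 'code' category bonus -> 0.93
print(round(final_score, 2))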