refactor: Update workflow plan system and template organization

- Remove the --analyze|--deep parameters from plan.md; use the default analysis instead
- Rename the .analysis directory to .process for better organization
- Create an ANALYSIS_RESULTS.md template focused on verified results
- Add the .process folder to the file structure in workflow-architecture.md
- Emphasize verification of referenced files, methods, and commands in the template
- Prevent execution errors caused by non-existent references

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
Author: catlog22
Date:   2025-09-18 16:26:50 +08:00
Parent: 9167e4e39e
Commit: fc6e851230

49 changed files with 5865 additions and 174 deletions


@@ -0,0 +1,25 @@
"""
Core modules for the Python script analyzer.
Provides unified interfaces for file indexing, context analysis, and path matching.
"""
from .config import Config
from .file_indexer import FileIndexer, FileInfo, IndexStats
from .context_analyzer import ContextAnalyzer, AnalysisResult
from .path_matcher import PathMatcher, MatchResult, PathMatchingResult
from .embedding_manager import EmbeddingManager
from .gitignore_parser import GitignoreParser
__all__ = [
'Config',
'FileIndexer',
'FileInfo',
'IndexStats',
'ContextAnalyzer',
'AnalysisResult',
'PathMatcher',
'MatchResult',
'PathMatchingResult',
'EmbeddingManager',
'GitignoreParser'
]
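
A quick end-to-end sketch of how the new core modules fit together, assuming the package added in this commit is importable as `core` (the diff does not show the directory name) and that a config.yaml or the built-in defaults are available:

```python
# Illustrative usage only; `core` is an assumed package name.
from core import Config, FileIndexer, ContextAnalyzer, PathMatcher

config = Config()                       # singleton; falls back to built-in defaults
indexer = FileIndexer(config)           # accepts a Config object or a plain dict
index = indexer.build_index()           # {relative_path: FileInfo}

analyzer = ContextAnalyzer(config.to_dict())
analysis = analyzer.analyze("add a login endpoint to the REST API")

matcher = PathMatcher(config.to_dict())
for match in matcher.rank_files(list(index.values()), analysis)[:10]:
    print(f"{match.relevance_score:.2f}  {match.file_info.relative_path}")
```

FileIndexer accepting either a Config object or a plain dict (via to_dict) is what keeps the older dict-based call sites working.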


@@ -0,0 +1,327 @@
#!/usr/bin/env python3
"""
Configuration Management Module
Provides unified configuration management with gitignore integration.
"""
import os
import yaml
import logging
from pathlib import Path
from typing import Dict, Any, Optional, List
from .gitignore_parser import get_all_gitignore_patterns
class Config:
"""Singleton configuration manager with hierarchical loading."""
_instance = None
_initialized = False
def __new__(cls, config_path: Optional[str] = None):
if cls._instance is None:
cls._instance = super(Config, cls).__new__(cls)
return cls._instance
def __init__(self, config_path: Optional[str] = None):
if self._initialized:
return
self.config_path = config_path
self.config = {}
self.logger = logging.getLogger(__name__)
self._load_config()
self._add_gitignore_patterns()
self._apply_env_overrides()
self._validate_config()
self._initialized = True
def _load_config(self):
"""Load configuration from file with fallback hierarchy."""
config_paths = self._get_config_paths()
for config_file in config_paths:
if config_file.exists():
try:
with open(config_file, 'r', encoding='utf-8') as f:
loaded_config = yaml.safe_load(f)
if loaded_config:
self.config = self._merge_configs(self.config, loaded_config)
self.logger.info(f"Loaded config from {config_file}")
except Exception as e:
self.logger.warning(f"Failed to load config from {config_file}: {e}")
# Apply default config if no config loaded
if not self.config:
self.config = self._get_default_config()
self.logger.info("Using default configuration")
def _get_config_paths(self) -> List[Path]:
"""Get ordered list of config file paths to check."""
paths = []
# 1. Explicitly provided config path
if self.config_path:
paths.append(Path(self.config_path))
# 2. Current directory config.yaml
paths.append(Path('config.yaml'))
# 3. Script directory config.yaml
script_dir = Path(__file__).parent.parent
paths.append(script_dir / 'config.yaml')
# 4. Default config in script directory
paths.append(script_dir / 'default_config.yaml')
return paths
def _get_default_config(self) -> Dict[str, Any]:
"""Get default configuration."""
return {
'token_limits': {
'small_project': 500000,
'medium_project': 2000000,
'large_project': 10000000,
'max_files': 1000
},
'exclude_patterns': [
"*/node_modules/*",
"*/.git/*",
"*/build/*",
"*/dist/*",
"*/.next/*",
"*/.nuxt/*",
"*/target/*",
"*/vendor/*",
"*/__pycache__/*",
"*.pyc",
"*.pyo",
"*.log",
"*.tmp",
"*.temp",
"*.history"
],
'file_extensions': {
'code': ['.py', '.js', '.ts', '.tsx', '.jsx', '.java', '.cpp', '.c', '.h', '.rs', '.go', '.php', '.rb', '.sh', '.bash'],
'docs': ['.md', '.txt', '.rst', '.adoc'],
'config': ['.json', '.yaml', '.yml', '.toml', '.ini', '.env'],
'web': ['.html', '.css', '.scss', '.sass', '.xml']
},
'embedding': {
'enabled': True,
'model': 'all-MiniLM-L6-v2',
'cache_dir': 'cache',
'similarity_threshold': 0.3,
'max_context_length': 512,
'batch_size': 32
},
'context_analysis': {
'domain_keywords': {
'auth': ['auth', 'login', 'user', 'password', 'jwt', 'token', 'session'],
'database': ['db', 'database', 'sql', 'query', 'model', 'schema', 'migration'],
'api': ['api', 'endpoint', 'route', 'controller', 'service', 'handler'],
'frontend': ['ui', 'component', 'view', 'template', 'style', 'css'],
'backend': ['server', 'service', 'logic', 'business', 'core'],
'test': ['test', 'spec', 'unit', 'integration', 'mock'],
'config': ['config', 'setting', 'environment', 'env'],
'util': ['util', 'helper', 'common', 'shared', 'lib']
},
'language_indicators': {
'python': ['.py', 'python', 'pip', 'requirements.txt', 'setup.py'],
'javascript': ['.js', '.ts', 'npm', 'package.json', 'node'],
'java': ['.java', 'maven', 'gradle', 'pom.xml'],
'go': ['.go', 'go.mod', 'go.sum'],
'rust': ['.rs', 'cargo', 'Cargo.toml']
}
},
'path_matching': {
'weights': {
'keyword_match': 0.4,
'extension_match': 0.2,
'directory_context': 0.2,
'file_size_penalty': 0.1,
'recency_bonus': 0.1
},
'max_files_per_category': 20,
'min_relevance_score': 0.1
},
'output': {
'pattern_format': '@{{{path}}}',
'always_include': [
'CLAUDE.md',
'**/CLAUDE.md',
'README.md',
'docs/**/*.md'
],
'max_total_files': 50
},
'performance': {
'cache_enabled': True,
'cache_ttl': 3600,
'max_file_size': 10485760,
'max_workers': 4
},
'logging': {
'level': 'INFO',
'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
}
}
def _merge_configs(self, base: Dict, override: Dict) -> Dict:
"""Recursively merge configuration dictionaries."""
result = base.copy()
for key, value in override.items():
if key in result and isinstance(result[key], dict) and isinstance(value, dict):
result[key] = self._merge_configs(result[key], value)
else:
result[key] = value
return result
def _add_gitignore_patterns(self):
"""Add patterns from .gitignore files to exclude_patterns."""
try:
# Find root directory (current working directory or script parent)
root_dir = Path.cwd()
gitignore_patterns = get_all_gitignore_patterns(str(root_dir))
if gitignore_patterns:
# Ensure exclude_patterns exists
if 'exclude_patterns' not in self.config:
self.config['exclude_patterns'] = []
# Add gitignore patterns, avoiding duplicates
existing_patterns = set(self.config['exclude_patterns'])
new_patterns = [p for p in gitignore_patterns if p not in existing_patterns]
self.config['exclude_patterns'].extend(new_patterns)
self.logger.info(f"Added {len(new_patterns)} patterns from .gitignore files")
except Exception as e:
self.logger.warning(f"Failed to load .gitignore patterns: {e}")
def _apply_env_overrides(self):
"""Apply environment variable overrides."""
env_mappings = {
'ANALYZER_CACHE_DIR': ('embedding', 'cache_dir'),
'ANALYZER_LOG_LEVEL': ('logging', 'level'),
'ANALYZER_MAX_FILES': ('token_limits', 'max_files'),
'ANALYZER_EMBEDDING_MODEL': ('embedding', 'model')
}
for env_var, config_path in env_mappings.items():
env_value = os.getenv(env_var)
if env_value:
self._set_nested_value(config_path, env_value)
self.logger.info(f"Applied environment override: {env_var} = {env_value}")
def _set_nested_value(self, path: tuple, value: str):
"""Set a nested configuration value."""
current = self.config
for key in path[:-1]:
if key not in current:
current[key] = {}
current = current[key]
# Try to convert value to appropriate type
if isinstance(current.get(path[-1]), int):
try:
value = int(value)
except ValueError:
pass
elif isinstance(current.get(path[-1]), bool):
value = value.lower() in ('true', '1', 'yes', 'on')
current[path[-1]] = value
def _validate_config(self):
"""Validate configuration values."""
required_sections = ['exclude_patterns', 'file_extensions', 'token_limits']
for section in required_sections:
if section not in self.config:
self.logger.warning(f"Missing required config section: {section}")
# Validate token limits
if 'token_limits' in self.config:
limits = self.config['token_limits']
if limits.get('small_project', 0) >= limits.get('medium_project', 0):
self.logger.warning("Token limit configuration may be incorrect")
def get(self, path: str, default: Any = None) -> Any:
"""Get configuration value using dot notation."""
keys = path.split('.')
current = self.config
try:
for key in keys:
current = current[key]
return current
except (KeyError, TypeError):
return default
def set(self, path: str, value: Any):
"""Set configuration value using dot notation."""
keys = path.split('.')
current = self.config
for key in keys[:-1]:
if key not in current:
current[key] = {}
current = current[key]
current[keys[-1]] = value
def get_exclude_patterns(self) -> List[str]:
"""Get all exclude patterns including gitignore patterns."""
return self.config.get('exclude_patterns', [])
def get_file_extensions(self) -> Dict[str, List[str]]:
"""Get file extension mappings."""
return self.config.get('file_extensions', {})
def is_embedding_enabled(self) -> bool:
"""Check if embedding functionality is enabled."""
return self.config.get('embedding', {}).get('enabled', False)
def get_cache_dir(self) -> str:
"""Get cache directory path."""
return self.config.get('embedding', {}).get('cache_dir', 'cache')
def to_dict(self) -> Dict[str, Any]:
"""Return configuration as dictionary."""
return self.config.copy()
def reload(self, config_path: Optional[str] = None):
"""Reload configuration from file."""
self._initialized = False
if config_path:
self.config_path = config_path
self.__init__(self.config_path)
# Global configuration instance
_global_config = None
def get_config(config_path: Optional[str] = None) -> Config:
"""Get global configuration instance."""
global _global_config
if _global_config is None:
_global_config = Config(config_path)
return _global_config
if __name__ == "__main__":
# Test configuration loading
config = Config()
print("Configuration loaded successfully!")
print(f"Cache dir: {config.get_cache_dir()}")
print(f"Exclude patterns: {len(config.get_exclude_patterns())}")
print(f"Embedding enabled: {config.is_embedding_enabled()}")


@@ -0,0 +1,359 @@
#!/usr/bin/env python3
"""
Context Analyzer Module for UltraThink Path-Aware Analyzer
Analyzes user prompts to extract relevant context and keywords.
"""
import re
import logging
from typing import Dict, List, Set, Tuple, Optional
from dataclasses import dataclass
from collections import Counter
import string
@dataclass
class AnalysisResult:
"""Results of context analysis."""
keywords: List[str]
domains: List[str]
languages: List[str]
file_patterns: List[str]
confidence_scores: Dict[str, float]
extracted_entities: Dict[str, List[str]]
class ContextAnalyzer:
"""Analyzes user prompts to understand context and intent."""
def __init__(self, config: Dict):
self.config = config
self.logger = logging.getLogger(__name__)
# Load domain and language mappings from config
self.domain_keywords = config.get('context_analysis', {}).get('domain_keywords', {})
self.language_indicators = config.get('context_analysis', {}).get('language_indicators', {})
# Common programming terms and patterns
self.technical_terms = self._build_technical_terms()
self.file_pattern_indicators = self._build_pattern_indicators()
# Stop words to filter out
self.stop_words = {
'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with',
'by', 'from', 'up', 'about', 'into', 'through', 'during', 'before', 'after',
'above', 'below', 'between', 'among', 'as', 'is', 'are', 'was', 'were', 'be',
'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would',
'could', 'should', 'may', 'might', 'must', 'can', 'this', 'that', 'these',
'those', 'i', 'you', 'he', 'she', 'it', 'we', 'they', 'me', 'him', 'her',
'us', 'them', 'my', 'your', 'his', 'its', 'our', 'their'
}
def _build_technical_terms(self) -> Dict[str, List[str]]:
"""Build comprehensive list of technical terms grouped by category."""
return {
'authentication': [
'auth', 'authentication', 'login', 'logout', 'signin', 'signout',
'user', 'password', 'token', 'jwt', 'oauth', 'session', 'cookie',
'credential', 'authorize', 'permission', 'role', 'access'
],
'database': [
'database', 'db', 'sql', 'query', 'table', 'schema', 'migration',
'model', 'orm', 'entity', 'relation', 'index', 'transaction',
'crud', 'select', 'insert', 'update', 'delete', 'join'
],
'api': [
'api', 'rest', 'graphql', 'endpoint', 'route', 'controller',
'handler', 'middleware', 'service', 'request', 'response',
'http', 'get', 'post', 'put', 'delete', 'patch'
],
'frontend': [
'ui', 'component', 'view', 'template', 'page', 'layout',
'style', 'css', 'html', 'javascript', 'react', 'vue',
'angular', 'dom', 'event', 'state', 'props'
],
'backend': [
'server', 'service', 'business', 'logic', 'core', 'engine',
'worker', 'job', 'queue', 'cache', 'redis', 'memcache'
],
'testing': [
'test', 'testing', 'spec', 'unit', 'integration', 'e2e',
'mock', 'stub', 'fixture', 'assert', 'expect', 'should'
],
'configuration': [
'config', 'configuration', 'setting', 'environment', 'env',
'variable', 'constant', 'parameter', 'option'
],
'utility': [
'util', 'utility', 'helper', 'common', 'shared', 'lib',
'library', 'tool', 'function', 'method'
]
}
def _build_pattern_indicators(self) -> Dict[str, List[str]]:
"""Build indicators that suggest specific file patterns."""
return {
'source_code': ['implement', 'code', 'function', 'class', 'method'],
'tests': ['test', 'testing', 'spec', 'unittest', 'pytest'],
'documentation': ['doc', 'readme', 'guide', 'documentation', 'manual'],
'configuration': ['config', 'setting', 'env', 'environment'],
'build': ['build', 'compile', 'package', 'deploy', 'release'],
'scripts': ['script', 'automation', 'tool', 'utility']
}
def extract_keywords(self, text: str) -> List[str]:
"""Extract meaningful keywords from text."""
# Clean and normalize text
text = text.lower()
text = re.sub(r'[^\w\s-]', ' ', text) # Remove punctuation except hyphens
words = text.split()
# Filter stop words and short words
keywords = []
for word in words:
word = word.strip('-') # Remove leading/trailing hyphens
if (len(word) >= 2 and
word not in self.stop_words and
not word.isdigit()):
keywords.append(word)
# Count frequency and return top keywords
word_counts = Counter(keywords)
return [word for word, count in word_counts.most_common(20)]
def identify_domains(self, keywords: List[str]) -> List[Tuple[str, float]]:
"""Identify relevant domains based on keywords."""
domain_scores = {}
for domain, domain_keywords in self.domain_keywords.items():
score = 0.0
matched_keywords = []
for keyword in keywords:
for domain_keyword in domain_keywords:
if keyword in domain_keyword or domain_keyword in keyword:
score += 1.0
matched_keywords.append(keyword)
break
if score > 0:
# Normalize score by number of domain keywords
normalized_score = score / len(domain_keywords)
domain_scores[domain] = normalized_score
# Also check technical terms
for category, terms in self.technical_terms.items():
score = 0.0
for keyword in keywords:
for term in terms:
if keyword in term or term in keyword:
score += 1.0
break
if score > 0:
normalized_score = score / len(terms)
if category not in domain_scores:
domain_scores[category] = normalized_score
else:
domain_scores[category] = max(domain_scores[category], normalized_score)
# Sort by score and return top domains
sorted_domains = sorted(domain_scores.items(), key=lambda x: x[1], reverse=True)
return sorted_domains[:5]
def identify_languages(self, keywords: List[str]) -> List[Tuple[str, float]]:
"""Identify programming languages based on keywords."""
language_scores = {}
for language, indicators in self.language_indicators.items():
score = 0.0
for keyword in keywords:
for indicator in indicators:
if keyword in indicator or indicator in keyword:
score += 1.0
break
if score > 0:
normalized_score = score / len(indicators)
language_scores[language] = normalized_score
sorted_languages = sorted(language_scores.items(), key=lambda x: x[1], reverse=True)
return sorted_languages[:3]
def extract_file_patterns(self, text: str) -> List[str]:
"""Extract explicit file patterns from text."""
patterns = []
# Look for @{pattern} syntax
at_patterns = re.findall(r'@\{([^}]+)\}', text)
patterns.extend(at_patterns)
# Look for file extensions
extensions = re.findall(r'\*\.(\w+)', text)
for ext in extensions:
patterns.append(f"*.{ext}")
# Look for directory patterns
dir_patterns = re.findall(r'(\w+)/\*\*?', text)
for dir_pattern in dir_patterns:
patterns.append(f"{dir_pattern}/**/*")
# Look for specific file names
file_patterns = re.findall(r'\b(\w+\.\w+)\b', text)
for file_pattern in file_patterns:
if '.' in file_pattern:
patterns.append(file_pattern)
return list(set(patterns)) # Remove duplicates
def suggest_patterns_from_domains(self, domains: List[str]) -> List[str]:
"""Suggest file patterns based on identified domains."""
patterns = []
domain_to_patterns = {
'auth': ['**/auth/**/*', '**/login/**/*', '**/user/**/*'],
'authentication': ['**/auth/**/*', '**/login/**/*', '**/user/**/*'],
'database': ['**/db/**/*', '**/model/**/*', '**/migration/**/*', '**/*model*'],
'api': ['**/api/**/*', '**/route/**/*', '**/controller/**/*', '**/handler/**/*'],
'frontend': ['**/ui/**/*', '**/component/**/*', '**/view/**/*', '**/template/**/*'],
'backend': ['**/service/**/*', '**/core/**/*', '**/server/**/*'],
'test': ['**/test/**/*', '**/spec/**/*', '**/*test*', '**/*spec*'],
'testing': ['**/test/**/*', '**/spec/**/*', '**/*test*', '**/*spec*'],
'config': ['**/config/**/*', '**/*.config.*', '**/env/**/*'],
'configuration': ['**/config/**/*', '**/*.config.*', '**/env/**/*'],
'util': ['**/util/**/*', '**/helper/**/*', '**/common/**/*'],
'utility': ['**/util/**/*', '**/helper/**/*', '**/common/**/*']
}
for domain in domains:
if domain in domain_to_patterns:
patterns.extend(domain_to_patterns[domain])
return list(set(patterns)) # Remove duplicates
def extract_entities(self, text: str) -> Dict[str, List[str]]:
"""Extract named entities from text."""
entities = {
'files': [],
'functions': [],
'classes': [],
'variables': [],
'technologies': []
}
# File patterns
file_patterns = re.findall(r'\b(\w+\.\w+)\b', text)
entities['files'] = list(set(file_patterns))
# Function patterns (camelCase or snake_case followed by parentheses)
function_patterns = re.findall(r'\b([a-z][a-zA-Z0-9_]*)\s*\(', text)
entities['functions'] = list(set(function_patterns))
# Class patterns (PascalCase)
class_patterns = re.findall(r'\b([A-Z][a-zA-Z0-9]*)\b', text)
entities['classes'] = list(set(class_patterns))
# Technology mentions
tech_keywords = [
'react', 'vue', 'angular', 'node', 'express', 'django', 'flask',
'spring', 'rails', 'laravel', 'docker', 'kubernetes', 'aws',
'azure', 'gcp', 'postgresql', 'mysql', 'mongodb', 'redis'
]
text_lower = text.lower()
for tech in tech_keywords:
if tech in text_lower:
entities['technologies'].append(tech)
return entities
def analyze(self, prompt: str) -> AnalysisResult:
"""Perform comprehensive analysis of the user prompt."""
self.logger.debug(f"Analyzing prompt: {prompt[:100]}...")
# Extract keywords
keywords = self.extract_keywords(prompt)
# Identify domains and languages
domains_with_scores = self.identify_domains(keywords)
languages_with_scores = self.identify_languages(keywords)
# Extract patterns and entities
explicit_patterns = self.extract_file_patterns(prompt)
entities = self.extract_entities(prompt)
# Get top domains and languages
domains = [domain for domain, score in domains_with_scores]
languages = [lang for lang, score in languages_with_scores]
# Suggest additional patterns based on domains
suggested_patterns = self.suggest_patterns_from_domains(domains)
# Combine explicit and suggested patterns
all_patterns = list(set(explicit_patterns + suggested_patterns))
# Build confidence scores
confidence_scores = {
'keywords': len(keywords) / 20, # Normalize to 0-1
'domain_match': max([score for _, score in domains_with_scores[:1]], default=0),
'language_match': max([score for _, score in languages_with_scores[:1]], default=0),
            'pattern_extraction': min(len(explicit_patterns) / 5, 1.0),  # Normalize to 0-1
}
result = AnalysisResult(
keywords=keywords,
domains=domains,
languages=languages,
file_patterns=all_patterns,
confidence_scores=confidence_scores,
extracted_entities=entities
)
self.logger.info(f"Analysis complete: {len(domains)} domains, {len(languages)} languages, {len(all_patterns)} patterns")
return result
def main():
"""Command-line interface for context analyzer."""
import yaml
import argparse
import json
parser = argparse.ArgumentParser(description="Context Analyzer for UltraThink")
parser.add_argument("prompt", help="Prompt to analyze")
parser.add_argument("--config", default="config.yaml", help="Configuration file path")
parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
args = parser.parse_args()
# Setup logging
level = logging.DEBUG if args.verbose else logging.INFO
logging.basicConfig(level=level, format='%(levelname)s: %(message)s')
# Load configuration
from pathlib import Path
config_path = Path(__file__).parent / args.config
with open(config_path, 'r', encoding='utf-8') as f:
config = yaml.safe_load(f)
# Create analyzer
analyzer = ContextAnalyzer(config)
# Analyze prompt
result = analyzer.analyze(args.prompt)
# Output results
print(f"Keywords: {', '.join(result.keywords[:10])}")
print(f"Domains: {', '.join(result.domains[:5])}")
print(f"Languages: {', '.join(result.languages[:3])}")
print(f"Patterns: {', '.join(result.file_patterns[:10])}")
if args.verbose:
print("\nDetailed Results:")
print(json.dumps({
'keywords': result.keywords,
'domains': result.domains,
'languages': result.languages,
'file_patterns': result.file_patterns,
'confidence_scores': result.confidence_scores,
'extracted_entities': result.extracted_entities
}, indent=2))
if __name__ == "__main__":
main()
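
A minimal sketch of driving ContextAnalyzer with an inline config instead of config.yaml (the import path, prompt, and expected output are illustrative assumptions):

```python
from core.context_analyzer import ContextAnalyzer

config = {
    'context_analysis': {
        'domain_keywords': {'api': ['api', 'endpoint', 'route']},
        'language_indicators': {'python': ['.py', 'python', 'pip', 'requirements.txt']},
    }
}
analyzer = ContextAnalyzer(config)
result = analyzer.analyze("fix the pagination bug in the Python API endpoint handlers")
print(result.domains)        # e.g. ['api', ...]
print(result.languages)      # e.g. ['python']
print(result.file_patterns)  # explicit @{...} patterns plus domain-based suggestions
```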


@@ -0,0 +1,453 @@
#!/usr/bin/env python3
"""
Embedding Manager Module for UltraThink Path-Aware Analyzer
Manages embeddings for semantic similarity search (RAG functionality).
"""
from __future__ import annotations  # keep np.ndarray annotations unevaluated so the module imports without NumPy
import os
import json
import hashlib
import logging
import pickle
from pathlib import Path
from typing import Dict, List, Tuple, Optional, Any
from dataclasses import dataclass
import time
# Optional imports for embedding functionality
try:
import numpy as np
NUMPY_AVAILABLE = True
except ImportError:
NUMPY_AVAILABLE = False
try:
from sentence_transformers import SentenceTransformer
SENTENCE_TRANSFORMERS_AVAILABLE = True
except ImportError:
SENTENCE_TRANSFORMERS_AVAILABLE = False
from .file_indexer import FileInfo
@dataclass
class EmbeddingInfo:
"""Information about a file's embedding."""
file_path: str
content_hash: str
embedding_hash: str
created_time: float
vector_size: int
@dataclass
class SimilarityResult:
"""Result of similarity search."""
file_info: FileInfo
similarity_score: float
matching_content: str
class EmbeddingManager:
"""Manages embeddings for semantic file matching."""
def __init__(self, config: Dict):
self.config = config
self.logger = logging.getLogger(__name__)
# Check if embeddings are enabled
self.enabled = config.get('embedding', {}).get('enabled', False)
if not self.enabled:
self.logger.info("Embeddings disabled in configuration")
return
# Check dependencies
if not NUMPY_AVAILABLE:
self.logger.warning("NumPy not available, disabling embeddings")
self.enabled = False
return
if not SENTENCE_TRANSFORMERS_AVAILABLE:
self.logger.warning("sentence-transformers not available, disabling embeddings")
self.enabled = False
return
# Load configuration
self.model_name = config.get('embedding', {}).get('model', 'all-MiniLM-L6-v2')
self.cache_dir = Path(config.get('embedding', {}).get('cache_dir', '.claude/cache/embeddings'))
self.similarity_threshold = config.get('embedding', {}).get('similarity_threshold', 0.6)
self.max_context_length = config.get('embedding', {}).get('max_context_length', 512)
self.batch_size = config.get('embedding', {}).get('batch_size', 32)
# Setup cache directories
self.cache_dir.mkdir(parents=True, exist_ok=True)
self.embeddings_file = self.cache_dir / "embeddings.pkl"
self.index_file = self.cache_dir / "embedding_index.json"
# Initialize model lazily
self._model = None
self._embeddings_cache = None
self._embedding_index = None
@property
def model(self):
"""Lazy load the embedding model."""
if not self.enabled:
return None
if self._model is None:
try:
self.logger.info(f"Loading embedding model: {self.model_name}")
self._model = SentenceTransformer(self.model_name)
self.logger.info(f"Model loaded successfully")
except Exception as e:
self.logger.error(f"Failed to load embedding model: {e}")
self.enabled = False
return None
return self._model
def embeddings_exist(self) -> bool:
"""Check if embeddings cache exists."""
return self.embeddings_file.exists() and self.index_file.exists()
def _load_embedding_cache(self) -> Dict[str, np.ndarray]:
"""Load embeddings from cache."""
if self._embeddings_cache is not None:
return self._embeddings_cache
if not self.embeddings_file.exists():
self._embeddings_cache = {}
return self._embeddings_cache
try:
with open(self.embeddings_file, 'rb') as f:
self._embeddings_cache = pickle.load(f)
self.logger.debug(f"Loaded {len(self._embeddings_cache)} embeddings from cache")
except Exception as e:
self.logger.warning(f"Failed to load embeddings cache: {e}")
self._embeddings_cache = {}
return self._embeddings_cache
def _save_embedding_cache(self):
"""Save embeddings to cache."""
if self._embeddings_cache is None:
return
try:
with open(self.embeddings_file, 'wb') as f:
pickle.dump(self._embeddings_cache, f)
self.logger.debug(f"Saved {len(self._embeddings_cache)} embeddings to cache")
except Exception as e:
self.logger.error(f"Failed to save embeddings cache: {e}")
def _load_embedding_index(self) -> Dict[str, EmbeddingInfo]:
"""Load embedding index."""
if self._embedding_index is not None:
return self._embedding_index
if not self.index_file.exists():
self._embedding_index = {}
return self._embedding_index
try:
with open(self.index_file, 'r', encoding='utf-8') as f:
data = json.load(f)
self._embedding_index = {}
for path, info_dict in data.items():
self._embedding_index[path] = EmbeddingInfo(**info_dict)
self.logger.debug(f"Loaded embedding index with {len(self._embedding_index)} entries")
except Exception as e:
self.logger.warning(f"Failed to load embedding index: {e}")
self._embedding_index = {}
return self._embedding_index
def _save_embedding_index(self):
"""Save embedding index."""
if self._embedding_index is None:
return
try:
data = {}
for path, info in self._embedding_index.items():
data[path] = {
'file_path': info.file_path,
'content_hash': info.content_hash,
'embedding_hash': info.embedding_hash,
'created_time': info.created_time,
'vector_size': info.vector_size
}
with open(self.index_file, 'w', encoding='utf-8') as f:
json.dump(data, f, indent=2)
self.logger.debug(f"Saved embedding index with {len(self._embedding_index)} entries")
except Exception as e:
self.logger.error(f"Failed to save embedding index: {e}")
def _extract_text_content(self, file_info: FileInfo) -> Optional[str]:
"""Extract text content from a file for embedding."""
try:
file_path = Path(file_info.path)
# Skip binary files and very large files
if file_info.size > self.config.get('performance', {}).get('max_file_size', 10485760):
return None
# Only process text-based files
text_extensions = {'.py', '.js', '.ts', '.tsx', '.jsx', '.java', '.cpp', '.c', '.h',
'.rs', '.go', '.php', '.rb', '.sh', '.bash', '.md', '.txt', '.json',
'.yaml', '.yml', '.xml', '.html', '.css', '.scss', '.sass'}
if file_info.extension.lower() not in text_extensions:
return None
with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
content = f.read()
# Truncate content if too long
if len(content) > self.max_context_length * 4: # Approximate token limit
content = content[:self.max_context_length * 4]
return content
except Exception as e:
self.logger.debug(f"Could not extract content from {file_info.path}: {e}")
return None
def _create_embedding(self, text: str) -> Optional[np.ndarray]:
"""Create embedding for text content."""
if not self.enabled or self.model is None:
return None
try:
# Truncate text if needed
if len(text) > self.max_context_length * 4:
text = text[:self.max_context_length * 4]
embedding = self.model.encode([text])[0]
return embedding
except Exception as e:
self.logger.warning(f"Failed to create embedding: {e}")
return None
def _get_content_hash(self, content: str) -> str:
"""Get hash of content for caching."""
return hashlib.md5(content.encode('utf-8')).hexdigest()
def _get_embedding_hash(self, embedding: np.ndarray) -> str:
"""Get hash of embedding for verification."""
return hashlib.md5(embedding.tobytes()).hexdigest()
def update_embeddings(self, file_index: Dict[str, FileInfo], force_rebuild: bool = False) -> int:
"""Update embeddings for files in the index."""
if not self.enabled:
self.logger.info("Embeddings disabled, skipping update")
return 0
self.logger.info("Updating embeddings...")
# Load caches
embeddings_cache = self._load_embedding_cache()
embedding_index = self._load_embedding_index()
new_embeddings = 0
batch_texts = []
batch_paths = []
for file_path, file_info in file_index.items():
# Check if embedding exists and is current
if not force_rebuild and file_path in embedding_index:
cached_info = embedding_index[file_path]
if cached_info.content_hash == file_info.content_hash:
continue # Embedding is current
# Extract content
content = self._extract_text_content(file_info)
if content is None:
continue
# Prepare for batch processing
batch_texts.append(content)
batch_paths.append(file_path)
# Process batch when full
if len(batch_texts) >= self.batch_size:
self._process_batch(batch_texts, batch_paths, file_index, embeddings_cache, embedding_index)
new_embeddings += len(batch_texts)
batch_texts = []
batch_paths = []
# Process remaining batch
if batch_texts:
self._process_batch(batch_texts, batch_paths, file_index, embeddings_cache, embedding_index)
new_embeddings += len(batch_texts)
# Save caches
self._save_embedding_cache()
self._save_embedding_index()
self.logger.info(f"Updated {new_embeddings} embeddings")
return new_embeddings
def _process_batch(self, texts: List[str], paths: List[str], file_index: Dict[str, FileInfo],
embeddings_cache: Dict[str, np.ndarray], embedding_index: Dict[str, EmbeddingInfo]):
"""Process a batch of texts for embedding."""
try:
# Create embeddings for batch
embeddings = self.model.encode(texts)
for i, (text, path) in enumerate(zip(texts, paths)):
embedding = embeddings[i]
file_info = file_index[path]
# Store embedding
content_hash = self._get_content_hash(text)
embedding_hash = self._get_embedding_hash(embedding)
embeddings_cache[path] = embedding
embedding_index[path] = EmbeddingInfo(
file_path=path,
content_hash=content_hash,
embedding_hash=embedding_hash,
created_time=time.time(),
vector_size=len(embedding)
)
except Exception as e:
self.logger.error(f"Failed to process embedding batch: {e}")
def find_similar_files(self, query: str, file_index: Dict[str, FileInfo],
top_k: int = 20) -> List[SimilarityResult]:
"""Find files similar to the query using embeddings."""
if not self.enabled:
return []
# Create query embedding
query_embedding = self._create_embedding(query)
if query_embedding is None:
return []
# Load embeddings
embeddings_cache = self._load_embedding_cache()
if not embeddings_cache:
self.logger.warning("No embeddings available for similarity search")
return []
# Calculate similarities
similarities = []
for file_path, file_embedding in embeddings_cache.items():
if file_path not in file_index:
continue
try:
# Calculate cosine similarity
similarity = np.dot(query_embedding, file_embedding) / (
np.linalg.norm(query_embedding) * np.linalg.norm(file_embedding)
)
if similarity >= self.similarity_threshold:
similarities.append((file_path, similarity))
except Exception as e:
self.logger.debug(f"Failed to calculate similarity for {file_path}: {e}")
continue
# Sort by similarity
similarities.sort(key=lambda x: x[1], reverse=True)
# Create results
results = []
for file_path, similarity in similarities[:top_k]:
file_info = file_index[file_path]
# Extract a snippet of matching content
content = self._extract_text_content(file_info)
snippet = content[:200] + "..." if content and len(content) > 200 else content or ""
result = SimilarityResult(
file_info=file_info,
similarity_score=similarity,
matching_content=snippet
)
results.append(result)
self.logger.info(f"Found {len(results)} similar files for query")
return results
def get_stats(self) -> Dict[str, Any]:
"""Get statistics about the embedding cache."""
if not self.enabled:
return {'enabled': False}
embedding_index = self._load_embedding_index()
embeddings_cache = self._load_embedding_cache()
return {
'enabled': True,
'model_name': self.model_name,
'total_embeddings': len(embedding_index),
'cache_size_mb': os.path.getsize(self.embeddings_file) / 1024 / 1024 if self.embeddings_file.exists() else 0,
'similarity_threshold': self.similarity_threshold,
'vector_size': list(embedding_index.values())[0].vector_size if embedding_index else 0
}
def main():
"""Command-line interface for embedding manager."""
import yaml
import argparse
from .file_indexer import FileIndexer
parser = argparse.ArgumentParser(description="Embedding Manager for UltraThink")
parser.add_argument("--config", default="config.yaml", help="Configuration file path")
parser.add_argument("--update", action="store_true", help="Update embeddings")
parser.add_argument("--rebuild", action="store_true", help="Force rebuild all embeddings")
parser.add_argument("--query", help="Search for similar files")
parser.add_argument("--stats", action="store_true", help="Show embedding statistics")
parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
args = parser.parse_args()
# Setup logging
level = logging.DEBUG if args.verbose else logging.INFO
logging.basicConfig(level=level, format='%(levelname)s: %(message)s')
# Load configuration
config_path = Path(__file__).parent / args.config
with open(config_path, 'r', encoding='utf-8') as f:
config = yaml.safe_load(f)
# Create components
indexer = FileIndexer(config)
embedding_manager = EmbeddingManager(config)
if not embedding_manager.enabled:
print("Embeddings are disabled. Enable in config.yaml or install required dependencies.")
return
# Load file index
file_index = indexer.load_index()
if not file_index:
print("Building file index...")
file_index = indexer.build_index()
if args.stats:
stats = embedding_manager.get_stats()
print("Embedding Statistics:")
for key, value in stats.items():
print(f" {key}: {value}")
return
if args.update or args.rebuild:
count = embedding_manager.update_embeddings(file_index, force_rebuild=args.rebuild)
print(f"Updated {count} embeddings")
if args.query:
results = embedding_manager.find_similar_files(args.query, file_index)
print(f"Found {len(results)} similar files:")
for result in results:
print(f" {result.file_info.relative_path} (similarity: {result.similarity_score:.3f})")
if args.verbose and result.matching_content:
print(f" Content: {result.matching_content[:100]}...")
if __name__ == "__main__":
main()
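
A sketch of the semantic-search path (import paths assumed to be `core.*`). It needs numpy and sentence-transformers installed and embeddings enabled in the config; otherwise the manager disables itself and the fallback branch runs:

```python
from core.config import get_config
from core.file_indexer import FileIndexer
from core.embedding_manager import EmbeddingManager

config = get_config().to_dict()
index = FileIndexer(config).build_index()

manager = EmbeddingManager(config)
if manager.enabled:
    manager.update_embeddings(index)      # create or refresh cached vectors
    hits = manager.find_similar_files("user authentication and JWT handling", index, top_k=5)
    for hit in hits:
        print(f"{hit.similarity_score:.3f}  {hit.file_info.relative_path}")
else:
    print("embeddings disabled:", manager.get_stats())
```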


@@ -0,0 +1,383 @@
#!/usr/bin/env python3
"""
File Indexer Module for UltraThink Path-Aware Analyzer
Builds and maintains an index of repository files with metadata.
Enhanced with gitignore support and unified configuration.
"""
import os
import hashlib
import json
import time
import logging
from pathlib import Path
from typing import Dict, List, Optional, Set, Tuple, Union
from dataclasses import dataclass, asdict
from datetime import datetime
import fnmatch
from .gitignore_parser import GitignoreParser
@dataclass
class FileInfo:
"""Information about a single file in the repository."""
path: str
relative_path: str
size: int
modified_time: float
extension: str
category: str # code, docs, config, web
estimated_tokens: int
content_hash: str
def to_dict(self) -> Dict:
return asdict(self)
@classmethod
def from_dict(cls, data: Dict) -> 'FileInfo':
return cls(**data)
@dataclass
class IndexStats:
"""Statistics about the file index."""
total_files: int
total_tokens: int
total_size: int
categories: Dict[str, int]
last_updated: float
def to_dict(self) -> Dict:
return asdict(self)
class FileIndexer:
"""Builds and maintains an efficient index of repository files."""
def __init__(self, config: Union['Config', Dict], root_path: str = "."):
# Support both Config object and Dict for backward compatibility
if hasattr(config, 'to_dict'):
self.config_obj = config
self.config = config.to_dict()
else:
self.config_obj = None
self.config = config
self.root_path = Path(root_path).resolve()
self.cache_dir = Path(self.config.get('embedding', {}).get('cache_dir', '.claude/cache'))
self.cache_dir.mkdir(parents=True, exist_ok=True)
self.index_file = self.cache_dir / "file_index.json"
# Setup logging
self.logger = logging.getLogger(__name__)
# File extension mappings
self.extension_categories = self._build_extension_map()
# Exclude patterns from config
self.exclude_patterns = list(self.config.get('exclude_patterns', []))
# Initialize gitignore parser and add patterns
self.gitignore_parser = GitignoreParser(str(self.root_path))
self._load_gitignore_patterns()
# Performance settings
self.max_file_size = self.config.get('performance', {}).get('max_file_size', 10485760)
def _build_extension_map(self) -> Dict[str, str]:
"""Build mapping from file extensions to categories."""
ext_map = {}
for category, extensions in self.config.get('file_extensions', {}).items():
for ext in extensions:
ext_map[ext.lower()] = category
return ext_map
def _load_gitignore_patterns(self):
"""Load patterns from .gitignore files and add to exclude_patterns."""
try:
gitignore_patterns = self.gitignore_parser.parse_all_gitignores()
if gitignore_patterns:
# Avoid duplicates
existing_patterns = set(self.exclude_patterns)
new_patterns = [p for p in gitignore_patterns if p not in existing_patterns]
self.exclude_patterns.extend(new_patterns)
self.logger.info(f"Added {len(new_patterns)} patterns from .gitignore files")
except Exception as e:
self.logger.warning(f"Failed to load .gitignore patterns: {e}")
def _should_exclude_file(self, file_path: Path) -> bool:
"""Check if file should be excluded based on patterns and gitignore rules."""
relative_path = str(file_path.relative_to(self.root_path))
# Check against exclude patterns from config
for pattern in self.exclude_patterns:
# Convert pattern to work with fnmatch
if fnmatch.fnmatch(relative_path, pattern) or fnmatch.fnmatch(str(file_path), pattern):
return True
# Check if any parent directory matches
parts = relative_path.split(os.sep)
for i in range(len(parts)):
partial_path = "/".join(parts[:i+1])
if fnmatch.fnmatch(partial_path, pattern):
return True
        # Gitignore rules are already merged into self.exclude_patterns by
        # _load_gitignore_patterns, so no separate gitignore check is needed here.
        return False
def _estimate_tokens(self, file_path: Path) -> int:
"""Estimate token count for a file (chars/4 approximation)."""
try:
if file_path.stat().st_size > self.max_file_size:
return file_path.stat().st_size // 8 # Penalty for large files
with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
content = f.read()
return len(content) // 4 # Rough approximation
except (UnicodeDecodeError, OSError):
# Binary files or unreadable files
return file_path.stat().st_size // 8
def _get_file_hash(self, file_path: Path) -> str:
"""Get a hash of file metadata for change detection."""
stat = file_path.stat()
return hashlib.md5(f"{file_path}:{stat.st_size}:{stat.st_mtime}".encode()).hexdigest()
def _categorize_file(self, file_path: Path) -> str:
"""Categorize file based on extension."""
extension = file_path.suffix.lower()
return self.extension_categories.get(extension, 'other')
def _scan_file(self, file_path: Path) -> Optional[FileInfo]:
"""Scan a single file and create FileInfo."""
try:
if not file_path.is_file() or self._should_exclude_file(file_path):
return None
stat = file_path.stat()
relative_path = str(file_path.relative_to(self.root_path))
file_info = FileInfo(
path=str(file_path),
relative_path=relative_path,
size=stat.st_size,
modified_time=stat.st_mtime,
extension=file_path.suffix.lower(),
category=self._categorize_file(file_path),
estimated_tokens=self._estimate_tokens(file_path),
content_hash=self._get_file_hash(file_path)
)
return file_info
except (OSError, PermissionError) as e:
self.logger.warning(f"Could not scan file {file_path}: {e}")
return None
def build_index(self, force_rebuild: bool = False) -> Dict[str, FileInfo]:
"""Build or update the file index."""
self.logger.info(f"Building file index for {self.root_path}")
# Load existing index if available
existing_index = {}
if not force_rebuild and self.index_file.exists():
existing_index = self.load_index()
new_index = {}
changed_files = 0
# Walk through all files
for file_path in self.root_path.rglob('*'):
if not file_path.is_file():
continue
file_info = self._scan_file(file_path)
if file_info is None:
continue
# Check if file has changed
relative_path = file_info.relative_path
if relative_path in existing_index:
old_info = existing_index[relative_path]
if old_info.content_hash == file_info.content_hash:
# File unchanged, keep old info
new_index[relative_path] = old_info
continue
# File is new or changed
new_index[relative_path] = file_info
changed_files += 1
self.logger.info(f"Indexed {len(new_index)} files ({changed_files} new/changed)")
# Save index
self.save_index(new_index)
return new_index
def load_index(self) -> Dict[str, FileInfo]:
"""Load file index from cache."""
if not self.index_file.exists():
return {}
try:
with open(self.index_file, 'r', encoding='utf-8') as f:
data = json.load(f)
index = {}
for path, info_dict in data.get('files', {}).items():
index[path] = FileInfo.from_dict(info_dict)
return index
except (json.JSONDecodeError, KeyError) as e:
self.logger.warning(f"Could not load index: {e}")
return {}
def save_index(self, index: Dict[str, FileInfo]) -> None:
"""Save file index to cache."""
try:
# Calculate stats
stats = self._calculate_stats(index)
data = {
'stats': stats.to_dict(),
'files': {path: info.to_dict() for path, info in index.items()}
}
with open(self.index_file, 'w', encoding='utf-8') as f:
json.dump(data, f, indent=2)
except OSError as e:
self.logger.error(f"Could not save index: {e}")
def _calculate_stats(self, index: Dict[str, FileInfo]) -> IndexStats:
"""Calculate statistics for the index."""
total_files = len(index)
total_tokens = sum(info.estimated_tokens for info in index.values())
total_size = sum(info.size for info in index.values())
categories = {}
for info in index.values():
categories[info.category] = categories.get(info.category, 0) + 1
return IndexStats(
total_files=total_files,
total_tokens=total_tokens,
total_size=total_size,
categories=categories,
last_updated=time.time()
)
def get_stats(self) -> Optional[IndexStats]:
"""Get statistics about the current index."""
if not self.index_file.exists():
return None
try:
with open(self.index_file, 'r', encoding='utf-8') as f:
data = json.load(f)
return IndexStats(**data.get('stats', {}))
except (json.JSONDecodeError, KeyError):
return None
def find_files_by_pattern(self, pattern: str, index: Optional[Dict[str, FileInfo]] = None) -> List[FileInfo]:
"""Find files matching a glob pattern."""
if index is None:
index = self.load_index()
matching_files = []
for path, info in index.items():
if fnmatch.fnmatch(path, pattern) or fnmatch.fnmatch(info.path, pattern):
matching_files.append(info)
return matching_files
def find_files_by_category(self, category: str, index: Optional[Dict[str, FileInfo]] = None) -> List[FileInfo]:
"""Find files by category (code, docs, config, etc.)."""
if index is None:
index = self.load_index()
return [info for info in index.values() if info.category == category]
def find_files_by_keywords(self, keywords: List[str], index: Optional[Dict[str, FileInfo]] = None) -> List[FileInfo]:
"""Find files whose paths contain any of the specified keywords."""
if index is None:
index = self.load_index()
matching_files = []
keywords_lower = [kw.lower() for kw in keywords]
for info in index.values():
path_lower = info.relative_path.lower()
if any(keyword in path_lower for keyword in keywords_lower):
matching_files.append(info)
return matching_files
def get_recent_files(self, limit: int = 20, index: Optional[Dict[str, FileInfo]] = None) -> List[FileInfo]:
"""Get most recently modified files."""
if index is None:
index = self.load_index()
files = list(index.values())
files.sort(key=lambda f: f.modified_time, reverse=True)
return files[:limit]
def main():
"""Command-line interface for file indexer."""
import yaml
import argparse
parser = argparse.ArgumentParser(description="File Indexer for UltraThink")
parser.add_argument("--config", default="config.yaml", help="Configuration file path")
parser.add_argument("--rebuild", action="store_true", help="Force rebuild index")
parser.add_argument("--stats", action="store_true", help="Show index statistics")
parser.add_argument("--pattern", help="Find files matching pattern")
args = parser.parse_args()
# Load configuration
config_path = Path(__file__).parent / args.config
with open(config_path, 'r', encoding='utf-8') as f:
config = yaml.safe_load(f)
# Setup logging
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
# Create indexer
indexer = FileIndexer(config)
if args.stats:
stats = indexer.get_stats()
if stats:
print(f"Total files: {stats.total_files}")
print(f"Total tokens: {stats.total_tokens:,}")
print(f"Total size: {stats.total_size:,} bytes")
print(f"Categories: {stats.categories}")
print(f"Last updated: {datetime.fromtimestamp(stats.last_updated)}")
else:
print("No index found. Run without --stats to build index.")
return
# Build index
index = indexer.build_index(force_rebuild=args.rebuild)
if args.pattern:
files = indexer.find_files_by_pattern(args.pattern, index)
print(f"Found {len(files)} files matching pattern '{args.pattern}':")
for file_info in files[:20]: # Limit output
print(f" {file_info.relative_path}")
else:
stats = indexer._calculate_stats(index)
print(f"Index built: {stats.total_files} files, ~{stats.total_tokens:,} tokens")
if __name__ == "__main__":
main()
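
A short sketch of the index lookup helpers (import paths assumed to be `core.*`; the keyword and category values are illustrative):

```python
from core.config import get_config
from core.file_indexer import FileIndexer

indexer = FileIndexer(get_config())
index = indexer.build_index()

auth_files = indexer.find_files_by_keywords(['auth', 'login'], index)
config_files = indexer.find_files_by_category('config', index)
recent = indexer.get_recent_files(limit=5, index=index)

print(f"{len(auth_files)} auth-related files, {len(config_files)} config files")
for info in recent:
    print(f"  {info.relative_path} (~{info.estimated_tokens} tokens)")
```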


@@ -0,0 +1,182 @@
#!/usr/bin/env python3
"""
GitIgnore Parser Module
Parses .gitignore files and converts rules to fnmatch patterns for file exclusion.
"""
import os
import fnmatch
from pathlib import Path
from typing import List, Set, Optional
class GitignoreParser:
"""Parser for .gitignore files that converts rules to fnmatch patterns."""
def __init__(self, root_path: str = "."):
self.root_path = Path(root_path).resolve()
self.patterns: List[str] = []
self.negation_patterns: List[str] = []
def parse_file(self, gitignore_path: str) -> List[str]:
"""Parse a .gitignore file and return exclude patterns."""
gitignore_file = Path(gitignore_path)
if not gitignore_file.exists():
return []
patterns = []
try:
with open(gitignore_file, 'r', encoding='utf-8') as f:
for line_num, line in enumerate(f, 1):
pattern = self._parse_line(line.strip())
if pattern:
patterns.append(pattern)
except (UnicodeDecodeError, IOError):
# Fallback to system encoding if UTF-8 fails
try:
with open(gitignore_file, 'r') as f:
for line_num, line in enumerate(f, 1):
pattern = self._parse_line(line.strip())
if pattern:
patterns.append(pattern)
except IOError:
# If file can't be read, return empty list
return []
return patterns
def _parse_line(self, line: str) -> Optional[str]:
"""Parse a single line from .gitignore file."""
# Skip empty lines and comments
if not line or line.startswith('#'):
return None
# Handle negation patterns (starting with !)
if line.startswith('!'):
# For now, we'll skip negation patterns as they require
# more complex logic to implement correctly
return None
# Convert gitignore pattern to fnmatch pattern
return self._convert_to_fnmatch(line)
def _convert_to_fnmatch(self, pattern: str) -> str:
"""Convert gitignore pattern to fnmatch pattern."""
# Remove trailing slash (directory indicator)
if pattern.endswith('/'):
pattern = pattern[:-1]
# Handle absolute paths (starting with /)
if pattern.startswith('/'):
pattern = pattern[1:]
# Make it match from root
return pattern
# Handle patterns that should match anywhere in the tree
# If pattern doesn't contain '/', it matches files/dirs at any level
if '/' not in pattern:
return f"*/{pattern}"
# Pattern contains '/', so it's relative to the gitignore location
return pattern
def parse_all_gitignores(self, root_path: Optional[str] = None) -> List[str]:
"""Parse all .gitignore files in the repository hierarchy."""
if root_path:
self.root_path = Path(root_path).resolve()
all_patterns = []
# Find all .gitignore files in the repository
gitignore_files = self._find_gitignore_files()
for gitignore_file in gitignore_files:
patterns = self.parse_file(gitignore_file)
all_patterns.extend(patterns)
return all_patterns
def _find_gitignore_files(self) -> List[Path]:
"""Find all .gitignore files in the repository."""
gitignore_files = []
# Start with root .gitignore
root_gitignore = self.root_path / '.gitignore'
if root_gitignore.exists():
gitignore_files.append(root_gitignore)
# Find .gitignore files in subdirectories
try:
for gitignore_file in self.root_path.rglob('.gitignore'):
if gitignore_file != root_gitignore:
gitignore_files.append(gitignore_file)
except (PermissionError, OSError):
# Skip directories we can't access
pass
return gitignore_files
def should_exclude(self, file_path: str, gitignore_patterns: List[str]) -> bool:
"""Check if a file should be excluded based on gitignore patterns."""
# Convert to relative path from root
try:
rel_path = str(Path(file_path).relative_to(self.root_path))
except ValueError:
# File is not under root path
return False
# Normalize path separators for consistent matching
rel_path = rel_path.replace(os.sep, '/')
for pattern in gitignore_patterns:
if self._matches_pattern(rel_path, pattern):
return True
return False
def _matches_pattern(self, file_path: str, pattern: str) -> bool:
"""Check if a file path matches a gitignore pattern."""
# Normalize pattern separators
pattern = pattern.replace(os.sep, '/')
# Handle different pattern types
if pattern.startswith('*/'):
# Pattern like */pattern - matches at any level
sub_pattern = pattern[2:]
return fnmatch.fnmatch(file_path, f"*/{sub_pattern}") or fnmatch.fnmatch(file_path, sub_pattern)
elif '/' in pattern:
# Pattern contains slash - match exact path
return fnmatch.fnmatch(file_path, pattern)
else:
# Simple pattern - match filename or directory at any level
parts = file_path.split('/')
return any(fnmatch.fnmatch(part, pattern) for part in parts)
def parse_gitignore(gitignore_path: str) -> List[str]:
"""Convenience function to parse a single .gitignore file."""
parser = GitignoreParser()
return parser.parse_file(gitignore_path)
def get_all_gitignore_patterns(root_path: str = ".") -> List[str]:
"""Convenience function to get all gitignore patterns in a repository."""
parser = GitignoreParser(root_path)
return parser.parse_all_gitignores()
if __name__ == "__main__":
import sys
if len(sys.argv) > 1:
gitignore_path = sys.argv[1]
patterns = parse_gitignore(gitignore_path)
print(f"Parsed {len(patterns)} patterns from {gitignore_path}:")
for pattern in patterns:
print(f" {pattern}")
else:
# Parse all .gitignore files in current directory
patterns = get_all_gitignore_patterns()
print(f"Found {len(patterns)} gitignore patterns:")
for pattern in patterns:
print(f" {pattern}")


@@ -0,0 +1,500 @@
#!/usr/bin/env python3
"""
Path Matcher Module for UltraThink Path-Aware Analyzer
Matches files to analysis context and ranks them by relevance.
"""
import re
import logging
import fnmatch
from typing import Dict, List, Tuple, Optional, Set
from dataclasses import dataclass
from pathlib import Path
import math
from .file_indexer import FileInfo
from .context_analyzer import AnalysisResult
@dataclass
class MatchResult:
"""Result of path matching with relevance score."""
file_info: FileInfo
relevance_score: float
match_reasons: List[str]
category_bonus: float
@dataclass
class PathMatchingResult:
"""Complete result of path matching operation."""
matched_files: List[MatchResult]
total_tokens: int
categories: Dict[str, int]
patterns_used: List[str]
confidence_score: float
class PathMatcher:
"""Matches files to analysis context using various algorithms."""
def __init__(self, config: Dict):
self.config = config
self.logger = logging.getLogger(__name__)
# Load scoring weights
self.weights = config.get('path_matching', {}).get('weights', {
'keyword_match': 0.4,
'extension_match': 0.2,
'directory_context': 0.2,
'file_size_penalty': 0.1,
'recency_bonus': 0.1
})
# Load limits
self.max_files_per_category = config.get('path_matching', {}).get('max_files_per_category', 20)
self.min_relevance_score = config.get('path_matching', {}).get('min_relevance_score', 0.1)
self.max_total_files = config.get('output', {}).get('max_total_files', 50)
# Load always include patterns
self.always_include = config.get('output', {}).get('always_include', [])
# Category priorities
self.category_priorities = {
'code': 1.0,
'config': 0.8,
'docs': 0.6,
'web': 0.4,
'other': 0.2
}
def _calculate_keyword_score(self, file_info: FileInfo, keywords: List[str]) -> Tuple[float, List[str]]:
"""Calculate score based on keyword matches in file path."""
if not keywords:
return 0.0, []
path_lower = file_info.relative_path.lower()
filename_lower = Path(file_info.relative_path).name.lower()
matches = []
score = 0.0
for keyword in keywords:
keyword_lower = keyword.lower()
# Exact filename match (highest weight)
if keyword_lower in filename_lower:
score += 2.0
matches.append(f"filename:{keyword}")
continue
# Directory name match
if keyword_lower in path_lower:
score += 1.0
matches.append(f"path:{keyword}")
continue
# Partial match in path components
path_parts = path_lower.split('/')
for part in path_parts:
if keyword_lower in part:
score += 0.5
matches.append(f"partial:{keyword}")
break
# Normalize by number of keywords
normalized_score = score / len(keywords) if keywords else 0.0
return min(normalized_score, 1.0), matches
def _calculate_extension_score(self, file_info: FileInfo, languages: List[str]) -> float:
"""Calculate score based on file extension relevance."""
if not languages:
return 0.5 # Neutral score
extension = file_info.extension.lower()
# Language-specific extension mapping
lang_extensions = {
'python': ['.py', '.pyx', '.pyi'],
'javascript': ['.js', '.jsx', '.mjs'],
'typescript': ['.ts', '.tsx'],
'java': ['.java'],
'go': ['.go'],
'rust': ['.rs'],
'cpp': ['.cpp', '.cc', '.cxx', '.c', '.h', '.hpp'],
'csharp': ['.cs'],
'php': ['.php'],
'ruby': ['.rb'],
'shell': ['.sh', '.bash', '.zsh']
}
score = 0.0
for language in languages:
if language in lang_extensions:
if extension in lang_extensions[language]:
score = 1.0
break
# Fallback to category-based scoring
if score == 0.0:
category_scores = {
'code': 1.0,
'config': 0.8,
'docs': 0.6,
'web': 0.4,
'other': 0.2
}
score = category_scores.get(file_info.category, 0.2)
return score
def _calculate_directory_score(self, file_info: FileInfo, domains: List[str]) -> Tuple[float, List[str]]:
"""Calculate score based on directory context."""
if not domains:
return 0.0, []
path_parts = file_info.relative_path.lower().split('/')
matches = []
score = 0.0
# Domain-specific directory patterns
domain_patterns = {
'auth': ['auth', 'authentication', 'login', 'user', 'account'],
'authentication': ['auth', 'authentication', 'login', 'user', 'account'],
'database': ['db', 'database', 'model', 'entity', 'migration', 'schema'],
'api': ['api', 'rest', 'graphql', 'route', 'controller', 'handler'],
'frontend': ['ui', 'component', 'view', 'template', 'client', 'web'],
'backend': ['service', 'server', 'core', 'business', 'logic'],
'test': ['test', 'spec', 'tests', '__tests__', 'testing'],
'testing': ['test', 'spec', 'tests', '__tests__', 'testing'],
'config': ['config', 'configuration', 'env', 'settings'],
'configuration': ['config', 'configuration', 'env', 'settings'],
'util': ['util', 'utils', 'helper', 'common', 'shared', 'lib'],
'utility': ['util', 'utils', 'helper', 'common', 'shared', 'lib']
}
for domain in domains:
if domain in domain_patterns:
patterns = domain_patterns[domain]
for pattern in patterns:
for part in path_parts:
if pattern in part:
score += 1.0
matches.append(f"dir:{domain}->{pattern}")
break
# Normalize by number of domains
normalized_score = score / len(domains) if domains else 0.0
return min(normalized_score, 1.0), matches
def _calculate_size_penalty(self, file_info: FileInfo) -> float:
"""Calculate penalty for very large files."""
max_size = self.config.get('performance', {}).get('max_file_size', 10485760) # 10MB
if file_info.size > max_size:
# Heavy penalty for oversized files
return -0.5
elif file_info.size > max_size * 0.5:
# Light penalty for large files
return -0.2
else:
return 0.0
def _calculate_recency_bonus(self, file_info: FileInfo) -> float:
"""Calculate bonus for recently modified files."""
import time
current_time = time.time()
file_age = current_time - file_info.modified_time
# Files modified in last day get bonus
if file_age < 86400: # 1 day
return 0.3
elif file_age < 604800: # 1 week
return 0.1
else:
return 0.0
def calculate_relevance_score(self, file_info: FileInfo, analysis: AnalysisResult) -> MatchResult:
"""Calculate overall relevance score for a file."""
# Calculate individual scores
keyword_score, keyword_matches = self._calculate_keyword_score(file_info, analysis.keywords)
extension_score = self._calculate_extension_score(file_info, analysis.languages)
directory_score, dir_matches = self._calculate_directory_score(file_info, analysis.domains)
size_penalty = self._calculate_size_penalty(file_info)
recency_bonus = self._calculate_recency_bonus(file_info)
# Apply weights
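        # Missing weight keys fall back to the .get() defaults below, which sum to 1.0.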
weighted_score = (
keyword_score * self.weights.get('keyword_match', 0.4) +
extension_score * self.weights.get('extension_match', 0.2) +
directory_score * self.weights.get('directory_context', 0.2) +
size_penalty * self.weights.get('file_size_penalty', 0.1) +
recency_bonus * self.weights.get('recency_bonus', 0.1)
)
# Category bonus
category_bonus = self.category_priorities.get(file_info.category, 0.2)
# Final score with category bonus
final_score = weighted_score + (category_bonus * 0.1)
# Collect match reasons
match_reasons = keyword_matches + dir_matches
if extension_score > 0.5:
match_reasons.append(f"extension:{file_info.extension}")
if recency_bonus > 0:
match_reasons.append("recent")
return MatchResult(
file_info=file_info,
relevance_score=max(0.0, final_score),
match_reasons=match_reasons,
category_bonus=category_bonus
)
def match_by_patterns(self, file_index: Dict[str, FileInfo], patterns: List[str]) -> List[FileInfo]:
"""Match files using explicit glob patterns."""
matched_files = []
for pattern in patterns:
for path, file_info in file_index.items():
# Try matching both relative path and full path
if (fnmatch.fnmatch(path, pattern) or
fnmatch.fnmatch(file_info.path, pattern) or
fnmatch.fnmatch(Path(path).name, pattern)):
matched_files.append(file_info)
# Remove duplicates based on path
seen_paths = set()
unique_files = []
for file_info in matched_files:
if file_info.relative_path not in seen_paths:
seen_paths.add(file_info.relative_path)
unique_files.append(file_info)
return unique_files
def match_always_include(self, file_index: Dict[str, FileInfo]) -> List[FileInfo]:
"""Match files that should always be included."""
return self.match_by_patterns(file_index, self.always_include)
def rank_files(self, files: List[FileInfo], analysis: AnalysisResult) -> List[MatchResult]:
"""Rank files by relevance score."""
match_results = []
for file_info in files:
match_result = self.calculate_relevance_score(file_info, analysis)
if match_result.relevance_score >= self.min_relevance_score:
match_results.append(match_result)
# Sort by relevance score (descending)
match_results.sort(key=lambda x: x.relevance_score, reverse=True)
return match_results
def select_best_files(self, ranked_files: List[MatchResult], token_limit: Optional[int] = None) -> List[MatchResult]:
"""Select the best files within token limits and category constraints."""
if not ranked_files:
return []
selected_files = []
total_tokens = 0
category_counts = {}
for match_result in ranked_files:
file_info = match_result.file_info
category = file_info.category
# Check category limit
if category_counts.get(category, 0) >= self.max_files_per_category:
continue
# Check token limit
if token_limit and total_tokens + file_info.estimated_tokens > token_limit:
continue
# Check total file limit
if len(selected_files) >= self.max_total_files:
break
# Add file
selected_files.append(match_result)
total_tokens += file_info.estimated_tokens
category_counts[category] = category_counts.get(category, 0) + 1
return selected_files
def match_files(self, file_index: Dict[str, FileInfo], analysis: AnalysisResult,
token_limit: Optional[int] = None, explicit_patterns: Optional[List[str]] = None) -> PathMatchingResult:
"""Main file matching function."""
self.logger.info(f"Matching files for analysis with {len(analysis.keywords)} keywords and {len(analysis.domains)} domains")
# Start with always-include files
always_include_files = self.match_always_include(file_index)
self.logger.debug(f"Always include: {len(always_include_files)} files")
# Add explicit pattern matches
pattern_files = []
patterns_used = []
if explicit_patterns:
pattern_files = self.match_by_patterns(file_index, explicit_patterns)
patterns_used.extend(explicit_patterns)
self.logger.debug(f"Explicit patterns: {len(pattern_files)} files")
# Add suggested pattern matches
if analysis.file_patterns:
suggested_files = self.match_by_patterns(file_index, analysis.file_patterns)
pattern_files.extend(suggested_files)
patterns_used.extend(analysis.file_patterns)
self.logger.debug(f"Suggested patterns: {len(suggested_files)} files")
# Combine all candidate files and remove duplicates
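        # The full index is included as a candidate pool so that ranking can surface
        # relevant files no pattern matched; always-include and pattern hits are
        # ranked alongside the rest rather than being force-selected.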
all_files = always_include_files + pattern_files + list(file_index.values())
seen_paths = set()
all_candidates = []
for file_info in all_files:
if file_info.relative_path not in seen_paths:
seen_paths.add(file_info.relative_path)
all_candidates.append(file_info)
self.logger.debug(f"Total candidates: {len(all_candidates)} files")
# Rank all candidates
ranked_files = self.rank_files(all_candidates, analysis)
self.logger.debug(f"Files above threshold: {len(ranked_files)}")
# Select best files within limits
selected_files = self.select_best_files(ranked_files, token_limit)
self.logger.info(f"Selected {len(selected_files)} files")
# Calculate statistics
total_tokens = sum(match.file_info.estimated_tokens for match in selected_files)
categories = {}
for match in selected_files:
category = match.file_info.category
categories[category] = categories.get(category, 0) + 1
# Calculate confidence score
confidence_score = self._calculate_confidence(selected_files, analysis)
return PathMatchingResult(
matched_files=selected_files,
total_tokens=total_tokens,
categories=categories,
patterns_used=patterns_used,
confidence_score=confidence_score
)
def _calculate_confidence(self, selected_files: List[MatchResult], analysis: AnalysisResult) -> float:
"""Calculate confidence score for the matching result."""
if not selected_files:
return 0.0
# Average relevance score
avg_relevance = sum(match.relevance_score for match in selected_files) / len(selected_files)
# Keyword coverage (how many keywords are represented)
keyword_coverage = 0.0
if analysis.keywords:
covered_keywords = set()
for match in selected_files:
for reason in match.match_reasons:
if reason.startswith('filename:') or reason.startswith('path:'):
keyword = reason.split(':', 1)[1]
covered_keywords.add(keyword)
keyword_coverage = len(covered_keywords) / len(analysis.keywords)
# Domain coverage
domain_coverage = 0.0
if analysis.domains:
covered_domains = set()
for match in selected_files:
for reason in match.match_reasons:
if reason.startswith('dir:'):
domain = reason.split('->', 1)[0].split(':', 1)[1]
covered_domains.add(domain)
domain_coverage = len(covered_domains) / len(analysis.domains)
# Weighted confidence score
confidence = (
avg_relevance * 0.5 +
keyword_coverage * 0.3 +
domain_coverage * 0.2
)
return min(confidence, 1.0)
def format_patterns(self, selected_files: List[MatchResult]) -> List[str]:
"""Format selected files as @{pattern} strings."""
pattern_format = self.config.get('output', {}).get('pattern_format', '@{{{path}}}')
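        # With the default '@{{{path}}}' template this renders as '@{relative/path/to/file}'.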
patterns = []
for match in selected_files:
pattern = pattern_format.format(path=match.file_info.relative_path)
patterns.append(pattern)
return patterns
def main():
"""Command-line interface for path matcher."""
    import argparse
    import yaml
from .file_indexer import FileIndexer
from .context_analyzer import ContextAnalyzer
parser = argparse.ArgumentParser(description="Path Matcher for UltraThink")
parser.add_argument("prompt", help="Prompt to analyze and match")
parser.add_argument("--config", default="config.yaml", help="Configuration file path")
parser.add_argument("--token-limit", type=int, help="Token limit for selection")
parser.add_argument("--patterns", nargs="*", help="Explicit patterns to include")
parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
args = parser.parse_args()
# Setup logging
level = logging.DEBUG if args.verbose else logging.INFO
logging.basicConfig(level=level, format='%(levelname)s: %(message)s')
# Load configuration
    # The config path is resolved relative to this module's directory.
    config_path = Path(__file__).parent / args.config
    if not config_path.exists():
        parser.error(f"Configuration file not found: {config_path}")
    with open(config_path, 'r', encoding='utf-8') as f:
        config = yaml.safe_load(f)
# Create components
indexer = FileIndexer(config)
analyzer = ContextAnalyzer(config)
matcher = PathMatcher(config)
# Build file index
file_index = indexer.load_index()
if not file_index:
print("Building file index...")
file_index = indexer.build_index()
# Analyze prompt
analysis = analyzer.analyze(args.prompt)
# Match files
result = matcher.match_files(
file_index=file_index,
analysis=analysis,
token_limit=args.token_limit,
explicit_patterns=args.patterns
)
# Output results
print(f"Matched {len(result.matched_files)} files (~{result.total_tokens:,} tokens)")
print(f"Categories: {result.categories}")
print(f"Confidence: {result.confidence_score:.2f}")
print()
patterns = matcher.format_patterns(result.matched_files)
print("Patterns:")
for pattern in patterns[:20]: # Limit output
print(f" {pattern}")
if args.verbose:
print("\nDetailed matches:")
for match in result.matched_files[:10]:
print(f" {match.file_info.relative_path} (score: {match.relevance_score:.3f})")
print(f" Reasons: {', '.join(match.match_reasons)}")
if __name__ == "__main__":
main()
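
# Example invocation (assumes this module lives in an importable package named
# `core` and that a config.yaml sits in the same directory as this file):
#   python -m core.path_matcher "add token refresh to the auth flow" --token-limit 200000 -v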