Add comprehensive tests for tokenizer, performance benchmarks, and TreeSitter parser functionality

- Implemented unit tests for the Tokenizer class, covering various text inputs, edge cases, and fallback mechanisms (an illustrative test sketch follows the commit metadata below).
- Created performance benchmarks comparing the tiktoken and pure-Python implementations of token counting.
- Developed extensive tests for TreeSitterSymbolParser across Python, JavaScript, and TypeScript, verifying accurate symbol extraction and parsing.
- Added configuration documentation for MCP integration and custom prompts.
- Introduced a refactor script for GraphAnalyzer to streamline future improvements.
catlog22
2025-12-15 14:36:09 +08:00
parent 82dcafff00
commit 0fe16963cd
49 changed files with 9307 additions and 438 deletions
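
As a rough illustration of the Tokenizer tests described in the commit message, here is a minimal pytest sketch. The import path, the count_tokens method, and the use_tiktoken flag are assumptions for illustration only and are not taken from this commit.

import pytest

from code_index.tokenizer import Tokenizer  # assumed import path


@pytest.mark.parametrize("text", ["", "hello world", "def f(x):\n    return x"])
def test_count_tokens_is_non_negative(text):
    # Token counts should never be negative, regardless of input.
    tokenizer = Tokenizer()
    assert tokenizer.count_tokens(text) >= 0


def test_pure_python_fallback():
    # Force the pure-Python path (assumed constructor flag) and check it still counts.
    tokenizer = Tokenizer(use_tiktoken=False)
    assert tokenizer.count_tokens("some input text") > 0
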


@@ -13,6 +13,8 @@ class Symbol(BaseModel):
    name: str = Field(..., min_length=1)
    kind: str = Field(..., min_length=1)
    range: Tuple[int, int] = Field(..., description="(start_line, end_line), 1-based inclusive")
    token_count: Optional[int] = Field(default=None, description="Token count for symbol content")
    symbol_type: Optional[str] = Field(default=None, description="Extended symbol type for filtering")

    @field_validator("range")
    @classmethod
@@ -26,6 +28,13 @@ class Symbol(BaseModel):
            raise ValueError("end_line must be >= start_line")
        return value

    @field_validator("token_count")
    @classmethod
    def validate_token_count(cls, value: Optional[int]) -> Optional[int]:
        if value is not None and value < 0:
            raise ValueError("token_count must be >= 0")
        return value


class SemanticChunk(BaseModel):
    """A semantically meaningful chunk of content, optionally embedded."""
@@ -61,6 +70,25 @@ class IndexedFile(BaseModel):
        return cleaned


class CodeRelationship(BaseModel):
    """A relationship between code symbols (e.g., function calls, inheritance)."""

    source_symbol: str = Field(..., min_length=1, description="Name of source symbol")
    target_symbol: str = Field(..., min_length=1, description="Name of target symbol")
    relationship_type: str = Field(..., min_length=1, description="Type of relationship (call, inherits, etc.)")
    source_file: str = Field(..., min_length=1, description="File path containing source symbol")
    target_file: Optional[str] = Field(default=None, description="File path containing target (None if same file)")
    source_line: int = Field(..., ge=1, description="Line number where relationship occurs (1-based)")

    @field_validator("relationship_type")
    @classmethod
    def validate_relationship_type(cls, value: str) -> str:
        allowed_types = {"call", "inherits", "imports"}
        if value not in allowed_types:
            raise ValueError(f"relationship_type must be one of {allowed_types}")
        return value


class SearchResult(BaseModel):
    """A unified search result for lexical or semantic search."""