Add comprehensive tests for tokenizer, performance benchmarks, and TreeSitter parser functionality

- Implemented unit tests for the Tokenizer class, covering various text inputs, edge cases, and fallback mechanisms (an illustrative test sketch follows the commit metadata below).
- Created performance benchmarks comparing the tiktoken and pure-Python implementations of token counting.
- Developed extensive tests for TreeSitterSymbolParser across Python, JavaScript, and TypeScript, verifying accurate symbol extraction and parsing.
- Added configuration documentation for MCP integration and custom prompts.
- Introduced a refactor script for GraphAnalyzer to streamline future improvements.
catlog22
2025-12-15 14:36:09 +08:00
parent 82dcafff00
commit 0fe16963cd
49 changed files with 9307 additions and 438 deletions
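
As a rough illustration of the Tokenizer tests described in the commit message, here is a minimal pytest sketch. The import path, the count_tokens method, and the use_tiktoken flag are assumptions for illustration only and are not taken from this commit.

import pytest

from code_index.tokenizer import Tokenizer  # assumed import path


@pytest.mark.parametrize("text", ["", "hello world", "def f(x):\n    return x"])
def test_count_tokens_is_non_negative(text):
    # Token counts should never be negative, regardless of input.
    tokenizer = Tokenizer()
    assert tokenizer.count_tokens(text) >= 0


def test_pure_python_fallback():
    # Force the pure-Python path (assumed constructor flag) and check it still counts.
    tokenizer = Tokenizer(use_tiktoken=False)
    assert tokenizer.count_tokens("some input text") > 0
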


@@ -13,6 +13,8 @@ class Symbol(BaseModel):
    name: str = Field(..., min_length=1)
    kind: str = Field(..., min_length=1)
    range: Tuple[int, int] = Field(..., description="(start_line, end_line), 1-based inclusive")
    token_count: Optional[int] = Field(default=None, description="Token count for symbol content")
    symbol_type: Optional[str] = Field(default=None, description="Extended symbol type for filtering")

    @field_validator("range")
    @classmethod
@@ -26,6 +28,13 @@ class Symbol(BaseModel):
            raise ValueError("end_line must be >= start_line")
        return value

    @field_validator("token_count")
    @classmethod
    def validate_token_count(cls, value: Optional[int]) -> Optional[int]:
        if value is not None and value < 0:
            raise ValueError("token_count must be >= 0")
        return value


class SemanticChunk(BaseModel):
    """A semantically meaningful chunk of content, optionally embedded."""
@@ -61,6 +70,25 @@ class IndexedFile(BaseModel):
        return cleaned


class CodeRelationship(BaseModel):
    """A relationship between code symbols (e.g., function calls, inheritance)."""

    source_symbol: str = Field(..., min_length=1, description="Name of source symbol")
    target_symbol: str = Field(..., min_length=1, description="Name of target symbol")
    relationship_type: str = Field(..., min_length=1, description="Type of relationship (call, inherits, etc.)")
    source_file: str = Field(..., min_length=1, description="File path containing source symbol")
    target_file: Optional[str] = Field(default=None, description="File path containing target (None if same file)")
    source_line: int = Field(..., ge=1, description="Line number where relationship occurs (1-based)")

    @field_validator("relationship_type")
    @classmethod
    def validate_relationship_type(cls, value: str) -> str:
        allowed_types = {"call", "inherits", "imports"}
        if value not in allowed_types:
            raise ValueError(f"relationship_type must be one of {allowed_types}")
        return value


class SearchResult(BaseModel):
    """A unified search result for lexical or semantic search."""