Add comprehensive tests for tokenizer, performance benchmarks, and TreeSitter parser functionality

- Implemented unit tests for the Tokenizer class, covering various text inputs, edge cases, and fallback mechanisms.
- Created performance benchmarks comparing tiktoken and pure Python implementations for token counting.
- Developed extensive tests for TreeSitterSymbolParser across Python, JavaScript, and TypeScript, ensuring accurate symbol extraction and parsing.
- Added configuration documentation for MCP integration and custom prompts, enhancing usability and flexibility.
- Introduced a refactor script for GraphAnalyzer to streamline future improvements.
2026-02-10 02:24:35 +08:00 · 2025-12-15 14:36:09 +08:00
parent 82dcafff00
commit 0fe16963cd
49 changed files with 9307 additions and 438 deletions
--- a/codex-lens/src/codexlens/storage/dir_index.py
+++ b/codex-lens/src/codexlens/storage/dir_index.py
@@ -149,15 +149,21 @@ class DirIndexStore:
                # Replace symbols
                conn.execute("DELETE FROM symbols WHERE file_id=?", (file_id,))
                if symbols:
+                    # Extract token_count and symbol_type from symbol metadata if available
+                    symbol_rows = []
+                    for s in symbols:
+                        token_count = getattr(s, 'token_count', None)
+                        symbol_type = getattr(s, 'symbol_type', None) or s.kind
+                        symbol_rows.append(
+                            (file_id, s.name, s.kind, s.range[0], s.range[1], token_count, symbol_type)
+                        )
+
                    conn.executemany(
                        """
-                        INSERT INTO symbols(file_id, name, kind, start_line, end_line)
-                        VALUES(?, ?, ?, ?, ?)
+                        INSERT INTO symbols(file_id, name, kind, start_line, end_line, token_count, symbol_type)
+                        VALUES(?, ?, ?, ?, ?, ?, ?)
                        """,
-                        [
-                            (file_id, s.name, s.kind, s.range[0], s.range[1])
-                            for s in symbols
-                        ],
+                        symbol_rows,
                    )

                conn.commit()
@@ -216,15 +222,21 @@ class DirIndexStore:

                    conn.execute("DELETE FROM symbols WHERE file_id=?", (file_id,))
                    if symbols:
+                        # Extract token_count and symbol_type from symbol metadata if available
+                        symbol_rows = []
+                        for s in symbols:
+                            token_count = getattr(s, 'token_count', None)
+                            symbol_type = getattr(s, 'symbol_type', None) or s.kind
+                            symbol_rows.append(
+                                (file_id, s.name, s.kind, s.range[0], s.range[1], token_count, symbol_type)
+                            )
+
                        conn.executemany(
                            """
-                            INSERT INTO symbols(file_id, name, kind, start_line, end_line)
-                            VALUES(?, ?, ?, ?, ?)
+                            INSERT INTO symbols(file_id, name, kind, start_line, end_line, token_count, symbol_type)
+                            VALUES(?, ?, ?, ?, ?, ?, ?)
                            """,
-                            [
-                                (file_id, s.name, s.kind, s.range[0], s.range[1])
-                                for s in symbols
-                            ],
+                            symbol_rows,
                        )

                conn.commit()
@@ -1021,7 +1033,9 @@ class DirIndexStore:
                    name TEXT NOT NULL,
                    kind TEXT NOT NULL,
                    start_line INTEGER,
-                    end_line INTEGER
+                    end_line INTEGER,
+                    token_count INTEGER,
+                    symbol_type TEXT
                )
                """
            )
@@ -1083,6 +1097,7 @@ class DirIndexStore:
            conn.execute("CREATE INDEX IF NOT EXISTS idx_subdirs_name ON subdirs(name)")
            conn.execute("CREATE INDEX IF NOT EXISTS idx_symbols_name ON symbols(name)")
            conn.execute("CREATE INDEX IF NOT EXISTS idx_symbols_file ON symbols(file_id)")
+            conn.execute("CREATE INDEX IF NOT EXISTS idx_symbols_type ON symbols(symbol_type)")
            conn.execute("CREATE INDEX IF NOT EXISTS idx_semantic_file ON semantic_metadata(file_id)")
            conn.execute("CREATE INDEX IF NOT EXISTS idx_keywords_keyword ON keywords(keyword)")
            conn.execute("CREATE INDEX IF NOT EXISTS idx_file_keywords_file_id ON file_keywords(file_id)")
--- a/codex-lens/src/codexlens/storage/migrations/migration_002_add_token_metadata.py
+++ b/codex-lens/src/codexlens/storage/migrations/migration_002_add_token_metadata.py
@@ -0,0 +1,48 @@
+"""
+Migration 002: Add token_count and symbol_type to symbols table.
+
+This migration adds token counting metadata to symbols for accurate chunk
+splitting and performance optimization. It also adds symbol_type for better
+filtering in searches.
+"""
+
+import logging
+from sqlite3 import Connection
+
+log = logging.getLogger(__name__)
+
+
+def upgrade(db_conn: Connection):
+    """
+    Apply migration 002: add token metadata columns to the symbols table.
+
+    - Adds token_count column (INTEGER) to symbols table
+    - Adds symbol_type column (TEXT) to symbols table
+    - Creates index on symbol_type for efficient filtering
+    - Issues no backfill: existing rows keep NULL in the new columns; nothing here computes token counts
+
+    Args:
+        db_conn: Open SQLite connection; this function does not call commit() on it.
+    """
+    cursor = db_conn.cursor()
+
+    log.info("Adding token_count column to symbols table...")
+    try:
+        cursor.execute("ALTER TABLE symbols ADD COLUMN token_count INTEGER")
+        log.info("Successfully added token_count column.")
+    except Exception as e:
+        # Assumed cause is a pre-existing column, but this catches any Exception and only logs it
+        log.warning(f"Could not add token_count column (might already exist): {e}")
+
+    log.info("Adding symbol_type column to symbols table...")
+    try:
+        cursor.execute("ALTER TABLE symbols ADD COLUMN symbol_type TEXT")
+        log.info("Successfully added symbol_type column.")
+    except Exception as e:
+        # Assumed cause is a pre-existing column, but this catches any Exception and only logs it
+        log.warning(f"Could not add symbol_type column (might already exist): {e}")
+
+    log.info("Creating index on symbol_type for efficient filtering...")
+    cursor.execute("CREATE INDEX IF NOT EXISTS idx_symbols_type ON symbols(symbol_type)")  # outside any try: errors here propagate
+
+    log.info("Migration 002 completed successfully.")
--- a/codex-lens/src/codexlens/storage/migrations/migration_003_code_relationships.py
+++ b/codex-lens/src/codexlens/storage/migrations/migration_003_code_relationships.py
@@ -0,0 +1,57 @@
+"""
+Migration 003: Add code relationships storage.
+
+This migration introduces the `code_relationships` table to store semantic
+relationships between code symbols (function calls, inheritance, imports).
+This enables graph-based code navigation and dependency analysis.
+"""
+
+import logging
+from sqlite3 import Connection
+
+log = logging.getLogger(__name__)
+
+
+def upgrade(db_conn: Connection):
+    """
+    Apply migration 003: create the code_relationships table and its indexes.
+
+    - Creates `code_relationships` (if absent) with source_symbol_id referencing symbols(id) ON DELETE CASCADE
+    - Creates indexes on source_symbol_id, target_qualified_name, relationship_type and source_line
+    - Stores targets as plain text in target_qualified_name (no foreign key), intended to support lazy expansion
+
+    Args:
+        db_conn: Open SQLite connection; this function does not call commit() on it.
+    """
+    cursor = db_conn.cursor()
+    # NOTE(review): SQLite enforces the table's FOREIGN KEY / CASCADE only when PRAGMA foreign_keys is ON — confirm
+    log.info("Creating 'code_relationships' table...")
+    cursor.execute(
+        """
+        CREATE TABLE IF NOT EXISTS code_relationships (
+            id INTEGER PRIMARY KEY,
+            source_symbol_id INTEGER NOT NULL,
+            target_qualified_name TEXT NOT NULL,
+            relationship_type TEXT NOT NULL,
+            source_line INTEGER NOT NULL,
+            target_file TEXT,
+            FOREIGN KEY (source_symbol_id) REFERENCES symbols (id) ON DELETE CASCADE
+        )
+        """
+    )
+
+    log.info("Creating indexes for code_relationships...")
+    cursor.execute(
+        "CREATE INDEX IF NOT EXISTS idx_relationships_source ON code_relationships (source_symbol_id)"
+    )
+    cursor.execute(
+        "CREATE INDEX IF NOT EXISTS idx_relationships_target ON code_relationships (target_qualified_name)"
+    )
+    cursor.execute(
+        "CREATE INDEX IF NOT EXISTS idx_relationships_type ON code_relationships (relationship_type)"
+    )
+    cursor.execute(
+        "CREATE INDEX IF NOT EXISTS idx_relationships_source_line ON code_relationships (source_line)"
+    )
+
+    log.info("Finished creating code_relationships table and indexes.")
--- a/codex-lens/src/codexlens/storage/sqlite_store.py
+++ b/codex-lens/src/codexlens/storage/sqlite_store.py
@@ -9,7 +9,7 @@ from dataclasses import asdict
 from pathlib import Path
 from typing import Any, Dict, Iterable, List, Optional

-from codexlens.entities import IndexedFile, SearchResult, Symbol
+from codexlens.entities import CodeRelationship, IndexedFile, SearchResult, Symbol
 from codexlens.errors import StorageError


@@ -309,13 +309,184 @@ class SQLiteStore:
                "SELECT language, COUNT(*) AS c FROM files GROUP BY language ORDER BY c DESC"
            ).fetchall()
            languages = {row["language"]: row["c"] for row in lang_rows}
+            # Include relationship count if table exists
+            relationship_count = 0
+            try:
+                rel_row = conn.execute("SELECT COUNT(*) AS c FROM code_relationships").fetchone()
+                relationship_count = int(rel_row["c"]) if rel_row else 0
+            except sqlite3.DatabaseError:
+                pass
+
            return {
                "files": int(file_count),
                "symbols": int(symbol_count),
+                "relationships": relationship_count,
                "languages": languages,
                "db_path": str(self.db_path),
            }

+
+    def add_relationships(self, file_path: str | Path, relationships: List[CodeRelationship]) -> None:
+        """Replace the stored code relationships for an already-indexed file.
+
+        Args:
+            file_path: Path to the file; resolved to an absolute path and looked up in `files` (StorageError if absent)
+            relationships: CodeRelationship objects to store; an empty list returns at once, leaving existing rows untouched
+        """
+        if not relationships:
+            return
+
+        with self._lock:
+            conn = self._get_connection()
+            resolved_path = str(Path(file_path).resolve())
+
+            # Look up the file's row id; the file must already be in the index
+            row = conn.execute("SELECT id FROM files WHERE path=?", (resolved_path,)).fetchone()
+            if not row:
+                raise StorageError(f"File not found in index: {file_path}")
+            file_id = int(row["id"])
+
+            # Replace semantics: drop every relationship whose source symbol belongs to this file
+            conn.execute(
+                """
+                DELETE FROM code_relationships
+                WHERE source_symbol_id IN (
+                    SELECT id FROM symbols WHERE file_id=?
+                )
+                """,
+                (file_id,)
+            )
+
+            # Build one row per relationship whose source symbol can be resolved
+            relationship_rows = []
+            for rel in relationships:
+                # Pick the smallest-span symbol with this name whose line range contains source_line
+                symbol_row = conn.execute(
+                    """
+                    SELECT id FROM symbols
+                    WHERE file_id=? AND name=? AND start_line <= ? AND end_line >= ?
+                    ORDER BY (end_line - start_line) ASC
+                    LIMIT 1
+                    """,
+                    (file_id, rel.source_symbol, rel.source_line, rel.source_line)
+                ).fetchone()
+
+                if symbol_row:  # relationships with no matching symbol are silently skipped
+                    source_symbol_id = int(symbol_row["id"])
+                    relationship_rows.append((
+                        source_symbol_id,
+                        rel.target_symbol,
+                        rel.relationship_type,
+                        rel.source_line,
+                        rel.target_file
+                    ))
+
+            if relationship_rows:
+                conn.executemany(
+                    """
+                    INSERT INTO code_relationships(
+                        source_symbol_id, target_qualified_name, relationship_type,
+                        source_line, target_file
+                    )
+                    VALUES(?, ?, ?, ?, ?)
+                    """,
+                    relationship_rows
+                )
+            conn.commit()  # also commits the delete when no rows were inserted
+
+    def query_relationships_by_target(
+        self, target_name: str, *, limit: int = 100
+    ) -> List[Dict[str, Any]]:
+        """Query relationships by exact target name (e.g. find all callers of a symbol).
+
+        Args:
+            target_name: Target name, compared with `=` against the stored target_qualified_name
+            limit: Maximum number of rows returned (passed to SQL LIMIT)
+
+        Returns:
+            Dicts with keys source_symbol, target_symbol, relationship_type, source_line, source_file, target_file; ordered by source file path, then source line
+        """
+        with self._lock:
+            conn = self._get_connection()
+            rows = conn.execute(
+                """
+                SELECT
+                    s.name AS source_symbol,
+                    r.target_qualified_name,
+                    r.relationship_type,
+                    r.source_line,
+                    f.path AS source_file,
+                    r.target_file
+                FROM code_relationships r
+                JOIN symbols s ON r.source_symbol_id = s.id
+                JOIN files f ON s.file_id = f.id
+                WHERE r.target_qualified_name = ?
+                ORDER BY f.path, r.source_line
+                LIMIT ?
+                """,
+                (target_name, limit)
+            ).fetchall()
+
+            return [
+                {
+                    "source_symbol": row["source_symbol"],
+                    "target_symbol": row["target_qualified_name"],
+                    "relationship_type": row["relationship_type"],
+                    "source_line": row["source_line"],
+                    "source_file": row["source_file"],
+                    "target_file": row["target_file"],
+                }
+                for row in rows
+            ]
+
+    def query_relationships_by_source(
+        self, source_symbol: str, source_file: str | Path, *, limit: int = 100
+    ) -> List[Dict[str, Any]]:
+        """Query relationships by source symbol (e.g. find what a symbol calls).
+
+        Args:
+            source_symbol: Source symbol name, compared with `=` against symbols.name; same-named symbols in the file all match
+            source_file: File containing the source symbol; resolved to an absolute path before comparing with files.path
+            limit: Maximum number of rows returned (passed to SQL LIMIT)
+
+        Returns:
+            Dicts with keys source_symbol, target_symbol, relationship_type, source_line, source_file, target_file; ordered by source line
+        """
+        with self._lock:
+            conn = self._get_connection()
+            resolved_path = str(Path(source_file).resolve())
+
+            rows = conn.execute(
+                """
+                SELECT
+                    s.name AS source_symbol,
+                    r.target_qualified_name,
+                    r.relationship_type,
+                    r.source_line,
+                    f.path AS source_file,
+                    r.target_file
+                FROM code_relationships r
+                JOIN symbols s ON r.source_symbol_id = s.id
+                JOIN files f ON s.file_id = f.id
+                WHERE s.name = ? AND f.path = ?
+                ORDER BY r.source_line
+                LIMIT ?
+                """,
+                (source_symbol, resolved_path, limit)
+            ).fetchall()
+
+            return [
+                {
+                    "source_symbol": row["source_symbol"],
+                    "target_symbol": row["target_qualified_name"],
+                    "relationship_type": row["relationship_type"],
+                    "source_line": row["source_line"],
+                    "source_file": row["source_file"],
+                    "target_file": row["target_file"],
+                }
+                for row in rows
+            ]
+
    def _connect(self) -> sqlite3.Connection:
        """Legacy method for backward compatibility."""
        return self._get_connection()
@@ -348,6 +519,20 @@ class SQLiteStore:
            )
            conn.execute("CREATE INDEX IF NOT EXISTS idx_symbols_name ON symbols(name)")
            conn.execute("CREATE INDEX IF NOT EXISTS idx_symbols_kind ON symbols(kind)")
+            conn.execute(
+                """
+                CREATE TABLE IF NOT EXISTS code_relationships (
+                    id INTEGER PRIMARY KEY,
+                    source_symbol_id INTEGER NOT NULL REFERENCES symbols(id) ON DELETE CASCADE,
+                    target_qualified_name TEXT NOT NULL,
+                    relationship_type TEXT NOT NULL,
+                    source_line INTEGER NOT NULL,
+                    target_file TEXT
+                )
+                """
+            )
+            conn.execute("CREATE INDEX IF NOT EXISTS idx_rel_target ON code_relationships(target_qualified_name)")
+            conn.execute("CREATE INDEX IF NOT EXISTS idx_rel_source ON code_relationships(source_symbol_id)")
            conn.commit()
        except sqlite3.DatabaseError as exc:
            raise StorageError(f"Failed to initialize database schema: {exc}") from exc