Implement database migration framework and performance optimizations

- Added active memory configuration for manual interval and Gemini tool.
- Created file modification rules for handling edits and writes.
- Implemented a migration manager for applying versioned database schema migrations.
- Added migration 001 to normalize keywords into separate tables.
- Developed tests validating the performance optimizations, covering keyword normalization, path lookup, and symbol search.
- Created validation script to manually verify optimization implementations.
catlog22
2025-12-14 18:08:32 +08:00
parent 79a2953862
commit 0529b57694
18 changed files with 2085 additions and 545 deletions

View File

@@ -1123,11 +1123,11 @@ def semantic_list(
registry.initialize()
mapper = PathMapper()
project_info = registry.find_project(base_path)
project_info = registry.get_project(base_path)
if not project_info:
raise CodexLensError(f"No index found for: {base_path}. Run 'codex-lens init' first.")
index_dir = mapper.source_to_index_dir(base_path)
index_dir = Path(project_info.index_root)
if not index_dir.exists():
raise CodexLensError(f"Index directory not found: {index_dir}")

View File

@@ -375,6 +375,7 @@ class DirIndexStore:
keywords_json = json.dumps(keywords)
generated_at = time.time()
# Write to semantic_metadata table (for backward compatibility)
conn.execute(
"""
INSERT INTO semantic_metadata(file_id, summary, keywords, purpose, llm_tool, generated_at)
@@ -388,6 +389,37 @@ class DirIndexStore:
""",
(file_id, summary, keywords_json, purpose, llm_tool, generated_at),
)
# Write to normalized keywords tables for optimized search
# First, remove existing keyword associations
conn.execute("DELETE FROM file_keywords WHERE file_id = ?", (file_id,))
# Then add new keywords
for keyword in keywords:
keyword = keyword.strip()
if not keyword:
continue
# Insert keyword if it doesn't exist
conn.execute(
"INSERT OR IGNORE INTO keywords(keyword) VALUES(?)",
(keyword,)
)
# Get keyword_id
row = conn.execute(
"SELECT id FROM keywords WHERE keyword = ?",
(keyword,)
).fetchone()
if row:
keyword_id = row["id"]
# Link file to keyword
conn.execute(
"INSERT OR IGNORE INTO file_keywords(file_id, keyword_id) VALUES(?, ?)",
(file_id, keyword_id)
)
conn.commit()
def get_semantic_metadata(self, file_id: int) -> Optional[Dict[str, Any]]:
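
To make the keyword write path in the hunk above concrete, here is an illustration (not part of this diff); the file id and keyword values are invented for the example:

# Illustration only: after the write above runs for file_id=7 with
# keywords ["auth", "login"], the normalized tables hold:
#
#   keywords:  id | keyword        file_keywords:  file_id | keyword_id
#               1 | auth                                 7 | 1
#               2 | login                                7 | 2
#
# A later call for the same file first deletes its file_keywords rows, then
# re-links the new keyword set; orphaned rows in keywords are left in place.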
@@ -454,11 +486,12 @@ class DirIndexStore:
for row in rows
]
def search_semantic_keywords(self, keyword: str) -> List[Tuple[FileEntry, List[str]]]:
def search_semantic_keywords(self, keyword: str, use_normalized: bool = True) -> List[Tuple[FileEntry, List[str]]]:
"""Search files by semantic keywords.
Args:
keyword: Keyword to search for (case-insensitive)
use_normalized: Use optimized normalized tables (default: True)
Returns:
List of (FileEntry, keywords) tuples where keyword matches
@@ -466,35 +499,71 @@ class DirIndexStore:
with self._lock:
conn = self._get_connection()
keyword_pattern = f"%{keyword}%"
if use_normalized:
# Optimized query using normalized tables with indexed lookup
# Use prefix search (keyword%) for better index utilization
keyword_pattern = f"{keyword}%"
rows = conn.execute(
"""
SELECT f.id, f.name, f.full_path, f.language, f.mtime, f.line_count, sm.keywords
FROM files f
JOIN semantic_metadata sm ON f.id = sm.file_id
WHERE sm.keywords LIKE ? COLLATE NOCASE
ORDER BY f.name
""",
(keyword_pattern,),
).fetchall()
rows = conn.execute(
"""
SELECT f.id, f.name, f.full_path, f.language, f.mtime, f.line_count,
GROUP_CONCAT(k.keyword, ',') as keywords
FROM files f
JOIN file_keywords fk ON f.id = fk.file_id
JOIN keywords k ON fk.keyword_id = k.id
WHERE k.keyword LIKE ? COLLATE NOCASE
GROUP BY f.id, f.name, f.full_path, f.language, f.mtime, f.line_count
ORDER BY f.name
""",
(keyword_pattern,),
).fetchall()
import json
results = []
for row in rows:
file_entry = FileEntry(
id=int(row["id"]),
name=row["name"],
full_path=Path(row["full_path"]),
language=row["language"],
mtime=float(row["mtime"]) if row["mtime"] else 0.0,
line_count=int(row["line_count"]) if row["line_count"] else 0,
)
keywords = row["keywords"].split(',') if row["keywords"] else []
results.append((file_entry, keywords))
results = []
for row in rows:
file_entry = FileEntry(
id=int(row["id"]),
name=row["name"],
full_path=Path(row["full_path"]),
language=row["language"],
mtime=float(row["mtime"]) if row["mtime"] else 0.0,
line_count=int(row["line_count"]) if row["line_count"] else 0,
)
keywords = json.loads(row["keywords"]) if row["keywords"] else []
results.append((file_entry, keywords))
return results
return results
else:
# Fallback to original query for backward compatibility
keyword_pattern = f"%{keyword}%"
rows = conn.execute(
"""
SELECT f.id, f.name, f.full_path, f.language, f.mtime, f.line_count, sm.keywords
FROM files f
JOIN semantic_metadata sm ON f.id = sm.file_id
WHERE sm.keywords LIKE ? COLLATE NOCASE
ORDER BY f.name
""",
(keyword_pattern,),
).fetchall()
import json
results = []
for row in rows:
file_entry = FileEntry(
id=int(row["id"]),
name=row["name"],
full_path=Path(row["full_path"]),
language=row["language"],
mtime=float(row["mtime"]) if row["mtime"] else 0.0,
line_count=int(row["line_count"]) if row["line_count"] else 0,
)
keywords = json.loads(row["keywords"]) if row["keywords"] else []
results.append((file_entry, keywords))
return results
def list_semantic_metadata(
self,
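
As a usage sketch (not part of this diff), assuming a DirIndexStore instance named store opened elsewhere:

# Default path: prefix match against the normalized tables, so "auth" also
# finds "authentication" but not "oauth".
for entry, keywords in store.search_semantic_keywords("auth"):
    print(entry.full_path, keywords)

# Fallback path: substring match against the legacy JSON keywords column,
# broader (matches anywhere in the string) but unable to use an index.
for entry, keywords in store.search_semantic_keywords("token", use_normalized=False):
    print(entry.full_path, keywords)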
@@ -794,19 +863,26 @@ class DirIndexStore:
return [row["full_path"] for row in rows]
def search_symbols(
self, name: str, kind: Optional[str] = None, limit: int = 50
self, name: str, kind: Optional[str] = None, limit: int = 50, prefix_mode: bool = True
) -> List[Symbol]:
"""Search symbols by name pattern.
Args:
name: Symbol name pattern (LIKE query)
name: Symbol name pattern
kind: Optional symbol kind filter
limit: Maximum results to return
prefix_mode: If True, use prefix search (faster with index);
If False, use substring search (slower)
Returns:
List of Symbol objects
"""
pattern = f"%{name}%"
# Prefix search is much faster because it can use the index
if prefix_mode:
pattern = f"{name}%"
else:
pattern = f"%{name}%"
with self._lock:
conn = self._get_connection()
if kind:
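
A similar sketch (not part of this diff) of the new prefix_mode flag, again assuming an existing store instance:

# prefix_mode=True (default) turns the pattern into "parse%", which the
# idx_symbols_name index can serve; it will not find "json_parse".
symbols = store.search_symbols("parse")

# prefix_mode=False falls back to "%parse%", matching anywhere in the name
# at the cost of scanning the symbols table.
symbols = store.search_symbols("parse", prefix_mode=False, limit=100)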
@@ -979,6 +1055,28 @@ class DirIndexStore:
"""
)
# Normalized keywords tables for performance
conn.execute(
"""
CREATE TABLE IF NOT EXISTS keywords (
id INTEGER PRIMARY KEY,
keyword TEXT NOT NULL UNIQUE
)
"""
)
conn.execute(
"""
CREATE TABLE IF NOT EXISTS file_keywords (
file_id INTEGER NOT NULL,
keyword_id INTEGER NOT NULL,
PRIMARY KEY (file_id, keyword_id),
FOREIGN KEY (file_id) REFERENCES files (id) ON DELETE CASCADE,
FOREIGN KEY (keyword_id) REFERENCES keywords (id) ON DELETE CASCADE
)
"""
)
# Indexes
conn.execute("CREATE INDEX IF NOT EXISTS idx_files_name ON files(name)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_files_path ON files(full_path)")
@@ -986,6 +1084,9 @@ class DirIndexStore:
conn.execute("CREATE INDEX IF NOT EXISTS idx_symbols_name ON symbols(name)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_symbols_file ON symbols(file_id)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_semantic_file ON semantic_metadata(file_id)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_keywords_keyword ON keywords(keyword)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_file_keywords_file_id ON file_keywords(file_id)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_file_keywords_keyword_id ON file_keywords(keyword_id)")
except sqlite3.DatabaseError as exc:
raise StorageError(f"Failed to create schema: {exc}") from exc
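
One way to check that the new indexes are actually used (a sketch, not part of this commit; the database file name is assumed) is to run EXPLAIN QUERY PLAN from a throwaway script:

import sqlite3

conn = sqlite3.connect("_index.db")  # assumed index database path
plan = conn.execute(
    """
    EXPLAIN QUERY PLAN
    SELECT f.id FROM files f
    JOIN file_keywords fk ON f.id = fk.file_id
    JOIN keywords k ON fk.keyword_id = k.id
    WHERE k.keyword LIKE 'auth%'
    """
).fetchall()
# Whether SQLite can serve the LIKE prefix from idx_keywords_keyword depends
# on collation and the case_sensitive_like pragma; the plan output shows it.
for row in plan:
    print(row)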

View File

@@ -0,0 +1,139 @@
"""
Manages database schema migrations.
This module provides a framework for applying versioned migrations to the SQLite
database. Migrations are discovered from the `codexlens.storage.migrations`
package and applied sequentially. The database schema version is tracked using
the `user_version` pragma.
"""
import importlib
import logging
import pkgutil
from pathlib import Path
from sqlite3 import Connection
from typing import List, NamedTuple
log = logging.getLogger(__name__)
class Migration(NamedTuple):
"""Represents a single database migration."""
version: int
name: str
upgrade: callable
def discover_migrations() -> List[Migration]:
"""
Discovers and returns a sorted list of database migrations.
Migrations are expected to be in the `codexlens.storage.migrations` package,
with filenames in the format `migration_XXX_description.py`, where XXX is
the version number. Each migration module must contain an `upgrade` function
that takes a `sqlite3.Connection` object as its argument.
Returns:
A list of Migration objects, sorted by version.
"""
import codexlens.storage.migrations
migrations = []
package_path = Path(codexlens.storage.migrations.__file__).parent
for _, name, _ in pkgutil.iter_modules([str(package_path)]):
if name.startswith("migration_"):
try:
version = int(name.split("_")[1])
module = importlib.import_module(f"codexlens.storage.migrations.{name}")
if hasattr(module, "upgrade"):
migrations.append(
Migration(version=version, name=name, upgrade=module.upgrade)
)
else:
log.warning(f"Migration {name} is missing 'upgrade' function.")
except (ValueError, IndexError) as e:
log.warning(f"Could not parse migration name {name}: {e}")
except ImportError as e:
log.warning(f"Could not import migration {name}: {e}")
migrations.sort(key=lambda m: m.version)
return migrations
class MigrationManager:
"""
Manages the application of migrations to a database.
"""
def __init__(self, db_conn: Connection):
"""
Initializes the MigrationManager.
Args:
db_conn: The SQLite database connection.
"""
self.db_conn = db_conn
self.migrations = discover_migrations()
def get_current_version(self) -> int:
"""
Gets the current version of the database schema.
Returns:
The current schema version number.
"""
return self.db_conn.execute("PRAGMA user_version").fetchone()[0]
def set_version(self, version: int):
"""
Sets the database schema version.
Args:
version: The version number to set.
"""
self.db_conn.execute(f"PRAGMA user_version = {version}")
log.info(f"Database schema version set to {version}")
def apply_migrations(self):
"""
Applies all pending migrations to the database.
This method checks the current database version and applies all
subsequent migrations in order. Each migration is applied within
a transaction.
"""
current_version = self.get_current_version()
log.info(f"Current database schema version: {current_version}")
for migration in self.migrations:
if migration.version > current_version:
log.info(f"Applying migration {migration.version}: {migration.name}...")
try:
self.db_conn.execute("BEGIN")
migration.upgrade(self.db_conn)
self.set_version(migration.version)
self.db_conn.execute("COMMIT")
log.info(
f"Successfully applied migration {migration.version}: {migration.name}"
)
except Exception as e:
log.error(
f"Failed to apply migration {migration.version}: {migration.name}. Rolling back. Error: {e}",
exc_info=True,
)
self.db_conn.execute("ROLLBACK")
raise
latest_migration_version = self.migrations[-1].version if self.migrations else 0
if current_version < latest_migration_version:
# Sanity check: after the loop, the stored schema version should equal
# the latest known migration version; a mismatch means some migration
# did not record its version.
final_version = self.get_current_version()
if final_version != latest_migration_version:
log.warning(f"Database version ({final_version}) is not the latest migration version ({latest_migration_version}). This may indicate a problem.")
log.info("All pending migrations applied successfully.")

View File

@@ -0,0 +1 @@
# This file makes the 'migrations' directory a Python package.

View File

@@ -0,0 +1,108 @@
"""
Migration 001: Normalize keywords into separate tables.
This migration introduces two new tables, `keywords` and `file_keywords`, to
store semantic keywords in a normalized fashion. It then migrates the existing
keywords from the JSON `keywords` column of the `semantic_metadata` table into these
new tables. This is intended to speed up keyword-based searches significantly.
"""
import json
import logging
from sqlite3 import Connection
log = logging.getLogger(__name__)
def upgrade(db_conn: Connection):
"""
Applies the migration to normalize keywords.
- Creates `keywords` and `file_keywords` tables.
- Creates indexes for efficient querying.
- Migrates data from `semantic_metadata.keywords` to the new tables.
Args:
db_conn: The SQLite database connection.
"""
cursor = db_conn.cursor()
log.info("Creating 'keywords' and 'file_keywords' tables...")
# Create a table to store unique keywords
cursor.execute(
"""
CREATE TABLE IF NOT EXISTS keywords (
id INTEGER PRIMARY KEY,
keyword TEXT NOT NULL UNIQUE
)
"""
)
# Create a join table to link files and keywords (many-to-many)
cursor.execute(
"""
CREATE TABLE IF NOT EXISTS file_keywords (
file_id INTEGER NOT NULL,
keyword_id INTEGER NOT NULL,
PRIMARY KEY (file_id, keyword_id),
FOREIGN KEY (file_id) REFERENCES files (id) ON DELETE CASCADE,
FOREIGN KEY (keyword_id) REFERENCES keywords (id) ON DELETE CASCADE
)
"""
)
log.info("Creating indexes for new keyword tables...")
cursor.execute("CREATE INDEX IF NOT EXISTS idx_keywords_keyword ON keywords (keyword)")
cursor.execute("CREATE INDEX IF NOT EXISTS idx_file_keywords_file_id ON file_keywords (file_id)")
cursor.execute("CREATE INDEX IF NOT EXISTS idx_file_keywords_keyword_id ON file_keywords (keyword_id)")
log.info("Migrating existing keywords from 'semantic_metadata' table...")
cursor.execute("SELECT file_id, keywords FROM semantic_metadata WHERE keywords IS NOT NULL AND keywords != ''")
files_to_migrate = cursor.fetchall()
if not files_to_migrate:
log.info("No existing files with semantic metadata to migrate.")
return
log.info(f"Found {len(files_to_migrate)} files with semantic metadata to migrate.")
for file_id, keywords_json in files_to_migrate:
if not keywords_json:
continue
try:
keywords = json.loads(keywords_json)
if not isinstance(keywords, list):
log.warning(f"Keywords for file_id {file_id} is not a list, skipping.")
continue
for keyword in keywords:
if not isinstance(keyword, str):
log.warning(f"Non-string keyword '{keyword}' found for file_id {file_id}, skipping.")
continue
keyword = keyword.strip()
if not keyword:
continue
# Get or create keyword_id
cursor.execute("INSERT OR IGNORE INTO keywords (keyword) VALUES (?)", (keyword,))
cursor.execute("SELECT id FROM keywords WHERE keyword = ?", (keyword,))
keyword_id_result = cursor.fetchone()
if keyword_id_result:
keyword_id = keyword_id_result[0]
# Link file to keyword
cursor.execute(
"INSERT OR IGNORE INTO file_keywords (file_id, keyword_id) VALUES (?, ?)",
(file_id, keyword_id),
)
else:
log.error(f"Failed to retrieve or create keyword_id for keyword: {keyword}")
except json.JSONDecodeError as e:
log.warning(f"Could not parse keywords for file_id {file_id}: {e}")
except Exception as e:
log.error(f"An unexpected error occurred during migration for file_id {file_id}: {e}", exc_info=True)
log.info("Finished migrating keywords.")

View File

@@ -424,6 +424,9 @@ class RegistryStore:
Searches for the closest parent directory that has an index.
Useful for supporting subdirectory searches.
Optimized to use a single database query instead of iterating through
each parent directory level.
Args:
source_path: Source directory or file path
@@ -434,23 +437,30 @@ class RegistryStore:
conn = self._get_connection()
source_path_resolved = source_path.resolve()
# Check from current path up to root
# Build list of all parent paths from deepest to shallowest
paths_to_check = []
current = source_path_resolved
while True:
current_str = str(current)
row = conn.execute(
"SELECT * FROM dir_mapping WHERE source_path=?", (current_str,)
).fetchone()
if row:
return self._row_to_dir_mapping(row)
paths_to_check.append(str(current))
parent = current.parent
if parent == current: # Reached filesystem root
break
current = parent
return None
if not paths_to_check:
return None
# Single query with WHERE IN, ordered by path length (longest = nearest)
placeholders = ','.join('?' * len(paths_to_check))
query = f"""
SELECT * FROM dir_mapping
WHERE source_path IN ({placeholders})
ORDER BY LENGTH(source_path) DESC
LIMIT 1
"""
row = conn.execute(query, paths_to_check).fetchone()
return self._row_to_dir_mapping(row) if row else None
def get_project_dirs(self, project_id: int) -> List[DirMapping]:
"""Get all directory mappings for a project.