Implement database migration framework and performance optimizations

- Added active memory configuration for manual interval and Gemini tool.
- Created file modification rules for handling edits and writes.
- Implemented a migration manager for applying versioned database schema migrations.
- Added migration 001 to normalize keywords into separate tables.
- Developed tests validating the performance optimizations, covering keyword normalization, path lookup, and symbol search.
- Created validation script to manually verify optimization implementations.
catlog22
2025-12-14 18:08:32 +08:00
parent 79a2953862
commit 0529b57694
18 changed files with 2085 additions and 545 deletions

View File

@@ -1123,11 +1123,11 @@ def semantic_list(
registry.initialize()
mapper = PathMapper()
project_info = registry.find_project(base_path)
project_info = registry.get_project(base_path)
if not project_info:
raise CodexLensError(f"No index found for: {base_path}. Run 'codex-lens init' first.")
index_dir = mapper.source_to_index_dir(base_path)
index_dir = Path(project_info.index_root)
if not index_dir.exists():
raise CodexLensError(f"Index directory not found: {index_dir}")

View File

@@ -375,6 +375,7 @@ class DirIndexStore:
keywords_json = json.dumps(keywords)
generated_at = time.time()
# Write to semantic_metadata table (for backward compatibility)
conn.execute(
"""
INSERT INTO semantic_metadata(file_id, summary, keywords, purpose, llm_tool, generated_at)
@@ -388,6 +389,37 @@ class DirIndexStore:
""",
(file_id, summary, keywords_json, purpose, llm_tool, generated_at),
)
# Write to normalized keywords tables for optimized search
# First, remove existing keyword associations
conn.execute("DELETE FROM file_keywords WHERE file_id = ?", (file_id,))
# Then add new keywords
for keyword in keywords:
keyword = keyword.strip()
if not keyword:
continue
# Insert keyword if it doesn't exist
conn.execute(
"INSERT OR IGNORE INTO keywords(keyword) VALUES(?)",
(keyword,)
)
# Get keyword_id
row = conn.execute(
"SELECT id FROM keywords WHERE keyword = ?",
(keyword,)
).fetchone()
if row:
keyword_id = row["id"]
# Link file to keyword
conn.execute(
"INSERT OR IGNORE INTO file_keywords(file_id, keyword_id) VALUES(?, ?)",
(file_id, keyword_id)
)
conn.commit()
def get_semantic_metadata(self, file_id: int) -> Optional[Dict[str, Any]]:
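
To make the keyword write path in the hunk above concrete, here is an illustration (not part of this diff); the file id and keyword values are invented for the example:

# Illustration only: after the write above runs for file_id=7 with
# keywords ["auth", "login"], the normalized tables hold:
#
#   keywords:  id | keyword        file_keywords:  file_id | keyword_id
#               1 | auth                                 7 | 1
#               2 | login                                7 | 2
#
# A later call for the same file first deletes its file_keywords rows, then
# re-links the new keyword set; orphaned rows in keywords are left in place.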
@@ -454,11 +486,12 @@ class DirIndexStore:
for row in rows
]
def search_semantic_keywords(self, keyword: str) -> List[Tuple[FileEntry, List[str]]]:
def search_semantic_keywords(self, keyword: str, use_normalized: bool = True) -> List[Tuple[FileEntry, List[str]]]:
"""Search files by semantic keywords.
Args:
keyword: Keyword to search for (case-insensitive)
use_normalized: Use optimized normalized tables (default: True)
Returns:
List of (FileEntry, keywords) tuples where keyword matches
@@ -466,35 +499,71 @@ class DirIndexStore:
with self._lock:
conn = self._get_connection()
keyword_pattern = f"%{keyword}%"
if use_normalized:
# Optimized query using normalized tables with indexed lookup
# Use prefix search (keyword%) for better index utilization
keyword_pattern = f"{keyword}%"
rows = conn.execute(
"""
SELECT f.id, f.name, f.full_path, f.language, f.mtime, f.line_count, sm.keywords
FROM files f
JOIN semantic_metadata sm ON f.id = sm.file_id
WHERE sm.keywords LIKE ? COLLATE NOCASE
ORDER BY f.name
""",
(keyword_pattern,),
).fetchall()
rows = conn.execute(
"""
SELECT f.id, f.name, f.full_path, f.language, f.mtime, f.line_count,
GROUP_CONCAT(k.keyword, ',') as keywords
FROM files f
JOIN file_keywords fk ON f.id = fk.file_id
JOIN keywords k ON fk.keyword_id = k.id
WHERE k.keyword LIKE ? COLLATE NOCASE
GROUP BY f.id, f.name, f.full_path, f.language, f.mtime, f.line_count
ORDER BY f.name
""",
(keyword_pattern,),
).fetchall()
import json
results = []
for row in rows:
file_entry = FileEntry(
id=int(row["id"]),
name=row["name"],
full_path=Path(row["full_path"]),
language=row["language"],
mtime=float(row["mtime"]) if row["mtime"] else 0.0,
line_count=int(row["line_count"]) if row["line_count"] else 0,
)
keywords = row["keywords"].split(',') if row["keywords"] else []
results.append((file_entry, keywords))
results = []
for row in rows:
file_entry = FileEntry(
id=int(row["id"]),
name=row["name"],
full_path=Path(row["full_path"]),
language=row["language"],
mtime=float(row["mtime"]) if row["mtime"] else 0.0,
line_count=int(row["line_count"]) if row["line_count"] else 0,
)
keywords = json.loads(row["keywords"]) if row["keywords"] else []
results.append((file_entry, keywords))
return results
return results
else:
# Fallback to original query for backward compatibility
keyword_pattern = f"%{keyword}%"
rows = conn.execute(
"""
SELECT f.id, f.name, f.full_path, f.language, f.mtime, f.line_count, sm.keywords
FROM files f
JOIN semantic_metadata sm ON f.id = sm.file_id
WHERE sm.keywords LIKE ? COLLATE NOCASE
ORDER BY f.name
""",
(keyword_pattern,),
).fetchall()
import json
results = []
for row in rows:
file_entry = FileEntry(
id=int(row["id"]),
name=row["name"],
full_path=Path(row["full_path"]),
language=row["language"],
mtime=float(row["mtime"]) if row["mtime"] else 0.0,
line_count=int(row["line_count"]) if row["line_count"] else 0,
)
keywords = json.loads(row["keywords"]) if row["keywords"] else []
results.append((file_entry, keywords))
return results
def list_semantic_metadata(
self,
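
As a usage sketch (not part of this diff), assuming a DirIndexStore instance named store opened elsewhere:

# Default path: prefix match against the normalized tables, so "auth" also
# finds "authentication" but not "oauth".
for entry, keywords in store.search_semantic_keywords("auth"):
    print(entry.full_path, keywords)

# Fallback path: substring match against the legacy JSON keywords column,
# broader (matches anywhere in the string) but unable to use an index.
for entry, keywords in store.search_semantic_keywords("token", use_normalized=False):
    print(entry.full_path, keywords)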
@@ -794,19 +863,26 @@ class DirIndexStore:
return [row["full_path"] for row in rows]
def search_symbols(
self, name: str, kind: Optional[str] = None, limit: int = 50
self, name: str, kind: Optional[str] = None, limit: int = 50, prefix_mode: bool = True
) -> List[Symbol]:
"""Search symbols by name pattern.
Args:
name: Symbol name pattern (LIKE query)
name: Symbol name pattern
kind: Optional symbol kind filter
limit: Maximum results to return
prefix_mode: If True, use prefix search (faster with index);
If False, use substring search (slower)
Returns:
List of Symbol objects
"""
pattern = f"%{name}%"
# Prefix search is much faster because it can use the index
if prefix_mode:
pattern = f"{name}%"
else:
pattern = f"%{name}%"
with self._lock:
conn = self._get_connection()
if kind:
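
A similar sketch (not part of this diff) of the new prefix_mode flag, again assuming an existing store instance:

# prefix_mode=True (default) turns the pattern into "parse%", which the
# idx_symbols_name index can serve; it will not find "json_parse".
symbols = store.search_symbols("parse")

# prefix_mode=False falls back to "%parse%", matching anywhere in the name
# at the cost of scanning the symbols table.
symbols = store.search_symbols("parse", prefix_mode=False, limit=100)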
@@ -979,6 +1055,28 @@ class DirIndexStore:
"""
)
# Normalized keywords tables for performance
conn.execute(
"""
CREATE TABLE IF NOT EXISTS keywords (
id INTEGER PRIMARY KEY,
keyword TEXT NOT NULL UNIQUE
)
"""
)
conn.execute(
"""
CREATE TABLE IF NOT EXISTS file_keywords (
file_id INTEGER NOT NULL,
keyword_id INTEGER NOT NULL,
PRIMARY KEY (file_id, keyword_id),
FOREIGN KEY (file_id) REFERENCES files (id) ON DELETE CASCADE,
FOREIGN KEY (keyword_id) REFERENCES keywords (id) ON DELETE CASCADE
)
"""
)
# Indexes
conn.execute("CREATE INDEX IF NOT EXISTS idx_files_name ON files(name)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_files_path ON files(full_path)")
@@ -986,6 +1084,9 @@ class DirIndexStore:
conn.execute("CREATE INDEX IF NOT EXISTS idx_symbols_name ON symbols(name)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_symbols_file ON symbols(file_id)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_semantic_file ON semantic_metadata(file_id)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_keywords_keyword ON keywords(keyword)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_file_keywords_file_id ON file_keywords(file_id)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_file_keywords_keyword_id ON file_keywords(keyword_id)")
except sqlite3.DatabaseError as exc:
raise StorageError(f"Failed to create schema: {exc}") from exc
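
One way to check that the new indexes are actually used (a sketch, not part of this commit; the database file name is assumed) is to run EXPLAIN QUERY PLAN from a throwaway script:

import sqlite3

conn = sqlite3.connect("_index.db")  # assumed index database path
plan = conn.execute(
    """
    EXPLAIN QUERY PLAN
    SELECT f.id FROM files f
    JOIN file_keywords fk ON f.id = fk.file_id
    JOIN keywords k ON fk.keyword_id = k.id
    WHERE k.keyword LIKE 'auth%'
    """
).fetchall()
# Whether SQLite can serve the LIKE prefix from idx_keywords_keyword depends
# on collation and the case_sensitive_like pragma; the plan output shows it.
for row in plan:
    print(row)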

View File

@@ -0,0 +1,139 @@
"""
Manages database schema migrations.
This module provides a framework for applying versioned migrations to the SQLite
database. Migrations are discovered from the `codexlens.storage.migrations`
package and applied sequentially. The database schema version is tracked using
the `user_version` pragma.
"""
import importlib
import logging
import pkgutil
from pathlib import Path
from sqlite3 import Connection
from typing import List, NamedTuple
log = logging.getLogger(__name__)
class Migration(NamedTuple):
"""Represents a single database migration."""
version: int
name: str
upgrade: callable
def discover_migrations() -> List[Migration]:
"""
Discovers and returns a sorted list of database migrations.
Migrations are expected to be in the `codexlens.storage.migrations` package,
with filenames in the format `migration_XXX_description.py`, where XXX is
the version number. Each migration module must contain an `upgrade` function
that takes a `sqlite3.Connection` object as its argument.
Returns:
A list of Migration objects, sorted by version.
"""
import codexlens.storage.migrations
migrations = []
package_path = Path(codexlens.storage.migrations.__file__).parent
for _, name, _ in pkgutil.iter_modules([str(package_path)]):
if name.startswith("migration_"):
try:
version = int(name.split("_")[1])
module = importlib.import_module(f"codexlens.storage.migrations.{name}")
if hasattr(module, "upgrade"):
migrations.append(
Migration(version=version, name=name, upgrade=module.upgrade)
)
else:
log.warning(f"Migration {name} is missing 'upgrade' function.")
except (ValueError, IndexError) as e:
log.warning(f"Could not parse migration name {name}: {e}")
except ImportError as e:
log.warning(f"Could not import migration {name}: {e}")
migrations.sort(key=lambda m: m.version)
return migrations
class MigrationManager:
"""
Manages the application of migrations to a database.
"""
def __init__(self, db_conn: Connection):
"""
Initializes the MigrationManager.
Args:
db_conn: The SQLite database connection.
"""
self.db_conn = db_conn
self.migrations = discover_migrations()
def get_current_version(self) -> int:
"""
Gets the current version of the database schema.
Returns:
The current schema version number.
"""
return self.db_conn.execute("PRAGMA user_version").fetchone()[0]
def set_version(self, version: int):
"""
Sets the database schema version.
Args:
version: The version number to set.
"""
self.db_conn.execute(f"PRAGMA user_version = {version}")
log.info(f"Database schema version set to {version}")
def apply_migrations(self):
"""
Applies all pending migrations to the database.
This method checks the current database version and applies all
subsequent migrations in order. Each migration is applied within
a transaction.
"""
current_version = self.get_current_version()
log.info(f"Current database schema version: {current_version}")
for migration in self.migrations:
if migration.version > current_version:
log.info(f"Applying migration {migration.version}: {migration.name}...")
try:
self.db_conn.execute("BEGIN")
migration.upgrade(self.db_conn)
self.set_version(migration.version)
self.db_conn.execute("COMMIT")
log.info(
f"Successfully applied migration {migration.version}: {migration.name}"
)
except Exception as e:
log.error(
f"Failed to apply migration {migration.version}: {migration.name}. Rolling back. Error: {e}",
exc_info=True,
)
self.db_conn.execute("ROLLBACK")
raise
latest_migration_version = self.migrations[-1].version if self.migrations else 0
if current_version < latest_migration_version:
# Sanity check: after the loop, the stored schema version should equal
# the latest known migration version; a mismatch means some migration
# did not record its version.
final_version = self.get_current_version()
if final_version != latest_migration_version:
log.warning(f"Database version ({final_version}) is not the latest migration version ({latest_migration_version}). This may indicate a problem.")
log.info("All pending migrations applied successfully.")

View File

@@ -0,0 +1 @@
# This file makes the 'migrations' directory a Python package.

View File

@@ -0,0 +1,108 @@
"""
Migration 001: Normalize keywords into separate tables.
This migration introduces two new tables, `keywords` and `file_keywords`, to
store semantic keywords in a normalized fashion. It then migrates the existing
keywords from the JSON `keywords` column of the `semantic_metadata` table into these
new tables. This is intended to speed up keyword-based searches significantly.
"""
import json
import logging
from sqlite3 import Connection
log = logging.getLogger(__name__)
def upgrade(db_conn: Connection):
"""
Applies the migration to normalize keywords.
- Creates `keywords` and `file_keywords` tables.
- Creates indexes for efficient querying.
- Migrates data from `semantic_metadata.keywords` to the new tables.
Args:
db_conn: The SQLite database connection.
"""
cursor = db_conn.cursor()
log.info("Creating 'keywords' and 'file_keywords' tables...")
# Create a table to store unique keywords
cursor.execute(
"""
CREATE TABLE IF NOT EXISTS keywords (
id INTEGER PRIMARY KEY,
keyword TEXT NOT NULL UNIQUE
)
"""
)
# Create a join table to link files and keywords (many-to-many)
cursor.execute(
"""
CREATE TABLE IF NOT EXISTS file_keywords (
file_id INTEGER NOT NULL,
keyword_id INTEGER NOT NULL,
PRIMARY KEY (file_id, keyword_id),
FOREIGN KEY (file_id) REFERENCES files (id) ON DELETE CASCADE,
FOREIGN KEY (keyword_id) REFERENCES keywords (id) ON DELETE CASCADE
)
"""
)
log.info("Creating indexes for new keyword tables...")
cursor.execute("CREATE INDEX IF NOT EXISTS idx_keywords_keyword ON keywords (keyword)")
cursor.execute("CREATE INDEX IF NOT EXISTS idx_file_keywords_file_id ON file_keywords (file_id)")
cursor.execute("CREATE INDEX IF NOT EXISTS idx_file_keywords_keyword_id ON file_keywords (keyword_id)")
log.info("Migrating existing keywords from 'semantic_metadata' table...")
cursor.execute("SELECT file_id, keywords FROM semantic_metadata WHERE keywords IS NOT NULL AND keywords != ''")
files_to_migrate = cursor.fetchall()
if not files_to_migrate:
log.info("No existing files with semantic metadata to migrate.")
return
log.info(f"Found {len(files_to_migrate)} files with semantic metadata to migrate.")
for file_id, keywords_json in files_to_migrate:
if not keywords_json:
continue
try:
keywords = json.loads(keywords_json)
if not isinstance(keywords, list):
log.warning(f"Keywords for file_id {file_id} is not a list, skipping.")
continue
for keyword in keywords:
if not isinstance(keyword, str):
log.warning(f"Non-string keyword '{keyword}' found for file_id {file_id}, skipping.")
continue
keyword = keyword.strip()
if not keyword:
continue
# Get or create keyword_id
cursor.execute("INSERT OR IGNORE INTO keywords (keyword) VALUES (?)", (keyword,))
cursor.execute("SELECT id FROM keywords WHERE keyword = ?", (keyword,))
keyword_id_result = cursor.fetchone()
if keyword_id_result:
keyword_id = keyword_id_result[0]
# Link file to keyword
cursor.execute(
"INSERT OR IGNORE INTO file_keywords (file_id, keyword_id) VALUES (?, ?)",
(file_id, keyword_id),
)
else:
log.error(f"Failed to retrieve or create keyword_id for keyword: {keyword}")
except json.JSONDecodeError as e:
log.warning(f"Could not parse keywords for file_id {file_id}: {e}")
except Exception as e:
log.error(f"An unexpected error occurred during migration for file_id {file_id}: {e}", exc_info=True)
log.info("Finished migrating keywords.")

View File

@@ -424,6 +424,9 @@ class RegistryStore:
Searches for the closest parent directory that has an index.
Useful for supporting subdirectory searches.
Optimized to use a single database query instead of iterating through
each parent directory level.
Args:
source_path: Source directory or file path
@@ -434,23 +437,30 @@ class RegistryStore:
conn = self._get_connection()
source_path_resolved = source_path.resolve()
# Check from current path up to root
# Build list of all parent paths from deepest to shallowest
paths_to_check = []
current = source_path_resolved
while True:
current_str = str(current)
row = conn.execute(
"SELECT * FROM dir_mapping WHERE source_path=?", (current_str,)
).fetchone()
if row:
return self._row_to_dir_mapping(row)
paths_to_check.append(str(current))
parent = current.parent
if parent == current: # Reached filesystem root
break
current = parent
return None
if not paths_to_check:
return None
# Single query with WHERE IN, ordered by path length (longest = nearest)
placeholders = ','.join('?' * len(paths_to_check))
query = f"""
SELECT * FROM dir_mapping
WHERE source_path IN ({placeholders})
ORDER BY LENGTH(source_path) DESC
LIMIT 1
"""
row = conn.execute(query, paths_to_check).fetchone()
return self._row_to_dir_mapping(row) if row else None
def get_project_dirs(self, project_id: int) -> List[DirMapping]:
"""Get all directory mappings for a project.