Add comprehensive tests for schema cleanup migration and search comparison

- Implement tests for migration 005 to verify removal of deprecated fields from the database schema (a test sketch follows this list).
- Ensure that new databases are created with a clean schema.
- Validate that keywords are correctly extracted from the normalized file_keywords table.
- Test symbol insertion without deprecated fields and subdir operations without direct_files.
- Create a detailed search comparison test to evaluate vector search vs hybrid search performance.
- Add a script for reindexing projects to extract code relationships and verify GraphAnalyzer functionality.
- Include a test script to check TreeSitter parser availability and relationship extraction from sample files.
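
A minimal sketch of what such a migration test could look like, assuming pytest and a legacy_db_path fixture pointing at a copy of a pre-v5 _index.db; the fixture name and helper are hypothetical, only the upgrade import path comes from this commit:

import sqlite3

from codexlens.storage.migrations.migration_005_cleanup_unused_fields import upgrade


def _columns(conn, table):
    """Return the column names of a table via PRAGMA table_info."""
    return {row[1] for row in conn.execute(f"PRAGMA table_info({table})")}


def test_migration_005_removes_deprecated_fields(legacy_db_path):
    # legacy_db_path is a hypothetical fixture yielding a v4-schema database copy
    conn = sqlite3.connect(str(legacy_db_path))
    try:
        upgrade(conn)
        assert "keywords" not in _columns(conn, "semantic_metadata")
        assert {"token_count", "symbol_type"}.isdisjoint(_columns(conn, "symbols"))
        assert "direct_files" not in _columns(conn, "subdirs")
    finally:
        conn.close()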
catlog22
2025-12-16 19:27:05 +08:00
parent 3da0ef2adb
commit df23975a0b
61 changed files with 13114 additions and 366 deletions


@@ -27,7 +27,6 @@ class SubdirLink:
name: str
index_path: Path
files_count: int
direct_files: int
last_updated: float
@@ -57,7 +56,7 @@ class DirIndexStore:
# Schema version for migration tracking
# Increment this when schema changes require migration
SCHEMA_VERSION = 4
SCHEMA_VERSION = 5
def __init__(self, db_path: str | Path) -> None:
"""Initialize directory index store.
@@ -133,6 +132,11 @@ class DirIndexStore:
from codexlens.storage.migrations.migration_004_dual_fts import upgrade
upgrade(conn)
# Migration v4 -> v5: Remove unused/redundant fields
if from_version < 5:
from codexlens.storage.migrations.migration_005_cleanup_unused_fields import upgrade
upgrade(conn)
def close(self) -> None:
"""Close database connection."""
with self._lock:
@@ -208,19 +212,17 @@ class DirIndexStore:
# Replace symbols
conn.execute("DELETE FROM symbols WHERE file_id=?", (file_id,))
if symbols:
# Extract token_count and symbol_type from symbol metadata if available
# Insert symbols without token_count and symbol_type
symbol_rows = []
for s in symbols:
token_count = getattr(s, 'token_count', None)
symbol_type = getattr(s, 'symbol_type', None) or s.kind
symbol_rows.append(
(file_id, s.name, s.kind, s.range[0], s.range[1], token_count, symbol_type)
(file_id, s.name, s.kind, s.range[0], s.range[1])
)
conn.executemany(
"""
INSERT INTO symbols(file_id, name, kind, start_line, end_line, token_count, symbol_type)
VALUES(?, ?, ?, ?, ?, ?, ?)
INSERT INTO symbols(file_id, name, kind, start_line, end_line)
VALUES(?, ?, ?, ?, ?)
""",
symbol_rows,
)
@@ -374,19 +376,17 @@ class DirIndexStore:
conn.execute("DELETE FROM symbols WHERE file_id=?", (file_id,))
if symbols:
# Extract token_count and symbol_type from symbol metadata if available
# Insert symbols without token_count and symbol_type
symbol_rows = []
for s in symbols:
token_count = getattr(s, 'token_count', None)
symbol_type = getattr(s, 'symbol_type', None) or s.kind
symbol_rows.append(
(file_id, s.name, s.kind, s.range[0], s.range[1], token_count, symbol_type)
(file_id, s.name, s.kind, s.range[0], s.range[1])
)
conn.executemany(
"""
INSERT INTO symbols(file_id, name, kind, start_line, end_line, token_count, symbol_type)
VALUES(?, ?, ?, ?, ?, ?, ?)
INSERT INTO symbols(file_id, name, kind, start_line, end_line)
VALUES(?, ?, ?, ?, ?)
""",
symbol_rows,
)
@@ -644,25 +644,22 @@ class DirIndexStore:
with self._lock:
conn = self._get_connection()
import json
import time
keywords_json = json.dumps(keywords)
generated_at = time.time()
# Write to semantic_metadata table (for backward compatibility)
# Write to semantic_metadata table (without keywords column)
conn.execute(
"""
INSERT INTO semantic_metadata(file_id, summary, keywords, purpose, llm_tool, generated_at)
VALUES(?, ?, ?, ?, ?, ?)
INSERT INTO semantic_metadata(file_id, summary, purpose, llm_tool, generated_at)
VALUES(?, ?, ?, ?, ?)
ON CONFLICT(file_id) DO UPDATE SET
summary=excluded.summary,
keywords=excluded.keywords,
purpose=excluded.purpose,
llm_tool=excluded.llm_tool,
generated_at=excluded.generated_at
""",
(file_id, summary, keywords_json, purpose, llm_tool, generated_at),
(file_id, summary, purpose, llm_tool, generated_at),
)
# Write to normalized keywords tables for optimized search
@@ -709,9 +706,10 @@ class DirIndexStore:
with self._lock:
conn = self._get_connection()
# Get semantic metadata (without keywords column)
row = conn.execute(
"""
SELECT summary, keywords, purpose, llm_tool, generated_at
SELECT summary, purpose, llm_tool, generated_at
FROM semantic_metadata WHERE file_id=?
""",
(file_id,),
@@ -720,11 +718,23 @@ class DirIndexStore:
if not row:
return None
import json
# Get keywords from normalized file_keywords table
keyword_rows = conn.execute(
"""
SELECT k.keyword
FROM file_keywords fk
JOIN keywords k ON fk.keyword_id = k.id
WHERE fk.file_id = ?
ORDER BY k.keyword
""",
(file_id,),
).fetchall()
keywords = [kw["keyword"] for kw in keyword_rows]
return {
"summary": row["summary"],
"keywords": json.loads(row["keywords"]) if row["keywords"] else [],
"keywords": keywords,
"purpose": row["purpose"],
"llm_tool": row["llm_tool"],
"generated_at": float(row["generated_at"]) if row["generated_at"] else 0.0,
@@ -856,15 +866,14 @@ class DirIndexStore:
Returns:
Tuple of (list of metadata dicts, total count)
"""
import json
with self._lock:
conn = self._get_connection()
# Query semantic metadata without keywords column
base_query = """
SELECT f.id as file_id, f.name as file_name, f.full_path,
f.language, f.line_count,
sm.summary, sm.keywords, sm.purpose,
sm.summary, sm.purpose,
sm.llm_tool, sm.generated_at
FROM files f
JOIN semantic_metadata sm ON f.id = sm.file_id
@@ -892,14 +901,30 @@ class DirIndexStore:
results = []
for row in rows:
file_id = int(row["file_id"])
# Get keywords from normalized file_keywords table
keyword_rows = conn.execute(
"""
SELECT k.keyword
FROM file_keywords fk
JOIN keywords k ON fk.keyword_id = k.id
WHERE fk.file_id = ?
ORDER BY k.keyword
""",
(file_id,),
).fetchall()
keywords = [kw["keyword"] for kw in keyword_rows]
results.append({
"file_id": int(row["file_id"]),
"file_id": file_id,
"file_name": row["file_name"],
"full_path": row["full_path"],
"language": row["language"],
"line_count": int(row["line_count"]) if row["line_count"] else 0,
"summary": row["summary"],
"keywords": json.loads(row["keywords"]) if row["keywords"] else [],
"keywords": keywords,
"purpose": row["purpose"],
"llm_tool": row["llm_tool"],
"generated_at": float(row["generated_at"]) if row["generated_at"] else 0.0,
@@ -922,7 +947,7 @@ class DirIndexStore:
name: Subdirectory name
index_path: Path to subdirectory's _index.db
files_count: Total files recursively
direct_files: Files directly in subdirectory
direct_files: Deprecated parameter (no longer used)
"""
with self._lock:
conn = self._get_connection()
@@ -931,17 +956,17 @@ class DirIndexStore:
import time
last_updated = time.time()
# Note: direct_files parameter is deprecated but kept for backward compatibility
conn.execute(
"""
INSERT INTO subdirs(name, index_path, files_count, direct_files, last_updated)
VALUES(?, ?, ?, ?, ?)
INSERT INTO subdirs(name, index_path, files_count, last_updated)
VALUES(?, ?, ?, ?)
ON CONFLICT(name) DO UPDATE SET
index_path=excluded.index_path,
files_count=excluded.files_count,
direct_files=excluded.direct_files,
last_updated=excluded.last_updated
""",
(name, index_path_str, files_count, direct_files, last_updated),
(name, index_path_str, files_count, last_updated),
)
conn.commit()
@@ -974,7 +999,7 @@ class DirIndexStore:
conn = self._get_connection()
rows = conn.execute(
"""
SELECT id, name, index_path, files_count, direct_files, last_updated
SELECT id, name, index_path, files_count, last_updated
FROM subdirs
ORDER BY name
"""
@@ -986,7 +1011,6 @@ class DirIndexStore:
name=row["name"],
index_path=Path(row["index_path"]),
files_count=int(row["files_count"]) if row["files_count"] else 0,
direct_files=int(row["direct_files"]) if row["direct_files"] else 0,
last_updated=float(row["last_updated"]) if row["last_updated"] else 0.0,
)
for row in rows
@@ -1005,7 +1029,7 @@ class DirIndexStore:
conn = self._get_connection()
row = conn.execute(
"""
SELECT id, name, index_path, files_count, direct_files, last_updated
SELECT id, name, index_path, files_count, last_updated
FROM subdirs WHERE name=?
""",
(name,),
@@ -1019,7 +1043,6 @@ class DirIndexStore:
name=row["name"],
index_path=Path(row["index_path"]),
files_count=int(row["files_count"]) if row["files_count"] else 0,
direct_files=int(row["direct_files"]) if row["direct_files"] else 0,
last_updated=float(row["last_updated"]) if row["last_updated"] else 0.0,
)
@@ -1031,41 +1054,71 @@ class DirIndexStore:
Args:
name: Subdirectory name
files_count: Total files recursively
direct_files: Files directly in subdirectory (optional)
direct_files: Deprecated parameter (no longer used)
"""
with self._lock:
conn = self._get_connection()
import time
last_updated = time.time()
if direct_files is not None:
conn.execute(
"""
UPDATE subdirs
SET files_count=?, direct_files=?, last_updated=?
WHERE name=?
""",
(files_count, direct_files, last_updated, name),
)
else:
conn.execute(
"""
UPDATE subdirs
SET files_count=?, last_updated=?
WHERE name=?
""",
(files_count, last_updated, name),
)
# Note: direct_files parameter is deprecated but kept for backward compatibility
conn.execute(
"""
UPDATE subdirs
SET files_count=?, last_updated=?
WHERE name=?
""",
(files_count, last_updated, name),
)
conn.commit()
# === Search ===
def search_fts(self, query: str, limit: int = 20) -> List[SearchResult]:
@staticmethod
def _enhance_fts_query(query: str) -> str:
"""Enhance FTS5 query to support prefix matching for simple queries.
For simple single-word or multi-word queries without FTS5 operators,
automatically adds prefix wildcard (*) to enable partial matching.
Examples:
"loadPack" -> "loadPack*"
"load package" -> "load* package*"
"load*" -> "load*" (already has wildcard, unchanged)
"NOT test" -> "NOT test" (has FTS operator, unchanged)
Args:
query: Original FTS5 query string
Returns:
Enhanced query string with prefix wildcards for simple queries
"""
# Don't modify if query already contains FTS5 operators or wildcards
if any(op in query.upper() for op in [' AND ', ' OR ', ' NOT ', ' NEAR ', '*', '"']):
return query
# For simple queries, add prefix wildcard to each word
words = query.split()
enhanced_words = [f"{word}*" if not word.endswith('*') else word for word in words]
return ' '.join(enhanced_words)
def search_fts(self, query: str, limit: int = 20, enhance_query: bool = False) -> List[SearchResult]:
"""Full-text search in current directory files.
Uses files_fts_exact (unicode61 tokenizer) for exact token matching.
For fuzzy/substring search, use search_fts_fuzzy() instead.
Best Practice (from industry analysis of Codanna/Code-Index-MCP):
- Default: Respects exact user input without modification
- Users can manually add wildcards (e.g., "loadPack*") for prefix matching
- Automatic enhancement (enhance_query=True) is NOT recommended as it can
violate user intent and bring unwanted noise in results
Args:
query: FTS5 query string
limit: Maximum results to return
enhance_query: If True, automatically add prefix wildcards for simple queries.
Default False to respect exact user input.
Returns:
List of SearchResult objects sorted by relevance
@@ -1073,19 +1126,23 @@ class DirIndexStore:
Raises:
StorageError: If FTS search fails
"""
# Only enhance query if explicitly requested (not default behavior)
# Best practice: Let users control wildcards manually
final_query = self._enhance_fts_query(query) if enhance_query else query
with self._lock:
conn = self._get_connection()
try:
rows = conn.execute(
"""
SELECT rowid, full_path, bm25(files_fts) AS rank,
snippet(files_fts, 2, '[bold red]', '[/bold red]', '...', 20) AS excerpt
FROM files_fts
WHERE files_fts MATCH ?
SELECT rowid, full_path, bm25(files_fts_exact) AS rank,
snippet(files_fts_exact, 2, '[bold red]', '[/bold red]', '...', 20) AS excerpt
FROM files_fts_exact
WHERE files_fts_exact MATCH ?
ORDER BY rank
LIMIT ?
""",
(query, limit),
(final_query, limit),
).fetchall()
except sqlite3.DatabaseError as exc:
raise StorageError(f"FTS search failed: {exc}") from exc
@@ -1249,10 +1306,11 @@ class DirIndexStore:
if kind:
rows = conn.execute(
"""
SELECT name, kind, start_line, end_line
FROM symbols
WHERE name LIKE ? AND kind=?
ORDER BY name
SELECT s.name, s.kind, s.start_line, s.end_line, f.full_path
FROM symbols s
JOIN files f ON s.file_id = f.id
WHERE s.name LIKE ? AND s.kind=?
ORDER BY s.name
LIMIT ?
""",
(pattern, kind, limit),
@@ -1260,10 +1318,11 @@ class DirIndexStore:
else:
rows = conn.execute(
"""
SELECT name, kind, start_line, end_line
FROM symbols
WHERE name LIKE ?
ORDER BY name
SELECT s.name, s.kind, s.start_line, s.end_line, f.full_path
FROM symbols s
JOIN files f ON s.file_id = f.id
WHERE s.name LIKE ?
ORDER BY s.name
LIMIT ?
""",
(pattern, limit),
@@ -1274,6 +1333,7 @@ class DirIndexStore:
name=row["name"],
kind=row["kind"],
range=(row["start_line"], row["end_line"]),
file=row["full_path"],
)
for row in rows
]
@@ -1359,7 +1419,7 @@ class DirIndexStore:
"""
)
# Subdirectories table
# Subdirectories table (v5: removed direct_files)
conn.execute(
"""
CREATE TABLE IF NOT EXISTS subdirs (
@@ -1367,13 +1427,12 @@ class DirIndexStore:
name TEXT NOT NULL UNIQUE,
index_path TEXT NOT NULL,
files_count INTEGER DEFAULT 0,
direct_files INTEGER DEFAULT 0,
last_updated REAL
)
"""
)
# Symbols table
# Symbols table (v5: removed token_count and symbol_type)
conn.execute(
"""
CREATE TABLE IF NOT EXISTS symbols (
@@ -1382,9 +1441,7 @@ class DirIndexStore:
name TEXT NOT NULL,
kind TEXT NOT NULL,
start_line INTEGER,
end_line INTEGER,
token_count INTEGER,
symbol_type TEXT
end_line INTEGER
)
"""
)
@@ -1421,14 +1478,13 @@ class DirIndexStore:
"""
)
# Semantic metadata table
# Semantic metadata table (v5: removed keywords column)
conn.execute(
"""
CREATE TABLE IF NOT EXISTS semantic_metadata (
id INTEGER PRIMARY KEY,
file_id INTEGER UNIQUE REFERENCES files(id) ON DELETE CASCADE,
summary TEXT,
keywords TEXT,
purpose TEXT,
llm_tool TEXT,
generated_at REAL
@@ -1473,13 +1529,12 @@ class DirIndexStore:
"""
)
# Indexes
# Indexes (v5: removed idx_symbols_type)
conn.execute("CREATE INDEX IF NOT EXISTS idx_files_name ON files(name)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_files_path ON files(full_path)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_subdirs_name ON subdirs(name)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_symbols_name ON symbols(name)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_symbols_file ON symbols(file_id)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_symbols_type ON symbols(symbol_type)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_semantic_file ON semantic_metadata(file_id)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_keywords_keyword ON keywords(keyword)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_file_keywords_file_id ON file_keywords(file_id)")


@@ -0,0 +1,188 @@
"""
Migration 005: Remove unused and redundant database fields.
This migration removes four problematic fields identified by Gemini analysis:
1. **semantic_metadata.keywords** (deprecated - replaced by file_keywords table)
- Data: Migrated to normalized file_keywords table in migration 001
- Impact: Column now redundant, remove to prevent sync issues
2. **symbols.token_count** (unused - always NULL)
- Data: Never populated, always NULL
- Impact: No data loss, just removes unused column
3. **symbols.symbol_type** (redundant - duplicates kind)
- Data: Redundant with symbols.kind field
- Impact: No data loss, kind field contains same information
4. **subdirs.direct_files** (unused - never displayed)
- Data: Never used in queries or display logic
- Impact: No data loss, just removes unused column
Schema changes use table recreation pattern (SQLite best practice):
- Create new table without deprecated columns
- Copy data from old table
- Drop old table
- Rename new table
- Recreate indexes
"""
import logging
from sqlite3 import Connection
log = logging.getLogger(__name__)
def upgrade(db_conn: Connection):
"""Remove unused and redundant fields from schema.
Args:
db_conn: The SQLite database connection.
"""
cursor = db_conn.cursor()
try:
cursor.execute("BEGIN TRANSACTION")
# Step 1: Remove semantic_metadata.keywords
log.info("Removing semantic_metadata.keywords column...")
# Check if semantic_metadata table exists
cursor.execute(
"SELECT name FROM sqlite_master WHERE type='table' AND name='semantic_metadata'"
)
if cursor.fetchone():
cursor.execute("""
CREATE TABLE semantic_metadata_new (
id INTEGER PRIMARY KEY AUTOINCREMENT,
file_id INTEGER NOT NULL UNIQUE,
summary TEXT,
purpose TEXT,
llm_tool TEXT,
generated_at REAL,
FOREIGN KEY (file_id) REFERENCES files(id) ON DELETE CASCADE
)
""")
cursor.execute("""
INSERT INTO semantic_metadata_new (id, file_id, summary, purpose, llm_tool, generated_at)
SELECT id, file_id, summary, purpose, llm_tool, generated_at
FROM semantic_metadata
""")
cursor.execute("DROP TABLE semantic_metadata")
cursor.execute("ALTER TABLE semantic_metadata_new RENAME TO semantic_metadata")
# Recreate index
cursor.execute(
"CREATE INDEX IF NOT EXISTS idx_semantic_file ON semantic_metadata(file_id)"
)
log.info("Removed semantic_metadata.keywords column")
else:
log.info("semantic_metadata table does not exist, skipping")
# Step 2: Remove symbols.token_count and symbols.symbol_type
log.info("Removing symbols.token_count and symbols.symbol_type columns...")
# Check if symbols table exists
cursor.execute(
"SELECT name FROM sqlite_master WHERE type='table' AND name='symbols'"
)
if cursor.fetchone():
cursor.execute("""
CREATE TABLE symbols_new (
id INTEGER PRIMARY KEY AUTOINCREMENT,
file_id INTEGER NOT NULL,
name TEXT NOT NULL,
kind TEXT,
start_line INTEGER,
end_line INTEGER,
FOREIGN KEY (file_id) REFERENCES files(id) ON DELETE CASCADE
)
""")
cursor.execute("""
INSERT INTO symbols_new (id, file_id, name, kind, start_line, end_line)
SELECT id, file_id, name, kind, start_line, end_line
FROM symbols
""")
cursor.execute("DROP TABLE symbols")
cursor.execute("ALTER TABLE symbols_new RENAME TO symbols")
# Recreate indexes (excluding idx_symbols_type which indexed symbol_type)
cursor.execute("CREATE INDEX IF NOT EXISTS idx_symbols_file ON symbols(file_id)")
cursor.execute("CREATE INDEX IF NOT EXISTS idx_symbols_name ON symbols(name)")
log.info("Removed symbols.token_count and symbols.symbol_type columns")
else:
log.info("symbols table does not exist, skipping")
# Step 3: Remove subdirs.direct_files
log.info("Removing subdirs.direct_files column...")
# Check if subdirs table exists
cursor.execute(
"SELECT name FROM sqlite_master WHERE type='table' AND name='subdirs'"
)
if cursor.fetchone():
cursor.execute("""
CREATE TABLE subdirs_new (
id INTEGER PRIMARY KEY AUTOINCREMENT,
name TEXT NOT NULL UNIQUE,
index_path TEXT NOT NULL,
files_count INTEGER DEFAULT 0,
last_updated REAL
)
""")
cursor.execute("""
INSERT INTO subdirs_new (id, name, index_path, files_count, last_updated)
SELECT id, name, index_path, files_count, last_updated
FROM subdirs
""")
cursor.execute("DROP TABLE subdirs")
cursor.execute("ALTER TABLE subdirs_new RENAME TO subdirs")
# Recreate index
cursor.execute("CREATE INDEX IF NOT EXISTS idx_subdirs_name ON subdirs(name)")
log.info("Removed subdirs.direct_files column")
else:
log.info("subdirs table does not exist, skipping")
cursor.execute("COMMIT")
log.info("Migration 005 completed successfully")
# Vacuum to reclaim space (outside transaction)
try:
log.info("Running VACUUM to reclaim space...")
cursor.execute("VACUUM")
log.info("VACUUM completed successfully")
except Exception as e:
log.warning(f"VACUUM failed (non-critical): {e}")
except Exception as e:
log.error(f"Migration 005 failed: {e}")
try:
cursor.execute("ROLLBACK")
except Exception:
pass
raise
def downgrade(db_conn: Connection):
"""Restore removed fields (data will be lost for keywords, token_count, symbol_type, direct_files).
This is a placeholder - true downgrade is not feasible as data is lost.
The migration is designed to be one-way since removed fields are unused/redundant.
Args:
db_conn: The SQLite database connection.
"""
log.warning(
"Migration 005 downgrade not supported - removed fields are unused/redundant. "
"Data cannot be restored."
)
raise NotImplementedError(
"Migration 005 downgrade not supported - this is a one-way migration"
)
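
Since each directory keeps its own _index.db and DirIndexStore triggers this upgrade automatically when the stored schema version falls below 5, a standalone bulk run would only be needed for ad-hoc maintenance. A hedged sketch, assuming a hypothetical index root path:

import sqlite3
from pathlib import Path

from codexlens.storage.migrations.migration_005_cleanup_unused_fields import upgrade

# Hypothetical index root; every per-directory _index.db beneath it gets migrated.
index_root = Path("path/to/index/root")

for db_file in index_root.rglob("_index.db"):
    conn = sqlite3.connect(str(db_file))
    try:
        upgrade(conn)  # rebuilds semantic_metadata, symbols, and subdirs without the dropped columns
    finally:
        conn.close()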