Add comprehensive tests for schema cleanup migration and search comparison

- Implement tests for migration 005 to verify removal of deprecated fields from the database schema (a test sketch follows this list).
- Ensure that new databases are created with a clean schema.
- Validate that keywords are correctly extracted from the normalized file_keywords table.
- Test symbol insertion without deprecated fields and subdir operations without direct_files.
- Create a detailed search comparison test to evaluate vector search vs hybrid search performance.
- Add a script for reindexing projects to extract code relationships and verify GraphAnalyzer functionality.
- Include a test script to check TreeSitter parser availability and relationship extraction from sample files.
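
A minimal sketch of what such a migration test could look like, assuming pytest and a legacy_db_path fixture pointing at a copy of a pre-v5 _index.db; the fixture name and helper are hypothetical, only the upgrade import path comes from this commit:

import sqlite3

from codexlens.storage.migrations.migration_005_cleanup_unused_fields import upgrade


def _columns(conn, table):
    """Return the column names of a table via PRAGMA table_info."""
    return {row[1] for row in conn.execute(f"PRAGMA table_info({table})")}


def test_migration_005_removes_deprecated_fields(legacy_db_path):
    # legacy_db_path is a hypothetical fixture yielding a v4-schema database copy
    conn = sqlite3.connect(str(legacy_db_path))
    try:
        upgrade(conn)
        assert "keywords" not in _columns(conn, "semantic_metadata")
        assert {"token_count", "symbol_type"}.isdisjoint(_columns(conn, "symbols"))
        assert "direct_files" not in _columns(conn, "subdirs")
    finally:
        conn.close()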
catlog22
2025-12-16 19:27:05 +08:00
parent 3da0ef2adb
commit df23975a0b
61 changed files with 13114 additions and 366 deletions


@@ -27,7 +27,6 @@ class SubdirLink:
name: str
index_path: Path
files_count: int
direct_files: int
last_updated: float
@@ -57,7 +56,7 @@ class DirIndexStore:
# Schema version for migration tracking
# Increment this when schema changes require migration
SCHEMA_VERSION = 4
SCHEMA_VERSION = 5
def __init__(self, db_path: str | Path) -> None:
"""Initialize directory index store.
@@ -133,6 +132,11 @@ class DirIndexStore:
from codexlens.storage.migrations.migration_004_dual_fts import upgrade
upgrade(conn)
# Migration v4 -> v5: Remove unused/redundant fields
if from_version < 5:
from codexlens.storage.migrations.migration_005_cleanup_unused_fields import upgrade
upgrade(conn)
def close(self) -> None:
"""Close database connection."""
with self._lock:
@@ -208,19 +212,17 @@ class DirIndexStore:
# Replace symbols
conn.execute("DELETE FROM symbols WHERE file_id=?", (file_id,))
if symbols:
# Extract token_count and symbol_type from symbol metadata if available
# Insert symbols without token_count and symbol_type
symbol_rows = []
for s in symbols:
token_count = getattr(s, 'token_count', None)
symbol_type = getattr(s, 'symbol_type', None) or s.kind
symbol_rows.append(
(file_id, s.name, s.kind, s.range[0], s.range[1], token_count, symbol_type)
(file_id, s.name, s.kind, s.range[0], s.range[1])
)
conn.executemany(
"""
INSERT INTO symbols(file_id, name, kind, start_line, end_line, token_count, symbol_type)
VALUES(?, ?, ?, ?, ?, ?, ?)
INSERT INTO symbols(file_id, name, kind, start_line, end_line)
VALUES(?, ?, ?, ?, ?)
""",
symbol_rows,
)
@@ -374,19 +376,17 @@ class DirIndexStore:
conn.execute("DELETE FROM symbols WHERE file_id=?", (file_id,))
if symbols:
# Extract token_count and symbol_type from symbol metadata if available
# Insert symbols without token_count and symbol_type
symbol_rows = []
for s in symbols:
token_count = getattr(s, 'token_count', None)
symbol_type = getattr(s, 'symbol_type', None) or s.kind
symbol_rows.append(
(file_id, s.name, s.kind, s.range[0], s.range[1], token_count, symbol_type)
(file_id, s.name, s.kind, s.range[0], s.range[1])
)
conn.executemany(
"""
INSERT INTO symbols(file_id, name, kind, start_line, end_line, token_count, symbol_type)
VALUES(?, ?, ?, ?, ?, ?, ?)
INSERT INTO symbols(file_id, name, kind, start_line, end_line)
VALUES(?, ?, ?, ?, ?)
""",
symbol_rows,
)
@@ -644,25 +644,22 @@ class DirIndexStore:
with self._lock:
conn = self._get_connection()
import json
import time
keywords_json = json.dumps(keywords)
generated_at = time.time()
# Write to semantic_metadata table (for backward compatibility)
# Write to semantic_metadata table (without keywords column)
conn.execute(
"""
INSERT INTO semantic_metadata(file_id, summary, keywords, purpose, llm_tool, generated_at)
VALUES(?, ?, ?, ?, ?, ?)
INSERT INTO semantic_metadata(file_id, summary, purpose, llm_tool, generated_at)
VALUES(?, ?, ?, ?, ?)
ON CONFLICT(file_id) DO UPDATE SET
summary=excluded.summary,
keywords=excluded.keywords,
purpose=excluded.purpose,
llm_tool=excluded.llm_tool,
generated_at=excluded.generated_at
""",
(file_id, summary, keywords_json, purpose, llm_tool, generated_at),
(file_id, summary, purpose, llm_tool, generated_at),
)
# Write to normalized keywords tables for optimized search
@@ -709,9 +706,10 @@ class DirIndexStore:
with self._lock:
conn = self._get_connection()
# Get semantic metadata (without keywords column)
row = conn.execute(
"""
SELECT summary, keywords, purpose, llm_tool, generated_at
SELECT summary, purpose, llm_tool, generated_at
FROM semantic_metadata WHERE file_id=?
""",
(file_id,),
@@ -720,11 +718,23 @@ class DirIndexStore:
if not row:
return None
import json
# Get keywords from normalized file_keywords table
keyword_rows = conn.execute(
"""
SELECT k.keyword
FROM file_keywords fk
JOIN keywords k ON fk.keyword_id = k.id
WHERE fk.file_id = ?
ORDER BY k.keyword
""",
(file_id,),
).fetchall()
keywords = [kw["keyword"] for kw in keyword_rows]
return {
"summary": row["summary"],
"keywords": json.loads(row["keywords"]) if row["keywords"] else [],
"keywords": keywords,
"purpose": row["purpose"],
"llm_tool": row["llm_tool"],
"generated_at": float(row["generated_at"]) if row["generated_at"] else 0.0,
@@ -856,15 +866,14 @@ class DirIndexStore:
Returns:
Tuple of (list of metadata dicts, total count)
"""
import json
with self._lock:
conn = self._get_connection()
# Query semantic metadata without keywords column
base_query = """
SELECT f.id as file_id, f.name as file_name, f.full_path,
f.language, f.line_count,
sm.summary, sm.keywords, sm.purpose,
sm.summary, sm.purpose,
sm.llm_tool, sm.generated_at
FROM files f
JOIN semantic_metadata sm ON f.id = sm.file_id
@@ -892,14 +901,30 @@ class DirIndexStore:
results = []
for row in rows:
file_id = int(row["file_id"])
# Get keywords from normalized file_keywords table
keyword_rows = conn.execute(
"""
SELECT k.keyword
FROM file_keywords fk
JOIN keywords k ON fk.keyword_id = k.id
WHERE fk.file_id = ?
ORDER BY k.keyword
""",
(file_id,),
).fetchall()
keywords = [kw["keyword"] for kw in keyword_rows]
results.append({
"file_id": int(row["file_id"]),
"file_id": file_id,
"file_name": row["file_name"],
"full_path": row["full_path"],
"language": row["language"],
"line_count": int(row["line_count"]) if row["line_count"] else 0,
"summary": row["summary"],
"keywords": json.loads(row["keywords"]) if row["keywords"] else [],
"keywords": keywords,
"purpose": row["purpose"],
"llm_tool": row["llm_tool"],
"generated_at": float(row["generated_at"]) if row["generated_at"] else 0.0,
@@ -922,7 +947,7 @@ class DirIndexStore:
name: Subdirectory name
index_path: Path to subdirectory's _index.db
files_count: Total files recursively
direct_files: Files directly in subdirectory
direct_files: Deprecated parameter (no longer used)
"""
with self._lock:
conn = self._get_connection()
@@ -931,17 +956,17 @@ class DirIndexStore:
import time
last_updated = time.time()
# Note: direct_files parameter is deprecated but kept for backward compatibility
conn.execute(
"""
INSERT INTO subdirs(name, index_path, files_count, direct_files, last_updated)
VALUES(?, ?, ?, ?, ?)
INSERT INTO subdirs(name, index_path, files_count, last_updated)
VALUES(?, ?, ?, ?)
ON CONFLICT(name) DO UPDATE SET
index_path=excluded.index_path,
files_count=excluded.files_count,
direct_files=excluded.direct_files,
last_updated=excluded.last_updated
""",
(name, index_path_str, files_count, direct_files, last_updated),
(name, index_path_str, files_count, last_updated),
)
conn.commit()
@@ -974,7 +999,7 @@ class DirIndexStore:
conn = self._get_connection()
rows = conn.execute(
"""
SELECT id, name, index_path, files_count, direct_files, last_updated
SELECT id, name, index_path, files_count, last_updated
FROM subdirs
ORDER BY name
"""
@@ -986,7 +1011,6 @@ class DirIndexStore:
name=row["name"],
index_path=Path(row["index_path"]),
files_count=int(row["files_count"]) if row["files_count"] else 0,
direct_files=int(row["direct_files"]) if row["direct_files"] else 0,
last_updated=float(row["last_updated"]) if row["last_updated"] else 0.0,
)
for row in rows
@@ -1005,7 +1029,7 @@ class DirIndexStore:
conn = self._get_connection()
row = conn.execute(
"""
SELECT id, name, index_path, files_count, direct_files, last_updated
SELECT id, name, index_path, files_count, last_updated
FROM subdirs WHERE name=?
""",
(name,),
@@ -1019,7 +1043,6 @@ class DirIndexStore:
name=row["name"],
index_path=Path(row["index_path"]),
files_count=int(row["files_count"]) if row["files_count"] else 0,
direct_files=int(row["direct_files"]) if row["direct_files"] else 0,
last_updated=float(row["last_updated"]) if row["last_updated"] else 0.0,
)
@@ -1031,41 +1054,71 @@ class DirIndexStore:
Args:
name: Subdirectory name
files_count: Total files recursively
direct_files: Files directly in subdirectory (optional)
direct_files: Deprecated parameter (no longer used)
"""
with self._lock:
conn = self._get_connection()
import time
last_updated = time.time()
if direct_files is not None:
conn.execute(
"""
UPDATE subdirs
SET files_count=?, direct_files=?, last_updated=?
WHERE name=?
""",
(files_count, direct_files, last_updated, name),
)
else:
conn.execute(
"""
UPDATE subdirs
SET files_count=?, last_updated=?
WHERE name=?
""",
(files_count, last_updated, name),
)
# Note: direct_files parameter is deprecated but kept for backward compatibility
conn.execute(
"""
UPDATE subdirs
SET files_count=?, last_updated=?
WHERE name=?
""",
(files_count, last_updated, name),
)
conn.commit()
# === Search ===
def search_fts(self, query: str, limit: int = 20) -> List[SearchResult]:
@staticmethod
def _enhance_fts_query(query: str) -> str:
"""Enhance FTS5 query to support prefix matching for simple queries.
For simple single-word or multi-word queries without FTS5 operators,
automatically adds prefix wildcard (*) to enable partial matching.
Examples:
"loadPack" -> "loadPack*"
"load package" -> "load* package*"
"load*" -> "load*" (already has wildcard, unchanged)
"NOT test" -> "NOT test" (has FTS operator, unchanged)
Args:
query: Original FTS5 query string
Returns:
Enhanced query string with prefix wildcards for simple queries
"""
# Don't modify if query already contains FTS5 operators or wildcards
if any(op in query.upper() for op in [' AND ', ' OR ', ' NOT ', ' NEAR ', '*', '"']):
return query
# For simple queries, add prefix wildcard to each word
words = query.split()
enhanced_words = [f"{word}*" if not word.endswith('*') else word for word in words]
return ' '.join(enhanced_words)
def search_fts(self, query: str, limit: int = 20, enhance_query: bool = False) -> List[SearchResult]:
"""Full-text search in current directory files.
Uses files_fts_exact (unicode61 tokenizer) for exact token matching.
For fuzzy/substring search, use search_fts_fuzzy() instead.
Best Practice (from industry analysis of Codanna/Code-Index-MCP):
- Default: Respects exact user input without modification
- Users can manually add wildcards (e.g., "loadPack*") for prefix matching
- Automatic enhancement (enhance_query=True) is NOT recommended as it can
violate user intent and bring unwanted noise in results
Args:
query: FTS5 query string
limit: Maximum results to return
enhance_query: If True, automatically add prefix wildcards for simple queries.
Default False to respect exact user input.
Returns:
List of SearchResult objects sorted by relevance
@@ -1073,19 +1126,23 @@ class DirIndexStore:
Raises:
StorageError: If FTS search fails
"""
# Only enhance query if explicitly requested (not default behavior)
# Best practice: Let users control wildcards manually
final_query = self._enhance_fts_query(query) if enhance_query else query
with self._lock:
conn = self._get_connection()
try:
rows = conn.execute(
"""
SELECT rowid, full_path, bm25(files_fts) AS rank,
snippet(files_fts, 2, '[bold red]', '[/bold red]', '...', 20) AS excerpt
FROM files_fts
WHERE files_fts MATCH ?
SELECT rowid, full_path, bm25(files_fts_exact) AS rank,
snippet(files_fts_exact, 2, '[bold red]', '[/bold red]', '...', 20) AS excerpt
FROM files_fts_exact
WHERE files_fts_exact MATCH ?
ORDER BY rank
LIMIT ?
""",
(query, limit),
(final_query, limit),
).fetchall()
except sqlite3.DatabaseError as exc:
raise StorageError(f"FTS search failed: {exc}") from exc
@@ -1249,10 +1306,11 @@ class DirIndexStore:
if kind:
rows = conn.execute(
"""
SELECT name, kind, start_line, end_line
FROM symbols
WHERE name LIKE ? AND kind=?
ORDER BY name
SELECT s.name, s.kind, s.start_line, s.end_line, f.full_path
FROM symbols s
JOIN files f ON s.file_id = f.id
WHERE s.name LIKE ? AND s.kind=?
ORDER BY s.name
LIMIT ?
""",
(pattern, kind, limit),
@@ -1260,10 +1318,11 @@ class DirIndexStore:
else:
rows = conn.execute(
"""
SELECT name, kind, start_line, end_line
FROM symbols
WHERE name LIKE ?
ORDER BY name
SELECT s.name, s.kind, s.start_line, s.end_line, f.full_path
FROM symbols s
JOIN files f ON s.file_id = f.id
WHERE s.name LIKE ?
ORDER BY s.name
LIMIT ?
""",
(pattern, limit),
@@ -1274,6 +1333,7 @@ class DirIndexStore:
name=row["name"],
kind=row["kind"],
range=(row["start_line"], row["end_line"]),
file=row["full_path"],
)
for row in rows
]
@@ -1359,7 +1419,7 @@ class DirIndexStore:
"""
)
# Subdirectories table
# Subdirectories table (v5: removed direct_files)
conn.execute(
"""
CREATE TABLE IF NOT EXISTS subdirs (
@@ -1367,13 +1427,12 @@ class DirIndexStore:
name TEXT NOT NULL UNIQUE,
index_path TEXT NOT NULL,
files_count INTEGER DEFAULT 0,
direct_files INTEGER DEFAULT 0,
last_updated REAL
)
"""
)
# Symbols table
# Symbols table (v5: removed token_count and symbol_type)
conn.execute(
"""
CREATE TABLE IF NOT EXISTS symbols (
@@ -1382,9 +1441,7 @@ class DirIndexStore:
name TEXT NOT NULL,
kind TEXT NOT NULL,
start_line INTEGER,
end_line INTEGER,
token_count INTEGER,
symbol_type TEXT
end_line INTEGER
)
"""
)
@@ -1421,14 +1478,13 @@ class DirIndexStore:
"""
)
# Semantic metadata table
# Semantic metadata table (v5: removed keywords column)
conn.execute(
"""
CREATE TABLE IF NOT EXISTS semantic_metadata (
id INTEGER PRIMARY KEY,
file_id INTEGER UNIQUE REFERENCES files(id) ON DELETE CASCADE,
summary TEXT,
keywords TEXT,
purpose TEXT,
llm_tool TEXT,
generated_at REAL
@@ -1473,13 +1529,12 @@ class DirIndexStore:
"""
)
# Indexes
# Indexes (v5: removed idx_symbols_type)
conn.execute("CREATE INDEX IF NOT EXISTS idx_files_name ON files(name)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_files_path ON files(full_path)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_subdirs_name ON subdirs(name)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_symbols_name ON symbols(name)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_symbols_file ON symbols(file_id)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_symbols_type ON symbols(symbol_type)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_semantic_file ON semantic_metadata(file_id)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_keywords_keyword ON keywords(keyword)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_file_keywords_file_id ON file_keywords(file_id)")


@@ -0,0 +1,188 @@
"""
Migration 005: Remove unused and redundant database fields.
This migration removes four problematic fields identified by Gemini analysis:
1. **semantic_metadata.keywords** (deprecated - replaced by file_keywords table)
- Data: Migrated to normalized file_keywords table in migration 001
- Impact: Column now redundant, remove to prevent sync issues
2. **symbols.token_count** (unused - always NULL)
- Data: Never populated, always NULL
- Impact: No data loss, just removes unused column
3. **symbols.symbol_type** (redundant - duplicates kind)
- Data: Redundant with symbols.kind field
- Impact: No data loss, kind field contains same information
4. **subdirs.direct_files** (unused - never displayed)
- Data: Never used in queries or display logic
- Impact: No data loss, just removes unused column
Schema changes use table recreation pattern (SQLite best practice):
- Create new table without deprecated columns
- Copy data from old table
- Drop old table
- Rename new table
- Recreate indexes
"""
import logging
from sqlite3 import Connection
log = logging.getLogger(__name__)
def upgrade(db_conn: Connection):
"""Remove unused and redundant fields from schema.
Args:
db_conn: The SQLite database connection.
"""
cursor = db_conn.cursor()
try:
cursor.execute("BEGIN TRANSACTION")
# Step 1: Remove semantic_metadata.keywords
log.info("Removing semantic_metadata.keywords column...")
# Check if semantic_metadata table exists
cursor.execute(
"SELECT name FROM sqlite_master WHERE type='table' AND name='semantic_metadata'"
)
if cursor.fetchone():
cursor.execute("""
CREATE TABLE semantic_metadata_new (
id INTEGER PRIMARY KEY AUTOINCREMENT,
file_id INTEGER NOT NULL UNIQUE,
summary TEXT,
purpose TEXT,
llm_tool TEXT,
generated_at REAL,
FOREIGN KEY (file_id) REFERENCES files(id) ON DELETE CASCADE
)
""")
cursor.execute("""
INSERT INTO semantic_metadata_new (id, file_id, summary, purpose, llm_tool, generated_at)
SELECT id, file_id, summary, purpose, llm_tool, generated_at
FROM semantic_metadata
""")
cursor.execute("DROP TABLE semantic_metadata")
cursor.execute("ALTER TABLE semantic_metadata_new RENAME TO semantic_metadata")
# Recreate index
cursor.execute(
"CREATE INDEX IF NOT EXISTS idx_semantic_file ON semantic_metadata(file_id)"
)
log.info("Removed semantic_metadata.keywords column")
else:
log.info("semantic_metadata table does not exist, skipping")
# Step 2: Remove symbols.token_count and symbols.symbol_type
log.info("Removing symbols.token_count and symbols.symbol_type columns...")
# Check if symbols table exists
cursor.execute(
"SELECT name FROM sqlite_master WHERE type='table' AND name='symbols'"
)
if cursor.fetchone():
cursor.execute("""
CREATE TABLE symbols_new (
id INTEGER PRIMARY KEY AUTOINCREMENT,
file_id INTEGER NOT NULL,
name TEXT NOT NULL,
kind TEXT,
start_line INTEGER,
end_line INTEGER,
FOREIGN KEY (file_id) REFERENCES files(id) ON DELETE CASCADE
)
""")
cursor.execute("""
INSERT INTO symbols_new (id, file_id, name, kind, start_line, end_line)
SELECT id, file_id, name, kind, start_line, end_line
FROM symbols
""")
cursor.execute("DROP TABLE symbols")
cursor.execute("ALTER TABLE symbols_new RENAME TO symbols")
# Recreate indexes (excluding idx_symbols_type which indexed symbol_type)
cursor.execute("CREATE INDEX IF NOT EXISTS idx_symbols_file ON symbols(file_id)")
cursor.execute("CREATE INDEX IF NOT EXISTS idx_symbols_name ON symbols(name)")
log.info("Removed symbols.token_count and symbols.symbol_type columns")
else:
log.info("symbols table does not exist, skipping")
# Step 3: Remove subdirs.direct_files
log.info("Removing subdirs.direct_files column...")
# Check if subdirs table exists
cursor.execute(
"SELECT name FROM sqlite_master WHERE type='table' AND name='subdirs'"
)
if cursor.fetchone():
cursor.execute("""
CREATE TABLE subdirs_new (
id INTEGER PRIMARY KEY AUTOINCREMENT,
name TEXT NOT NULL UNIQUE,
index_path TEXT NOT NULL,
files_count INTEGER DEFAULT 0,
last_updated REAL
)
""")
cursor.execute("""
INSERT INTO subdirs_new (id, name, index_path, files_count, last_updated)
SELECT id, name, index_path, files_count, last_updated
FROM subdirs
""")
cursor.execute("DROP TABLE subdirs")
cursor.execute("ALTER TABLE subdirs_new RENAME TO subdirs")
# Recreate index
cursor.execute("CREATE INDEX IF NOT EXISTS idx_subdirs_name ON subdirs(name)")
log.info("Removed subdirs.direct_files column")
else:
log.info("subdirs table does not exist, skipping")
cursor.execute("COMMIT")
log.info("Migration 005 completed successfully")
# Vacuum to reclaim space (outside transaction)
try:
log.info("Running VACUUM to reclaim space...")
cursor.execute("VACUUM")
log.info("VACUUM completed successfully")
except Exception as e:
log.warning(f"VACUUM failed (non-critical): {e}")
except Exception as e:
log.error(f"Migration 005 failed: {e}")
try:
cursor.execute("ROLLBACK")
except Exception:
pass
raise
def downgrade(db_conn: Connection):
"""Restore removed fields (data will be lost for keywords, token_count, symbol_type, direct_files).
This is a placeholder - true downgrade is not feasible as data is lost.
The migration is designed to be one-way since removed fields are unused/redundant.
Args:
db_conn: The SQLite database connection.
"""
log.warning(
"Migration 005 downgrade not supported - removed fields are unused/redundant. "
"Data cannot be restored."
)
raise NotImplementedError(
"Migration 005 downgrade not supported - this is a one-way migration"
)
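
Since each directory keeps its own _index.db and DirIndexStore triggers this upgrade automatically when the stored schema version falls below 5, a standalone bulk run would only be needed for ad-hoc maintenance. A hedged sketch, assuming a hypothetical index root path:

import sqlite3
from pathlib import Path

from codexlens.storage.migrations.migration_005_cleanup_unused_fields import upgrade

# Hypothetical index root; every per-directory _index.db beneath it gets migrated.
index_root = Path("path/to/index/root")

for db_file in index_root.rglob("_index.db"):
    conn = sqlite3.connect(str(db_file))
    try:
        upgrade(conn)  # rebuilds semantic_metadata, symbols, and subdirs without the dropped columns
    finally:
        conn.close()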