Mirror of https://github.com/catlog22/Claude-Code-Workflow.git (synced 2026-02-13 02:41:50 +08:00)

Commit: Refactor code structure and remove redundant changes
@@ -0,0 +1 @@
# This file makes the 'migrations' directory a Python package.
@@ -0,0 +1,123 @@
"""
Migration 001: Normalize keywords into separate tables.

This migration introduces two new tables, `keywords` and `file_keywords`, to
store semantic keywords in a normalized fashion. It then migrates existing
keywords from the `keywords` JSON column of the `semantic_metadata` table into
these new tables. This is intended to speed up keyword-based searches
significantly.
"""

import json
import logging
from sqlite3 import Connection

log = logging.getLogger(__name__)


def upgrade(db_conn: Connection):
    """
    Applies the migration to normalize keywords.

    - Creates `keywords` and `file_keywords` tables.
    - Creates indexes for efficient querying.
    - Migrates data from `semantic_metadata.keywords` to the new tables.

    Args:
        db_conn: The SQLite database connection.
    """
    cursor = db_conn.cursor()

    log.info("Creating 'keywords' and 'file_keywords' tables...")
    # Create a table to store unique keywords
    cursor.execute(
        """
        CREATE TABLE IF NOT EXISTS keywords (
            id INTEGER PRIMARY KEY,
            keyword TEXT NOT NULL UNIQUE
        )
        """
    )

    # Create a join table to link files and keywords (many-to-many)
    cursor.execute(
        """
        CREATE TABLE IF NOT EXISTS file_keywords (
            file_id INTEGER NOT NULL,
            keyword_id INTEGER NOT NULL,
            PRIMARY KEY (file_id, keyword_id),
            FOREIGN KEY (file_id) REFERENCES files (id) ON DELETE CASCADE,
            FOREIGN KEY (keyword_id) REFERENCES keywords (id) ON DELETE CASCADE
        )
        """
    )

    log.info("Creating indexes for new keyword tables...")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_keywords_keyword ON keywords (keyword)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_file_keywords_file_id ON file_keywords (file_id)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_file_keywords_keyword_id ON file_keywords (keyword_id)")

    log.info("Migrating existing keywords from 'semantic_metadata' table...")

    # Check if semantic_metadata table exists before querying
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='semantic_metadata'")
    if not cursor.fetchone():
        log.info("No 'semantic_metadata' table found, skipping data migration.")
        return

    # Check if 'keywords' column exists in semantic_metadata table
    # (current schema may already use normalized tables without this column)
    cursor.execute("PRAGMA table_info(semantic_metadata)")
    columns = {row[1] for row in cursor.fetchall()}
    if "keywords" not in columns:
        log.info("No 'keywords' column in semantic_metadata table, skipping data migration.")
        return

    cursor.execute("SELECT file_id, keywords FROM semantic_metadata WHERE keywords IS NOT NULL AND keywords != ''")

    files_to_migrate = cursor.fetchall()
    if not files_to_migrate:
        log.info("No existing files with semantic metadata to migrate.")
        return

    log.info(f"Found {len(files_to_migrate)} files with semantic metadata to migrate.")

    for file_id, keywords_json in files_to_migrate:
        if not keywords_json:
            continue
        try:
            keywords = json.loads(keywords_json)

            if not isinstance(keywords, list):
                log.warning(f"Keywords for file_id {file_id} is not a list, skipping.")
                continue

            for keyword in keywords:
                if not isinstance(keyword, str):
                    log.warning(f"Non-string keyword '{keyword}' found for file_id {file_id}, skipping.")
                    continue

                keyword = keyword.strip()
                if not keyword:
                    continue

                # Get or create keyword_id
                cursor.execute("INSERT OR IGNORE INTO keywords (keyword) VALUES (?)", (keyword,))
                cursor.execute("SELECT id FROM keywords WHERE keyword = ?", (keyword,))
                keyword_id_result = cursor.fetchone()

                if keyword_id_result:
                    keyword_id = keyword_id_result[0]
                    # Link file to keyword
                    cursor.execute(
                        "INSERT OR IGNORE INTO file_keywords (file_id, keyword_id) VALUES (?, ?)",
                        (file_id, keyword_id),
                    )
                else:
                    log.error(f"Failed to retrieve or create keyword_id for keyword: {keyword}")

        except json.JSONDecodeError as e:
            log.warning(f"Could not parse keywords for file_id {file_id}: {e}")
        except Exception as e:
            log.error(f"An unexpected error occurred during migration for file_id {file_id}: {e}", exc_info=True)

    log.info("Finished migrating keywords.")
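Example (not part of the repository): with the normalized tables in place, a keyword lookup becomes a join instead of a scan over JSON blobs. A minimal sketch, assuming only the schema created above; the helper name is illustrative.

# Hedged sketch: return ids of files tagged with a given keyword.
def files_for_keyword(db_conn, keyword: str) -> list[int]:
    cursor = db_conn.cursor()
    cursor.execute(
        """
        SELECT fk.file_id
        FROM keywords AS k
        JOIN file_keywords AS fk ON fk.keyword_id = k.id
        WHERE k.keyword = ?
        """,
        (keyword,),
    )
    return [row[0] for row in cursor.fetchall()]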
@@ -0,0 +1,48 @@
"""
Migration 002: Add token_count and symbol_type to symbols table.

This migration adds token counting metadata to symbols for accurate chunk
splitting and performance optimization. It also adds symbol_type for better
filtering in searches.
"""

import logging
from sqlite3 import Connection

log = logging.getLogger(__name__)


def upgrade(db_conn: Connection):
    """
    Applies the migration to add token metadata to symbols.

    - Adds token_count column to symbols table
    - Adds symbol_type column to symbols table (for future use)
    - Creates index on symbol_type for efficient filtering
    - Backfills existing symbols with NULL token_count (to be calculated lazily)

    Args:
        db_conn: The SQLite database connection.
    """
    cursor = db_conn.cursor()

    log.info("Adding token_count column to symbols table...")
    try:
        cursor.execute("ALTER TABLE symbols ADD COLUMN token_count INTEGER")
        log.info("Successfully added token_count column.")
    except Exception as e:
        # Column might already exist
        log.warning(f"Could not add token_count column (might already exist): {e}")

    log.info("Adding symbol_type column to symbols table...")
    try:
        cursor.execute("ALTER TABLE symbols ADD COLUMN symbol_type TEXT")
        log.info("Successfully added symbol_type column.")
    except Exception as e:
        # Column might already exist
        log.warning(f"Could not add symbol_type column (might already exist): {e}")

    log.info("Creating index on symbol_type for efficient filtering...")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_symbols_type ON symbols(symbol_type)")

    log.info("Migration 002 completed successfully.")
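Example (not part of the repository): the token_count column added above starts out NULL and is meant to be filled in lazily. A rough sketch of such a backfill, using a naive whitespace count as a stand-in for whatever tokenizer the indexer actually uses, and assuming the symbols(file_id, start_line, end_line) and files(id, content) columns used elsewhere in these migrations.

def backfill_token_counts(db_conn) -> None:
    cursor = db_conn.cursor()
    rows = cursor.execute(
        """
        SELECT s.id, f.content, s.start_line, s.end_line
        FROM symbols AS s JOIN files AS f ON f.id = s.file_id
        WHERE s.token_count IS NULL AND f.content IS NOT NULL
        """
    ).fetchall()
    for symbol_id, content, start_line, end_line in rows:
        # Whitespace tokens are only an approximation of real token counts.
        lines = content.splitlines()[start_line - 1:end_line]
        token_count = sum(len(line.split()) for line in lines)
        cursor.execute("UPDATE symbols SET token_count = ? WHERE id = ?", (token_count, symbol_id))
    db_conn.commit()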
@@ -0,0 +1,232 @@
"""
Migration 004: Add dual FTS tables for exact and fuzzy matching.

This migration introduces two FTS5 tables:
- files_fts_exact: Uses unicode61 tokenizer for exact token matching
- files_fts_fuzzy: Uses trigram tokenizer (or extended unicode61) for substring/fuzzy matching

Both tables are synchronized with the files table via triggers for automatic updates.
"""

import logging
from sqlite3 import Connection

from codexlens.storage.sqlite_utils import check_trigram_support, get_sqlite_version

log = logging.getLogger(__name__)


def upgrade(db_conn: Connection):
    """
    Applies the migration to add dual FTS tables.

    - Drops old files_fts table and triggers
    - Creates files_fts_exact with unicode61 tokenizer
    - Creates files_fts_fuzzy with trigram or extended unicode61 tokenizer
    - Creates synchronized triggers for both tables
    - Rebuilds FTS indexes from files table

    Args:
        db_conn: The SQLite database connection.
    """
    cursor = db_conn.cursor()

    try:
        # Check trigram support
        has_trigram = check_trigram_support(db_conn)
        version = get_sqlite_version(db_conn)
        log.info(f"SQLite version: {'.'.join(map(str, version))}")

        if has_trigram:
            log.info("Trigram tokenizer available, using for fuzzy FTS table")
            fuzzy_tokenizer = "trigram"
        else:
            log.warning(
                "Trigram tokenizer not available (requires SQLite >= 3.34), "
                "using extended unicode61 tokenizer for fuzzy matching"
            )
            fuzzy_tokenizer = "unicode61 tokenchars '_-.'"

        # Start transaction
        cursor.execute("BEGIN TRANSACTION")

        # Check if files table has 'name' column (v2 schema doesn't have it)
        cursor.execute("PRAGMA table_info(files)")
        columns = {row[1] for row in cursor.fetchall()}

        if 'name' not in columns:
            log.info("Adding 'name' column to files table (v2 schema upgrade)...")
            # Add name column
            cursor.execute("ALTER TABLE files ADD COLUMN name TEXT")
            # Populate name from path (extract filename from last '/')
            # Use Python to do the extraction since SQLite doesn't have reverse()
            cursor.execute("SELECT rowid, path FROM files")
            rows = cursor.fetchall()
            for rowid, path in rows:
                # Extract filename from path
                name = path.split('/')[-1] if '/' in path else path
                cursor.execute("UPDATE files SET name = ? WHERE rowid = ?", (name, rowid))

        # Rename 'path' column to 'full_path' if needed
        if 'path' in columns and 'full_path' not in columns:
            log.info("Renaming 'path' to 'full_path' (v2 schema upgrade)...")
            # Check if indexed_at column exists in v2 schema
            has_indexed_at = 'indexed_at' in columns
            has_mtime = 'mtime' in columns

            # SQLite doesn't support RENAME COLUMN before 3.25, so use table recreation
            cursor.execute("""
                CREATE TABLE files_new (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    name TEXT NOT NULL,
                    full_path TEXT NOT NULL UNIQUE,
                    content TEXT,
                    language TEXT,
                    mtime REAL,
                    indexed_at TEXT
                )
            """)

            # Build INSERT statement based on available columns
            # Note: v2 schema has no rowid (path is PRIMARY KEY), so use NULL for AUTOINCREMENT
            if has_indexed_at and has_mtime:
                cursor.execute("""
                    INSERT INTO files_new (name, full_path, content, language, mtime, indexed_at)
                    SELECT name, path, content, language, mtime, indexed_at FROM files
                """)
            elif has_indexed_at:
                cursor.execute("""
                    INSERT INTO files_new (name, full_path, content, language, indexed_at)
                    SELECT name, path, content, language, indexed_at FROM files
                """)
            elif has_mtime:
                cursor.execute("""
                    INSERT INTO files_new (name, full_path, content, language, mtime)
                    SELECT name, path, content, language, mtime FROM files
                """)
            else:
                cursor.execute("""
                    INSERT INTO files_new (name, full_path, content, language)
                    SELECT name, path, content, language FROM files
                """)

            cursor.execute("DROP TABLE files")
            cursor.execute("ALTER TABLE files_new RENAME TO files")

        log.info("Dropping old FTS triggers and table...")
        # Drop old triggers
        cursor.execute("DROP TRIGGER IF EXISTS files_ai")
        cursor.execute("DROP TRIGGER IF EXISTS files_ad")
        cursor.execute("DROP TRIGGER IF EXISTS files_au")

        # Drop old FTS table
        cursor.execute("DROP TABLE IF EXISTS files_fts")

        # Create exact FTS table (unicode61 with underscores/hyphens/dots as token chars)
        # Note: tokenchars includes '.' to properly tokenize qualified names like PortRole.FLOW
        log.info("Creating files_fts_exact table with unicode61 tokenizer...")
        cursor.execute(
            """
            CREATE VIRTUAL TABLE files_fts_exact USING fts5(
                name, full_path UNINDEXED, content,
                content='files',
                content_rowid='id',
                tokenize="unicode61 tokenchars '_-.'"
            )
            """
        )

        # Create fuzzy FTS table (trigram or extended unicode61)
        log.info(f"Creating files_fts_fuzzy table with {fuzzy_tokenizer} tokenizer...")
        cursor.execute(
            f"""
            CREATE VIRTUAL TABLE files_fts_fuzzy USING fts5(
                name, full_path UNINDEXED, content,
                content='files',
                content_rowid='id',
                tokenize="{fuzzy_tokenizer}"
            )
            """
        )

        # Create synchronized triggers for files_fts_exact
        log.info("Creating triggers for files_fts_exact...")
        cursor.execute(
            """
            CREATE TRIGGER files_exact_ai AFTER INSERT ON files BEGIN
                INSERT INTO files_fts_exact(rowid, name, full_path, content)
                VALUES(new.id, new.name, new.full_path, new.content);
            END
            """
        )
        cursor.execute(
            """
            CREATE TRIGGER files_exact_ad AFTER DELETE ON files BEGIN
                INSERT INTO files_fts_exact(files_fts_exact, rowid, name, full_path, content)
                VALUES('delete', old.id, old.name, old.full_path, old.content);
            END
            """
        )
        cursor.execute(
            """
            CREATE TRIGGER files_exact_au AFTER UPDATE ON files BEGIN
                INSERT INTO files_fts_exact(files_fts_exact, rowid, name, full_path, content)
                VALUES('delete', old.id, old.name, old.full_path, old.content);
                INSERT INTO files_fts_exact(rowid, name, full_path, content)
                VALUES(new.id, new.name, new.full_path, new.content);
            END
            """
        )

        # Create synchronized triggers for files_fts_fuzzy
        log.info("Creating triggers for files_fts_fuzzy...")
        cursor.execute(
            """
            CREATE TRIGGER files_fuzzy_ai AFTER INSERT ON files BEGIN
                INSERT INTO files_fts_fuzzy(rowid, name, full_path, content)
                VALUES(new.id, new.name, new.full_path, new.content);
            END
            """
        )
        cursor.execute(
            """
            CREATE TRIGGER files_fuzzy_ad AFTER DELETE ON files BEGIN
                INSERT INTO files_fts_fuzzy(files_fts_fuzzy, rowid, name, full_path, content)
                VALUES('delete', old.id, old.name, old.full_path, old.content);
            END
            """
        )
        cursor.execute(
            """
            CREATE TRIGGER files_fuzzy_au AFTER UPDATE ON files BEGIN
                INSERT INTO files_fts_fuzzy(files_fts_fuzzy, rowid, name, full_path, content)
                VALUES('delete', old.id, old.name, old.full_path, old.content);
                INSERT INTO files_fts_fuzzy(rowid, name, full_path, content)
                VALUES(new.id, new.name, new.full_path, new.content);
            END
            """
        )

        # Rebuild FTS indexes from files table
        log.info("Rebuilding FTS indexes from files table...")
        cursor.execute("INSERT INTO files_fts_exact(files_fts_exact) VALUES('rebuild')")
        cursor.execute("INSERT INTO files_fts_fuzzy(files_fts_fuzzy) VALUES('rebuild')")

        # Commit transaction
        cursor.execute("COMMIT")
        log.info("Migration 004 completed successfully")

        # Vacuum to reclaim space (outside transaction)
        try:
            log.info("Running VACUUM to reclaim space...")
            cursor.execute("VACUUM")
        except Exception as e:
            log.warning(f"VACUUM failed (non-critical): {e}")

    except Exception as e:
        log.error(f"Migration 004 failed: {e}")
        try:
            cursor.execute("ROLLBACK")
        except Exception:
            pass
        raise
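Example (not part of the repository): a hedged sketch of how a caller might use the two FTS tables created above, trying an exact token MATCH first and falling back to the fuzzy (trigram/extended unicode61) table. The helper name and the fallback policy are illustrative.

def search_files(db_conn, query: str, limit: int = 20) -> list[str]:
    cursor = db_conn.cursor()
    rows = cursor.execute(
        "SELECT full_path FROM files_fts_exact WHERE files_fts_exact MATCH ? ORDER BY rank LIMIT ?",
        (query, limit),
    ).fetchall()
    if not rows:
        rows = cursor.execute(
            "SELECT full_path FROM files_fts_fuzzy WHERE files_fts_fuzzy MATCH ? ORDER BY rank LIMIT ?",
            (query, limit),
        ).fetchall()
    return [row[0] for row in rows]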
@@ -0,0 +1,196 @@
"""
Migration 005: Remove unused and redundant database fields.

This migration removes four problematic fields identified by Gemini analysis:

1. **semantic_metadata.keywords** (deprecated - replaced by file_keywords table)
   - Data: Migrated to normalized file_keywords table in migration 001
   - Impact: Column now redundant, remove to prevent sync issues

2. **symbols.token_count** (unused - always NULL)
   - Data: Never populated, always NULL
   - Impact: No data loss, just removes unused column

3. **symbols.symbol_type** (redundant - duplicates kind)
   - Data: Redundant with symbols.kind field
   - Impact: No data loss, kind field contains same information

4. **subdirs.direct_files** (unused - never displayed)
   - Data: Never used in queries or display logic
   - Impact: No data loss, just removes unused column

Schema changes use the table recreation pattern (SQLite best practice):
- Create new table without deprecated columns
- Copy data from old table
- Drop old table
- Rename new table
- Recreate indexes
"""

import logging
from sqlite3 import Connection

log = logging.getLogger(__name__)


def upgrade(db_conn: Connection):
    """Remove unused and redundant fields from schema.

    Note: Transaction management is handled by MigrationManager.
    This migration should NOT start its own transaction.

    Args:
        db_conn: The SQLite database connection.
    """
    cursor = db_conn.cursor()

    # Step 1: Remove semantic_metadata.keywords (if column exists)
    log.info("Checking semantic_metadata.keywords column...")

    cursor.execute(
        "SELECT name FROM sqlite_master WHERE type='table' AND name='semantic_metadata'"
    )
    if cursor.fetchone():
        # Check if keywords column exists
        cursor.execute("PRAGMA table_info(semantic_metadata)")
        columns = {row[1] for row in cursor.fetchall()}

        if "keywords" in columns:
            log.info("Removing semantic_metadata.keywords column...")
            cursor.execute("""
                CREATE TABLE semantic_metadata_new (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    file_id INTEGER NOT NULL UNIQUE,
                    summary TEXT,
                    purpose TEXT,
                    llm_tool TEXT,
                    generated_at REAL,
                    FOREIGN KEY (file_id) REFERENCES files(id) ON DELETE CASCADE
                )
            """)

            cursor.execute("""
                INSERT INTO semantic_metadata_new (id, file_id, summary, purpose, llm_tool, generated_at)
                SELECT id, file_id, summary, purpose, llm_tool, generated_at
                FROM semantic_metadata
            """)

            cursor.execute("DROP TABLE semantic_metadata")
            cursor.execute("ALTER TABLE semantic_metadata_new RENAME TO semantic_metadata")

            # Recreate index
            cursor.execute(
                "CREATE INDEX IF NOT EXISTS idx_semantic_file ON semantic_metadata(file_id)"
            )
            log.info("Removed semantic_metadata.keywords column")
        else:
            log.info("semantic_metadata.keywords column does not exist, skipping")
    else:
        log.info("semantic_metadata table does not exist, skipping")

    # Step 2: Remove symbols.token_count and symbols.symbol_type (if columns exist)
    log.info("Checking symbols.token_count and symbols.symbol_type columns...")

    cursor.execute(
        "SELECT name FROM sqlite_master WHERE type='table' AND name='symbols'"
    )
    if cursor.fetchone():
        # Check if token_count or symbol_type columns exist
        cursor.execute("PRAGMA table_info(symbols)")
        columns = {row[1] for row in cursor.fetchall()}

        if "token_count" in columns or "symbol_type" in columns:
            log.info("Removing symbols.token_count and symbols.symbol_type columns...")
            cursor.execute("""
                CREATE TABLE symbols_new (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    file_id INTEGER NOT NULL,
                    name TEXT NOT NULL,
                    kind TEXT,
                    start_line INTEGER,
                    end_line INTEGER,
                    FOREIGN KEY (file_id) REFERENCES files(id) ON DELETE CASCADE
                )
            """)

            cursor.execute("""
                INSERT INTO symbols_new (id, file_id, name, kind, start_line, end_line)
                SELECT id, file_id, name, kind, start_line, end_line
                FROM symbols
            """)

            cursor.execute("DROP TABLE symbols")
            cursor.execute("ALTER TABLE symbols_new RENAME TO symbols")

            # Recreate indexes
            cursor.execute("CREATE INDEX IF NOT EXISTS idx_symbols_file ON symbols(file_id)")
            cursor.execute("CREATE INDEX IF NOT EXISTS idx_symbols_name ON symbols(name)")
            log.info("Removed symbols.token_count and symbols.symbol_type columns")
        else:
            log.info("symbols.token_count/symbol_type columns do not exist, skipping")
    else:
        log.info("symbols table does not exist, skipping")

    # Step 3: Remove subdirs.direct_files (if column exists)
    log.info("Checking subdirs.direct_files column...")

    cursor.execute(
        "SELECT name FROM sqlite_master WHERE type='table' AND name='subdirs'"
    )
    if cursor.fetchone():
        # Check if direct_files column exists
        cursor.execute("PRAGMA table_info(subdirs)")
        columns = {row[1] for row in cursor.fetchall()}

        if "direct_files" in columns:
            log.info("Removing subdirs.direct_files column...")
            cursor.execute("""
                CREATE TABLE subdirs_new (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    name TEXT NOT NULL UNIQUE,
                    index_path TEXT NOT NULL,
                    files_count INTEGER DEFAULT 0,
                    last_updated REAL
                )
            """)

            cursor.execute("""
                INSERT INTO subdirs_new (id, name, index_path, files_count, last_updated)
                SELECT id, name, index_path, files_count, last_updated
                FROM subdirs
            """)

            cursor.execute("DROP TABLE subdirs")
            cursor.execute("ALTER TABLE subdirs_new RENAME TO subdirs")

            # Recreate index
            cursor.execute("CREATE INDEX IF NOT EXISTS idx_subdirs_name ON subdirs(name)")
            log.info("Removed subdirs.direct_files column")
        else:
            log.info("subdirs.direct_files column does not exist, skipping")
    else:
        log.info("subdirs table does not exist, skipping")

    log.info("Migration 005 completed successfully")

    # Vacuum to reclaim space (outside transaction, optional)
    # Note: VACUUM cannot run inside a transaction, so we skip it here.
    # The caller can run VACUUM separately if desired.


def downgrade(db_conn: Connection):
    """Restore removed fields (data is lost for keywords, token_count, symbol_type, direct_files).

    This is a placeholder - a true downgrade is not feasible because the data is lost.
    The migration is designed to be one-way since the removed fields are unused/redundant.

    Args:
        db_conn: The SQLite database connection.
    """
    log.warning(
        "Migration 005 downgrade not supported - removed fields are unused/redundant. "
        "Data cannot be restored."
    )
    raise NotImplementedError(
        "Migration 005 downgrade not supported - this is a one-way migration"
    )
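Example (not part of the repository): the recreation pattern described in the docstring above, wrapped as a generic helper. This is a sketch only; `create_sql` is assumed to define a `<table>_new` replacement table without the dropped columns, and `columns` lists the columns to copy.

def drop_columns(cursor, table: str, create_sql: str, columns: list[str]) -> None:
    cursor.execute(create_sql)                                     # 1. create new table
    col_list = ", ".join(columns)
    cursor.execute(f"INSERT INTO {table}_new ({col_list}) SELECT {col_list} FROM {table}")  # 2. copy data
    cursor.execute(f"DROP TABLE {table}")                          # 3. drop old table
    cursor.execute(f"ALTER TABLE {table}_new RENAME TO {table}")   # 4. rename
    # 5. recreate indexes afterwards, as each step in the migration does explicitly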
@@ -0,0 +1,37 @@
"""
Migration 006: Ensure relationship tables and indexes exist.

This migration is intentionally idempotent. It creates the `code_relationships`
table (used for graph visualization) and its indexes if missing.
"""

from __future__ import annotations

import logging
from sqlite3 import Connection

log = logging.getLogger(__name__)


def upgrade(db_conn: Connection) -> None:
    cursor = db_conn.cursor()

    log.info("Ensuring code_relationships table exists...")
    cursor.execute(
        """
        CREATE TABLE IF NOT EXISTS code_relationships (
            id INTEGER PRIMARY KEY,
            source_symbol_id INTEGER NOT NULL REFERENCES symbols (id) ON DELETE CASCADE,
            target_qualified_name TEXT NOT NULL,
            relationship_type TEXT NOT NULL,
            source_line INTEGER NOT NULL,
            target_file TEXT
        )
        """
    )

    log.info("Ensuring relationship indexes exist...")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_rel_source ON code_relationships(source_symbol_id)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_rel_target ON code_relationships(target_qualified_name)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_rel_type ON code_relationships(relationship_type)")
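Example (not part of the repository): fetching outgoing relationships for a symbol from the table ensured above, e.g. to build one level of a call/reference graph. The helper name is illustrative.

def outgoing_relationships(db_conn, symbol_id: int, rel_type: str | None = None):
    cursor = db_conn.cursor()
    sql = (
        "SELECT target_qualified_name, relationship_type, source_line "
        "FROM code_relationships WHERE source_symbol_id = ?"
    )
    params: tuple = (symbol_id,)
    if rel_type is not None:
        sql += " AND relationship_type = ?"
        params = (symbol_id, rel_type)
    return cursor.execute(sql, params).fetchall()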
@@ -0,0 +1,47 @@
"""
Migration 007: Add precomputed graph neighbor table for search expansion.

Adds:
- graph_neighbors: cached N-hop neighbors between symbols (keyed by symbol ids)

This table is derived data (a cache) and is safe to rebuild at any time.
The migration is intentionally idempotent.
"""

from __future__ import annotations

import logging
from sqlite3 import Connection

log = logging.getLogger(__name__)


def upgrade(db_conn: Connection) -> None:
    cursor = db_conn.cursor()

    log.info("Creating graph_neighbors table...")
    cursor.execute(
        """
        CREATE TABLE IF NOT EXISTS graph_neighbors (
            source_symbol_id INTEGER NOT NULL REFERENCES symbols(id) ON DELETE CASCADE,
            neighbor_symbol_id INTEGER NOT NULL REFERENCES symbols(id) ON DELETE CASCADE,
            relationship_depth INTEGER NOT NULL,
            PRIMARY KEY (source_symbol_id, neighbor_symbol_id)
        )
        """
    )

    log.info("Creating indexes for graph_neighbors...")
    cursor.execute(
        """
        CREATE INDEX IF NOT EXISTS idx_graph_neighbors_source_depth
        ON graph_neighbors(source_symbol_id, relationship_depth)
        """
    )
    cursor.execute(
        """
        CREATE INDEX IF NOT EXISTS idx_graph_neighbors_neighbor
        ON graph_neighbors(neighbor_symbol_id)
        """
    )
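Example (not part of the repository): since graph_neighbors is a derived cache, it can be rebuilt from code_relationships at any time. A sketch of rebuilding the 1-hop layer, assuming target_qualified_name can be resolved against symbols.name - a simplification of whatever resolution the indexer actually performs.

def rebuild_one_hop_neighbors(db_conn) -> None:
    cursor = db_conn.cursor()
    cursor.execute("DELETE FROM graph_neighbors WHERE relationship_depth = 1")
    cursor.execute(
        """
        INSERT OR IGNORE INTO graph_neighbors (source_symbol_id, neighbor_symbol_id, relationship_depth)
        SELECT r.source_symbol_id, s.id, 1
        FROM code_relationships AS r
        JOIN symbols AS s ON s.name = r.target_qualified_name
        WHERE s.id != r.source_symbol_id
        """
    )
    db_conn.commit()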
@@ -0,0 +1,81 @@
"""
Migration 008: Add Merkle hash tables for content-based incremental indexing.

Adds:
- merkle_hashes: per-file SHA-256 hashes (keyed by file_id)
- merkle_state: directory-level root hash (single row, id=1)

Backfills merkle_hashes using the existing `files.content` column when available.
"""

from __future__ import annotations

import hashlib
import logging
import time
from sqlite3 import Connection

log = logging.getLogger(__name__)


def upgrade(db_conn: Connection) -> None:
    cursor = db_conn.cursor()

    log.info("Creating merkle_hashes table...")
    cursor.execute(
        """
        CREATE TABLE IF NOT EXISTS merkle_hashes (
            file_id INTEGER PRIMARY KEY REFERENCES files(id) ON DELETE CASCADE,
            sha256 TEXT NOT NULL,
            updated_at REAL
        )
        """
    )

    log.info("Creating merkle_state table...")
    cursor.execute(
        """
        CREATE TABLE IF NOT EXISTS merkle_state (
            id INTEGER PRIMARY KEY CHECK (id = 1),
            root_hash TEXT,
            updated_at REAL
        )
        """
    )

    # Backfill file hashes from stored content (best-effort).
    try:
        rows = cursor.execute("SELECT id, content FROM files").fetchall()
    except Exception as exc:
        log.warning("Unable to backfill merkle hashes (files table missing?): %s", exc)
        return

    now = time.time()
    inserts: list[tuple[int, str, float]] = []

    for row in rows:
        file_id = int(row[0])
        content = row[1]
        if content is None:
            continue
        try:
            digest = hashlib.sha256(str(content).encode("utf-8", errors="ignore")).hexdigest()
            inserts.append((file_id, digest, now))
        except Exception:
            continue

    if not inserts:
        return

    log.info("Backfilling %d file hashes...", len(inserts))
    cursor.executemany(
        """
        INSERT INTO merkle_hashes(file_id, sha256, updated_at)
        VALUES(?, ?, ?)
        ON CONFLICT(file_id) DO UPDATE SET
            sha256=excluded.sha256,
            updated_at=excluded.updated_at
        """,
        inserts,
    )
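Example (not part of the repository): deriving the directory-level root hash in merkle_state from the per-file hashes backfilled above. Hashing the ordered per-file digests is one simple way to define the root; the real indexer may combine them differently.

import hashlib
import time

def update_root_hash(db_conn) -> str:
    cursor = db_conn.cursor()
    digests = [row[0] for row in cursor.execute("SELECT sha256 FROM merkle_hashes ORDER BY file_id")]
    root = hashlib.sha256("".join(digests).encode("utf-8")).hexdigest()
    cursor.execute(
        """
        INSERT INTO merkle_state(id, root_hash, updated_at) VALUES(1, ?, ?)
        ON CONFLICT(id) DO UPDATE SET root_hash=excluded.root_hash, updated_at=excluded.updated_at
        """,
        (root, time.time()),
    )
    db_conn.commit()
    return root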
@@ -0,0 +1,103 @@
"""
Migration 009: Add SPLADE sparse retrieval tables.

This migration introduces SPLADE (Sparse Lexical AnD Expansion) support:
- splade_metadata: Model configuration (model name, vocab size, ONNX path)
- splade_posting_list: Inverted index mapping token_id -> (chunk_id, weight)

The SPLADE tables are designed for efficient sparse vector retrieval:
- Token-based lookup for query expansion
- Chunk-based deletion for index maintenance
- Maintains backward compatibility with existing FTS tables
"""

import logging
from sqlite3 import Connection

log = logging.getLogger(__name__)


def upgrade(db_conn: Connection) -> None:
    """
    Adds SPLADE tables for sparse retrieval.

    Creates:
    - splade_metadata: Stores model configuration and ONNX path
    - splade_posting_list: Inverted index with token_id -> (chunk_id, weight) mappings
    - Indexes for efficient token-based and chunk-based lookups

    Args:
        db_conn: The SQLite database connection.
    """
    cursor = db_conn.cursor()

    log.info("Creating splade_metadata table...")
    cursor.execute(
        """
        CREATE TABLE IF NOT EXISTS splade_metadata (
            id INTEGER PRIMARY KEY DEFAULT 1,
            model_name TEXT NOT NULL,
            vocab_size INTEGER NOT NULL,
            onnx_path TEXT,
            created_at REAL
        )
        """
    )

    log.info("Creating splade_posting_list table...")
    cursor.execute(
        """
        CREATE TABLE IF NOT EXISTS splade_posting_list (
            token_id INTEGER NOT NULL,
            chunk_id INTEGER NOT NULL,
            weight REAL NOT NULL,
            PRIMARY KEY (token_id, chunk_id),
            FOREIGN KEY (chunk_id) REFERENCES semantic_chunks(id) ON DELETE CASCADE
        )
        """
    )

    log.info("Creating indexes for splade_posting_list...")
    # Index for efficient chunk-based lookups (deletion, updates)
    cursor.execute(
        """
        CREATE INDEX IF NOT EXISTS idx_splade_by_chunk
        ON splade_posting_list(chunk_id)
        """
    )

    # Index for efficient term-based retrieval
    cursor.execute(
        """
        CREATE INDEX IF NOT EXISTS idx_splade_by_token
        ON splade_posting_list(token_id)
        """
    )

    log.info("Migration 009 completed successfully")


def downgrade(db_conn: Connection) -> None:
    """
    Removes SPLADE tables.

    Drops:
    - splade_posting_list (and associated indexes)
    - splade_metadata

    Args:
        db_conn: The SQLite database connection.
    """
    cursor = db_conn.cursor()

    log.info("Dropping SPLADE indexes...")
    cursor.execute("DROP INDEX IF EXISTS idx_splade_by_chunk")
    cursor.execute("DROP INDEX IF EXISTS idx_splade_by_token")

    log.info("Dropping splade_posting_list table...")
    cursor.execute("DROP TABLE IF EXISTS splade_posting_list")

    log.info("Dropping splade_metadata table...")
    cursor.execute("DROP TABLE IF EXISTS splade_metadata")

    log.info("Migration 009 downgrade completed successfully")
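Example (not part of the repository): scoring chunks against a SPLADE query vector (token_id -> weight) via the posting list created above. The dot product is computed in SQL by joining the query tokens against the inverted index; the helper name and temp-table approach are illustrative.

def splade_search(db_conn, query_weights: dict[int, float], limit: int = 50):
    cursor = db_conn.cursor()
    cursor.execute("DROP TABLE IF EXISTS temp.query_tokens")
    cursor.execute("CREATE TEMP TABLE query_tokens (token_id INTEGER PRIMARY KEY, weight REAL)")
    cursor.executemany(
        "INSERT INTO temp.query_tokens (token_id, weight) VALUES (?, ?)",
        list(query_weights.items()),
    )
    return cursor.execute(
        """
        SELECT p.chunk_id, SUM(p.weight * q.weight) AS score
        FROM splade_posting_list AS p
        JOIN temp.query_tokens AS q ON q.token_id = p.token_id
        GROUP BY p.chunk_id
        ORDER BY score DESC
        LIMIT ?
        """,
        (limit,),
    ).fetchall()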
@@ -0,0 +1,162 @@
"""
Migration 010: Add multi-vector storage support for cascade retrieval.

This migration introduces the chunks table with multi-vector support:
- chunks: Stores code chunks with multiple embedding types
  - embedding: Original embedding for backward compatibility
  - embedding_binary: 256-dim binary vector for coarse ranking (fast)
  - embedding_dense: 2048-dim dense vector for fine ranking (precise)

The multi-vector architecture enables cascade retrieval:
1. First stage: Fast binary vector search for candidate retrieval
2. Second stage: Dense vector reranking for precision
"""

import logging
from sqlite3 import Connection

log = logging.getLogger(__name__)


def upgrade(db_conn: Connection) -> None:
    """
    Adds chunks table with multi-vector embedding columns.

    Creates:
    - chunks: Table for storing code chunks with multiple embedding types
    - idx_chunks_file_path: Index for efficient file-based lookups

    Also migrates existing chunks tables by adding the new columns if needed.

    Args:
        db_conn: The SQLite database connection.
    """
    cursor = db_conn.cursor()

    # Check if chunks table already exists
    table_exists = cursor.execute(
        "SELECT name FROM sqlite_master WHERE type='table' AND name='chunks'"
    ).fetchone()

    if table_exists:
        # Migrate existing table - add new columns if missing
        log.info("chunks table exists, checking for missing columns...")

        col_info = cursor.execute("PRAGMA table_info(chunks)").fetchall()
        existing_columns = {row[1] for row in col_info}

        if "embedding_binary" not in existing_columns:
            log.info("Adding embedding_binary column to chunks table...")
            cursor.execute("ALTER TABLE chunks ADD COLUMN embedding_binary BLOB")

        if "embedding_dense" not in existing_columns:
            log.info("Adding embedding_dense column to chunks table...")
            cursor.execute("ALTER TABLE chunks ADD COLUMN embedding_dense BLOB")
    else:
        # Create new table with all columns
        log.info("Creating chunks table with multi-vector support...")
        cursor.execute(
            """
            CREATE TABLE chunks (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                file_path TEXT NOT NULL,
                content TEXT NOT NULL,
                embedding BLOB,
                embedding_binary BLOB,
                embedding_dense BLOB,
                metadata TEXT,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
            """
        )

    # Create index for file-based lookups
    log.info("Creating index for chunks table...")
    cursor.execute(
        """
        CREATE INDEX IF NOT EXISTS idx_chunks_file_path
        ON chunks(file_path)
        """
    )

    log.info("Migration 010 completed successfully")


def downgrade(db_conn: Connection) -> None:
    """
    Removes multi-vector columns from chunks table.

    Note: This does not drop the chunks table entirely, to preserve data.
    Only the new columns added by this migration are removed.

    Args:
        db_conn: The SQLite database connection.
    """
    cursor = db_conn.cursor()

    log.info("Removing multi-vector columns from chunks table...")

    # SQLite doesn't support DROP COLUMN directly in older versions,
    # so the table is recreated without the columns.

    # Check if chunks table exists
    table_exists = cursor.execute(
        "SELECT name FROM sqlite_master WHERE type='table' AND name='chunks'"
    ).fetchone()

    if not table_exists:
        log.info("chunks table does not exist, nothing to downgrade")
        return

    # Check if the columns exist before trying to remove them
    col_info = cursor.execute("PRAGMA table_info(chunks)").fetchall()
    existing_columns = {row[1] for row in col_info}

    needs_migration = (
        "embedding_binary" in existing_columns or
        "embedding_dense" in existing_columns
    )

    if not needs_migration:
        log.info("Multi-vector columns not present, nothing to remove")
        return

    # Recreate table without the new columns
    log.info("Recreating chunks table without multi-vector columns...")

    cursor.execute(
        """
        CREATE TABLE chunks_backup (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            file_path TEXT NOT NULL,
            content TEXT NOT NULL,
            embedding BLOB,
            metadata TEXT,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
        """
    )

    cursor.execute(
        """
        INSERT INTO chunks_backup (id, file_path, content, embedding, metadata, created_at)
        SELECT id, file_path, content, embedding, metadata, created_at FROM chunks
        """
    )

    cursor.execute("DROP TABLE chunks")
    cursor.execute("ALTER TABLE chunks_backup RENAME TO chunks")

    # Recreate index
    cursor.execute(
        """
        CREATE INDEX IF NOT EXISTS idx_chunks_file_path
        ON chunks(file_path)
        """
    )

    log.info("Migration 010 downgrade completed successfully")
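Example (not part of the repository): the cascade retrieval the docstring describes, sketched over the chunks table. It assumes embedding_binary is a bit-packed 256-bit vector stored as 32 bytes and embedding_dense is a float32 array; the real storage format and query encoding belong to the indexer, not this migration.

import numpy as np

def cascade_search(db_conn, query_binary, query_dense, coarse_k: int = 200, top_k: int = 10) -> list[int]:
    cursor = db_conn.cursor()
    rows = cursor.execute(
        "SELECT id, embedding_binary, embedding_dense FROM chunks "
        "WHERE embedding_binary IS NOT NULL AND embedding_dense IS NOT NULL"
    ).fetchall()

    # Stage 1: coarse ranking by Hamming distance on the packed binary vectors.
    q_bits = np.packbits(np.asarray(query_binary, dtype=np.uint8))
    coarse = []
    for chunk_id, bin_blob, dense_blob in rows:
        bits = np.frombuffer(bin_blob, dtype=np.uint8)
        hamming = int(np.unpackbits(bits ^ q_bits).sum())
        coarse.append((hamming, chunk_id, dense_blob))
    coarse.sort(key=lambda item: item[0])

    # Stage 2: rerank the top candidates by cosine similarity on the dense vectors.
    reranked = []
    for _, chunk_id, dense_blob in coarse[:coarse_k]:
        dense = np.frombuffer(dense_blob, dtype=np.float32)
        denom = float(np.linalg.norm(dense) * np.linalg.norm(query_dense)) or 1.0
        reranked.append((float(np.dot(dense, query_dense)) / denom, chunk_id))
    reranked.sort(reverse=True)
    return [chunk_id for _, chunk_id in reranked[:top_k]]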