Refactor code structure and remove redundant changes

catlog22
2026-01-24 14:47:47 +08:00
parent cf5fecd66d
commit f2b0a5bbc9
113 changed files with 43217 additions and 235 deletions

View File

@@ -0,0 +1 @@
# This file makes the 'migrations' directory a Python package.

View File

@@ -0,0 +1,123 @@
"""
Migration 001: Normalize keywords into separate tables.
This migration introduces two new tables, `keywords` and `file_keywords`, to
store semantic keywords in a normalized fashion. It then migrates the existing
keywords from the JSON-encoded `keywords` column of the `semantic_metadata`
table into these new tables, which is intended to speed up keyword-based
searches significantly.
"""
import json
import logging
from sqlite3 import Connection
log = logging.getLogger(__name__)
def upgrade(db_conn: Connection):
"""
Applies the migration to normalize keywords.
- Creates `keywords` and `file_keywords` tables.
- Creates indexes for efficient querying.
    - Migrates data from `semantic_metadata.keywords` to the new tables.
Args:
db_conn: The SQLite database connection.
"""
cursor = db_conn.cursor()
log.info("Creating 'keywords' and 'file_keywords' tables...")
# Create a table to store unique keywords
cursor.execute(
"""
CREATE TABLE IF NOT EXISTS keywords (
id INTEGER PRIMARY KEY,
keyword TEXT NOT NULL UNIQUE
)
"""
)
# Create a join table to link files and keywords (many-to-many)
cursor.execute(
"""
CREATE TABLE IF NOT EXISTS file_keywords (
file_id INTEGER NOT NULL,
keyword_id INTEGER NOT NULL,
PRIMARY KEY (file_id, keyword_id),
FOREIGN KEY (file_id) REFERENCES files (id) ON DELETE CASCADE,
FOREIGN KEY (keyword_id) REFERENCES keywords (id) ON DELETE CASCADE
)
"""
)
log.info("Creating indexes for new keyword tables...")
cursor.execute("CREATE INDEX IF NOT EXISTS idx_keywords_keyword ON keywords (keyword)")
cursor.execute("CREATE INDEX IF NOT EXISTS idx_file_keywords_file_id ON file_keywords (file_id)")
cursor.execute("CREATE INDEX IF NOT EXISTS idx_file_keywords_keyword_id ON file_keywords (keyword_id)")
log.info("Migrating existing keywords from 'semantic_metadata' table...")
# Check if semantic_metadata table exists before querying
cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='semantic_metadata'")
if not cursor.fetchone():
log.info("No 'semantic_metadata' table found, skipping data migration.")
return
# Check if 'keywords' column exists in semantic_metadata table
# (current schema may already use normalized tables without this column)
cursor.execute("PRAGMA table_info(semantic_metadata)")
columns = {row[1] for row in cursor.fetchall()}
if "keywords" not in columns:
log.info("No 'keywords' column in semantic_metadata table, skipping data migration.")
return
cursor.execute("SELECT file_id, keywords FROM semantic_metadata WHERE keywords IS NOT NULL AND keywords != ''")
files_to_migrate = cursor.fetchall()
if not files_to_migrate:
log.info("No existing files with semantic metadata to migrate.")
return
log.info(f"Found {len(files_to_migrate)} files with semantic metadata to migrate.")
for file_id, keywords_json in files_to_migrate:
if not keywords_json:
continue
try:
keywords = json.loads(keywords_json)
if not isinstance(keywords, list):
log.warning(f"Keywords for file_id {file_id} is not a list, skipping.")
continue
for keyword in keywords:
if not isinstance(keyword, str):
log.warning(f"Non-string keyword '{keyword}' found for file_id {file_id}, skipping.")
continue
keyword = keyword.strip()
if not keyword:
continue
# Get or create keyword_id
cursor.execute("INSERT OR IGNORE INTO keywords (keyword) VALUES (?)", (keyword,))
cursor.execute("SELECT id FROM keywords WHERE keyword = ?", (keyword,))
keyword_id_result = cursor.fetchone()
if keyword_id_result:
keyword_id = keyword_id_result[0]
# Link file to keyword
cursor.execute(
"INSERT OR IGNORE INTO file_keywords (file_id, keyword_id) VALUES (?, ?)",
(file_id, keyword_id),
)
else:
log.error(f"Failed to retrieve or create keyword_id for keyword: {keyword}")
except json.JSONDecodeError as e:
log.warning(f"Could not parse keywords for file_id {file_id}: {e}")
except Exception as e:
log.error(f"An unexpected error occurred during migration for file_id {file_id}: {e}", exc_info=True)
log.info("Finished migrating keywords.")

View File

@@ -0,0 +1,48 @@
"""
Migration 002: Add token_count and symbol_type to symbols table.
This migration adds token counting metadata to symbols for accurate chunk
splitting and performance optimization. It also adds symbol_type for better
filtering in searches.
"""
import logging
from sqlite3 import Connection, OperationalError
log = logging.getLogger(__name__)
def upgrade(db_conn: Connection):
"""
Applies the migration to add token metadata to symbols.
- Adds token_count column to symbols table
- Adds symbol_type column to symbols table (for future use)
- Creates index on symbol_type for efficient filtering
    - Leaves token_count NULL for existing symbols (to be calculated lazily)
Args:
db_conn: The SQLite database connection.
"""
cursor = db_conn.cursor()
log.info("Adding token_count column to symbols table...")
try:
cursor.execute("ALTER TABLE symbols ADD COLUMN token_count INTEGER")
log.info("Successfully added token_count column.")
    except OperationalError as e:
        # Column might already exist
        log.warning(f"Could not add token_count column (might already exist): {e}")
log.info("Adding symbol_type column to symbols table...")
try:
cursor.execute("ALTER TABLE symbols ADD COLUMN symbol_type TEXT")
log.info("Successfully added symbol_type column.")
    except OperationalError as e:
        # Column might already exist
        log.warning(f"Could not add symbol_type column (might already exist): {e}")
log.info("Creating index on symbol_type for efficient filtering...")
cursor.execute("CREATE INDEX IF NOT EXISTS idx_symbols_type ON symbols(symbol_type)")
log.info("Migration 002 completed successfully.")

View File

@@ -0,0 +1,232 @@
"""
Migration 004: Add dual FTS tables for exact and fuzzy matching.
This migration introduces two FTS5 tables:
- files_fts_exact: Uses unicode61 tokenizer for exact token matching
- files_fts_fuzzy: Uses trigram tokenizer (or extended unicode61) for substring/fuzzy matching
Both tables are synchronized with the files table via triggers for automatic updates.
"""
import logging
from sqlite3 import Connection
from codexlens.storage.sqlite_utils import check_trigram_support, get_sqlite_version
log = logging.getLogger(__name__)
def upgrade(db_conn: Connection):
"""
Applies the migration to add dual FTS tables.
- Drops old files_fts table and triggers
- Creates files_fts_exact with unicode61 tokenizer
- Creates files_fts_fuzzy with trigram or extended unicode61 tokenizer
- Creates synchronized triggers for both tables
- Rebuilds FTS indexes from files table
Args:
db_conn: The SQLite database connection.
"""
cursor = db_conn.cursor()
try:
# Check trigram support
has_trigram = check_trigram_support(db_conn)
version = get_sqlite_version(db_conn)
log.info(f"SQLite version: {'.'.join(map(str, version))}")
if has_trigram:
log.info("Trigram tokenizer available, using for fuzzy FTS table")
fuzzy_tokenizer = "trigram"
else:
log.warning(
f"Trigram tokenizer not available (requires SQLite >= 3.34), "
f"using extended unicode61 tokenizer for fuzzy matching"
)
fuzzy_tokenizer = "unicode61 tokenchars '_-.'"
# Start transaction
cursor.execute("BEGIN TRANSACTION")
# Check if files table has 'name' column (v2 schema doesn't have it)
cursor.execute("PRAGMA table_info(files)")
columns = {row[1] for row in cursor.fetchall()}
if 'name' not in columns:
log.info("Adding 'name' column to files table (v2 schema upgrade)...")
# Add name column
cursor.execute("ALTER TABLE files ADD COLUMN name TEXT")
# Populate name from path (extract filename from last '/')
# Use Python to do the extraction since SQLite doesn't have reverse()
cursor.execute("SELECT rowid, path FROM files")
rows = cursor.fetchall()
for rowid, path in rows:
                # Extract filename from path (handle both '/' and '\' separators)
                name = path.replace('\\', '/').split('/')[-1]
cursor.execute("UPDATE files SET name = ? WHERE rowid = ?", (name, rowid))
# Rename 'path' column to 'full_path' if needed
if 'path' in columns and 'full_path' not in columns:
log.info("Renaming 'path' to 'full_path' (v2 schema upgrade)...")
# Check if indexed_at column exists in v2 schema
has_indexed_at = 'indexed_at' in columns
has_mtime = 'mtime' in columns
# SQLite doesn't support RENAME COLUMN before 3.25, so use table recreation
cursor.execute("""
CREATE TABLE files_new (
id INTEGER PRIMARY KEY AUTOINCREMENT,
name TEXT NOT NULL,
full_path TEXT NOT NULL UNIQUE,
content TEXT,
language TEXT,
mtime REAL,
indexed_at TEXT
)
""")
# Build INSERT statement based on available columns
            # Note: id is omitted so AUTOINCREMENT assigns fresh integer ids
            # (the v2 schema keyed rows by path, not an integer id)
if has_indexed_at and has_mtime:
cursor.execute("""
INSERT INTO files_new (name, full_path, content, language, mtime, indexed_at)
SELECT name, path, content, language, mtime, indexed_at FROM files
""")
elif has_indexed_at:
cursor.execute("""
INSERT INTO files_new (name, full_path, content, language, indexed_at)
SELECT name, path, content, language, indexed_at FROM files
""")
elif has_mtime:
cursor.execute("""
INSERT INTO files_new (name, full_path, content, language, mtime)
SELECT name, path, content, language, mtime FROM files
""")
else:
cursor.execute("""
INSERT INTO files_new (name, full_path, content, language)
SELECT name, path, content, language FROM files
""")
cursor.execute("DROP TABLE files")
cursor.execute("ALTER TABLE files_new RENAME TO files")
log.info("Dropping old FTS triggers and table...")
# Drop old triggers
cursor.execute("DROP TRIGGER IF EXISTS files_ai")
cursor.execute("DROP TRIGGER IF EXISTS files_ad")
cursor.execute("DROP TRIGGER IF EXISTS files_au")
# Drop old FTS table
cursor.execute("DROP TABLE IF EXISTS files_fts")
# Create exact FTS table (unicode61 with underscores/hyphens/dots as token chars)
# Note: tokenchars includes '.' to properly tokenize qualified names like PortRole.FLOW
log.info("Creating files_fts_exact table with unicode61 tokenizer...")
cursor.execute(
"""
CREATE VIRTUAL TABLE files_fts_exact USING fts5(
name, full_path UNINDEXED, content,
content='files',
content_rowid='id',
tokenize="unicode61 tokenchars '_-.'"
)
"""
)
# Create fuzzy FTS table (trigram or extended unicode61)
log.info(f"Creating files_fts_fuzzy table with {fuzzy_tokenizer} tokenizer...")
cursor.execute(
f"""
CREATE VIRTUAL TABLE files_fts_fuzzy USING fts5(
name, full_path UNINDEXED, content,
content='files',
content_rowid='id',
tokenize="{fuzzy_tokenizer}"
)
"""
)
# Create synchronized triggers for files_fts_exact
log.info("Creating triggers for files_fts_exact...")
cursor.execute(
"""
CREATE TRIGGER files_exact_ai AFTER INSERT ON files BEGIN
INSERT INTO files_fts_exact(rowid, name, full_path, content)
VALUES(new.id, new.name, new.full_path, new.content);
END
"""
)
cursor.execute(
"""
CREATE TRIGGER files_exact_ad AFTER DELETE ON files BEGIN
INSERT INTO files_fts_exact(files_fts_exact, rowid, name, full_path, content)
VALUES('delete', old.id, old.name, old.full_path, old.content);
END
"""
)
cursor.execute(
"""
CREATE TRIGGER files_exact_au AFTER UPDATE ON files BEGIN
INSERT INTO files_fts_exact(files_fts_exact, rowid, name, full_path, content)
VALUES('delete', old.id, old.name, old.full_path, old.content);
INSERT INTO files_fts_exact(rowid, name, full_path, content)
VALUES(new.id, new.name, new.full_path, new.content);
END
"""
)
# Create synchronized triggers for files_fts_fuzzy
log.info("Creating triggers for files_fts_fuzzy...")
cursor.execute(
"""
CREATE TRIGGER files_fuzzy_ai AFTER INSERT ON files BEGIN
INSERT INTO files_fts_fuzzy(rowid, name, full_path, content)
VALUES(new.id, new.name, new.full_path, new.content);
END
"""
)
cursor.execute(
"""
CREATE TRIGGER files_fuzzy_ad AFTER DELETE ON files BEGIN
INSERT INTO files_fts_fuzzy(files_fts_fuzzy, rowid, name, full_path, content)
VALUES('delete', old.id, old.name, old.full_path, old.content);
END
"""
)
cursor.execute(
"""
CREATE TRIGGER files_fuzzy_au AFTER UPDATE ON files BEGIN
INSERT INTO files_fts_fuzzy(files_fts_fuzzy, rowid, name, full_path, content)
VALUES('delete', old.id, old.name, old.full_path, old.content);
INSERT INTO files_fts_fuzzy(rowid, name, full_path, content)
VALUES(new.id, new.name, new.full_path, new.content);
END
"""
)
# Rebuild FTS indexes from files table
log.info("Rebuilding FTS indexes from files table...")
cursor.execute("INSERT INTO files_fts_exact(files_fts_exact) VALUES('rebuild')")
cursor.execute("INSERT INTO files_fts_fuzzy(files_fts_fuzzy) VALUES('rebuild')")
# Commit transaction
cursor.execute("COMMIT")
log.info("Migration 004 completed successfully")
# Vacuum to reclaim space (outside transaction)
try:
log.info("Running VACUUM to reclaim space...")
cursor.execute("VACUUM")
except Exception as e:
log.warning(f"VACUUM failed (non-critical): {e}")
except Exception as e:
log.error(f"Migration 004 failed: {e}")
try:
cursor.execute("ROLLBACK")
except Exception:
pass
raise
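
A hedged sketch of how the two tables divide work at query time: exact token matches go to files_fts_exact, substring-style matches to files_fts_fuzzy (the trigram tokenizer needs query terms of at least three characters). bm25() is FTS5's built-in ranking function; the helper itself is hypothetical:

def fts_search(conn, query: str, fuzzy: bool = False, limit: int = 20):
    table = "files_fts_fuzzy" if fuzzy else "files_fts_exact"
    # bm25() returns lower-is-better scores, so ascending order ranks best first.
    sql = (
        f"SELECT full_path, bm25({table}) AS score "
        f"FROM {table} WHERE {table} MATCH ? "
        f"ORDER BY score LIMIT ?"
    )
    return conn.execute(sql, (query, limit)).fetchall()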

View File

@@ -0,0 +1,196 @@
"""
Migration 005: Remove unused and redundant database fields.
This migration removes four problematic fields identified by Gemini analysis:
1. **semantic_metadata.keywords** (deprecated - replaced by file_keywords table)
- Data: Migrated to normalized file_keywords table in migration 001
- Impact: Column now redundant, remove to prevent sync issues
2. **symbols.token_count** (unused - always NULL)
- Data: Never populated, always NULL
- Impact: No data loss, just removes unused column
3. **symbols.symbol_type** (redundant - duplicates kind)
- Data: Redundant with symbols.kind field
- Impact: No data loss, kind field contains same information
4. **subdirs.direct_files** (unused - never displayed)
- Data: Never used in queries or display logic
- Impact: No data loss, just removes unused column
Schema changes use table recreation pattern (SQLite best practice):
- Create new table without deprecated columns
- Copy data from old table
- Drop old table
- Rename new table
- Recreate indexes
"""
import logging
from sqlite3 import Connection
log = logging.getLogger(__name__)
def upgrade(db_conn: Connection):
"""Remove unused and redundant fields from schema.
Note: Transaction management is handled by MigrationManager.
This migration should NOT start its own transaction.
Args:
db_conn: The SQLite database connection.
"""
cursor = db_conn.cursor()
# Step 1: Remove semantic_metadata.keywords (if column exists)
log.info("Checking semantic_metadata.keywords column...")
cursor.execute(
"SELECT name FROM sqlite_master WHERE type='table' AND name='semantic_metadata'"
)
if cursor.fetchone():
# Check if keywords column exists
cursor.execute("PRAGMA table_info(semantic_metadata)")
columns = {row[1] for row in cursor.fetchall()}
if "keywords" in columns:
log.info("Removing semantic_metadata.keywords column...")
cursor.execute("""
CREATE TABLE semantic_metadata_new (
id INTEGER PRIMARY KEY AUTOINCREMENT,
file_id INTEGER NOT NULL UNIQUE,
summary TEXT,
purpose TEXT,
llm_tool TEXT,
generated_at REAL,
FOREIGN KEY (file_id) REFERENCES files(id) ON DELETE CASCADE
)
""")
cursor.execute("""
INSERT INTO semantic_metadata_new (id, file_id, summary, purpose, llm_tool, generated_at)
SELECT id, file_id, summary, purpose, llm_tool, generated_at
FROM semantic_metadata
""")
cursor.execute("DROP TABLE semantic_metadata")
cursor.execute("ALTER TABLE semantic_metadata_new RENAME TO semantic_metadata")
# Recreate index
cursor.execute(
"CREATE INDEX IF NOT EXISTS idx_semantic_file ON semantic_metadata(file_id)"
)
log.info("Removed semantic_metadata.keywords column")
else:
log.info("semantic_metadata.keywords column does not exist, skipping")
else:
log.info("semantic_metadata table does not exist, skipping")
# Step 2: Remove symbols.token_count and symbols.symbol_type (if columns exist)
log.info("Checking symbols.token_count and symbols.symbol_type columns...")
cursor.execute(
"SELECT name FROM sqlite_master WHERE type='table' AND name='symbols'"
)
if cursor.fetchone():
# Check if token_count or symbol_type columns exist
cursor.execute("PRAGMA table_info(symbols)")
columns = {row[1] for row in cursor.fetchall()}
if "token_count" in columns or "symbol_type" in columns:
log.info("Removing symbols.token_count and symbols.symbol_type columns...")
cursor.execute("""
CREATE TABLE symbols_new (
id INTEGER PRIMARY KEY AUTOINCREMENT,
file_id INTEGER NOT NULL,
name TEXT NOT NULL,
kind TEXT,
start_line INTEGER,
end_line INTEGER,
FOREIGN KEY (file_id) REFERENCES files(id) ON DELETE CASCADE
)
""")
cursor.execute("""
INSERT INTO symbols_new (id, file_id, name, kind, start_line, end_line)
SELECT id, file_id, name, kind, start_line, end_line
FROM symbols
""")
cursor.execute("DROP TABLE symbols")
cursor.execute("ALTER TABLE symbols_new RENAME TO symbols")
# Recreate indexes
cursor.execute("CREATE INDEX IF NOT EXISTS idx_symbols_file ON symbols(file_id)")
cursor.execute("CREATE INDEX IF NOT EXISTS idx_symbols_name ON symbols(name)")
log.info("Removed symbols.token_count and symbols.symbol_type columns")
else:
log.info("symbols.token_count/symbol_type columns do not exist, skipping")
else:
log.info("symbols table does not exist, skipping")
# Step 3: Remove subdirs.direct_files (if column exists)
log.info("Checking subdirs.direct_files column...")
cursor.execute(
"SELECT name FROM sqlite_master WHERE type='table' AND name='subdirs'"
)
if cursor.fetchone():
# Check if direct_files column exists
cursor.execute("PRAGMA table_info(subdirs)")
columns = {row[1] for row in cursor.fetchall()}
if "direct_files" in columns:
log.info("Removing subdirs.direct_files column...")
cursor.execute("""
CREATE TABLE subdirs_new (
id INTEGER PRIMARY KEY AUTOINCREMENT,
name TEXT NOT NULL UNIQUE,
index_path TEXT NOT NULL,
files_count INTEGER DEFAULT 0,
last_updated REAL
)
""")
cursor.execute("""
INSERT INTO subdirs_new (id, name, index_path, files_count, last_updated)
SELECT id, name, index_path, files_count, last_updated
FROM subdirs
""")
cursor.execute("DROP TABLE subdirs")
cursor.execute("ALTER TABLE subdirs_new RENAME TO subdirs")
# Recreate index
cursor.execute("CREATE INDEX IF NOT EXISTS idx_subdirs_name ON subdirs(name)")
log.info("Removed subdirs.direct_files column")
else:
log.info("subdirs.direct_files column does not exist, skipping")
else:
log.info("subdirs table does not exist, skipping")
log.info("Migration 005 completed successfully")
# Vacuum to reclaim space (outside transaction, optional)
# Note: VACUUM cannot run inside a transaction, so we skip it here
# The caller can run VACUUM separately if desired
def downgrade(db_conn: Connection):
"""Restore removed fields (data will be lost for keywords, token_count, symbol_type, direct_files).
This is a placeholder - true downgrade is not feasible as data is lost.
The migration is designed to be one-way since removed fields are unused/redundant.
Args:
db_conn: The SQLite database connection.
"""
log.warning(
"Migration 005 downgrade not supported - removed fields are unused/redundant. "
"Data cannot be restored."
)
raise NotImplementedError(
"Migration 005 downgrade not supported - this is a one-way migration"
)
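
A sketch of the transaction contract noted in the upgrade docstring: the MigrationManager (referenced above but not shown in this commit) owns BEGIN/COMMIT, so upgrade() issues only DDL/DML. The manager API here is illustrative, not the actual implementation:

from sqlite3 import Connection

def run_migration(db_conn: Connection, migration) -> None:
    try:
        db_conn.execute("BEGIN")
        migration.upgrade(db_conn)
        db_conn.execute("COMMIT")
    except Exception:
        db_conn.execute("ROLLBACK")
        raise
    # VACUUM cannot run inside a transaction (see the note above).
    db_conn.execute("VACUUM")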

View File

@@ -0,0 +1,37 @@
"""
Migration 006: Ensure relationship tables and indexes exist.
This migration is intentionally idempotent. It creates the `code_relationships`
table (used for graph visualization) and its indexes if missing.
"""
from __future__ import annotations
import logging
from sqlite3 import Connection
log = logging.getLogger(__name__)
def upgrade(db_conn: Connection) -> None:
cursor = db_conn.cursor()
log.info("Ensuring code_relationships table exists...")
cursor.execute(
"""
CREATE TABLE IF NOT EXISTS code_relationships (
id INTEGER PRIMARY KEY,
source_symbol_id INTEGER NOT NULL REFERENCES symbols (id) ON DELETE CASCADE,
target_qualified_name TEXT NOT NULL,
relationship_type TEXT NOT NULL,
source_line INTEGER NOT NULL,
target_file TEXT
)
"""
)
log.info("Ensuring relationship indexes exist...")
cursor.execute("CREATE INDEX IF NOT EXISTS idx_rel_source ON code_relationships(source_symbol_id)")
cursor.execute("CREATE INDEX IF NOT EXISTS idx_rel_target ON code_relationships(target_qualified_name)")
cursor.execute("CREATE INDEX IF NOT EXISTS idx_rel_type ON code_relationships(relationship_type)")

View File

@@ -0,0 +1,47 @@
"""
Migration 007: Add precomputed graph neighbor table for search expansion.
Adds:
- graph_neighbors: cached N-hop neighbors between symbols (keyed by symbol ids)
This table is derived data (a cache) and is safe to rebuild at any time.
The migration is intentionally idempotent.
"""
from __future__ import annotations
import logging
from sqlite3 import Connection
log = logging.getLogger(__name__)
def upgrade(db_conn: Connection) -> None:
cursor = db_conn.cursor()
log.info("Creating graph_neighbors table...")
cursor.execute(
"""
CREATE TABLE IF NOT EXISTS graph_neighbors (
source_symbol_id INTEGER NOT NULL REFERENCES symbols(id) ON DELETE CASCADE,
neighbor_symbol_id INTEGER NOT NULL REFERENCES symbols(id) ON DELETE CASCADE,
relationship_depth INTEGER NOT NULL,
PRIMARY KEY (source_symbol_id, neighbor_symbol_id)
)
"""
)
log.info("Creating indexes for graph_neighbors...")
cursor.execute(
"""
CREATE INDEX IF NOT EXISTS idx_graph_neighbors_source_depth
ON graph_neighbors(source_symbol_id, relationship_depth)
"""
)
cursor.execute(
"""
CREATE INDEX IF NOT EXISTS idx_graph_neighbors_neighbor
ON graph_neighbors(neighbor_symbol_id)
"""
)
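
A sketch of how search expansion would consume the cache, assuming relationship_depth of 1 means a direct edge; the helper is hypothetical:

def expand_symbols(conn, seed_ids, max_depth: int = 2) -> set[int]:
    # Collect cached neighbors within max_depth hops of any seed symbol.
    expanded = set(seed_ids)
    for seed in seed_ids:
        rows = conn.execute(
            """
            SELECT neighbor_symbol_id FROM graph_neighbors
            WHERE source_symbol_id = ? AND relationship_depth <= ?
            """,
            (seed, max_depth),
        )
        expanded.update(r[0] for r in rows)
    return expanded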

View File

@@ -0,0 +1,81 @@
"""
Migration 008: Add Merkle hash tables for content-based incremental indexing.
Adds:
- merkle_hashes: per-file SHA-256 hashes (keyed by file_id)
- merkle_state: directory-level root hash (single row, id=1)
Backfills merkle_hashes using the existing `files.content` column when available.
"""
from __future__ import annotations
import hashlib
import logging
import time
from sqlite3 import Connection
log = logging.getLogger(__name__)
def upgrade(db_conn: Connection) -> None:
cursor = db_conn.cursor()
log.info("Creating merkle_hashes table...")
cursor.execute(
"""
CREATE TABLE IF NOT EXISTS merkle_hashes (
file_id INTEGER PRIMARY KEY REFERENCES files(id) ON DELETE CASCADE,
sha256 TEXT NOT NULL,
updated_at REAL
)
"""
)
log.info("Creating merkle_state table...")
cursor.execute(
"""
CREATE TABLE IF NOT EXISTS merkle_state (
id INTEGER PRIMARY KEY CHECK (id = 1),
root_hash TEXT,
updated_at REAL
)
"""
)
# Backfill file hashes from stored content (best-effort).
try:
rows = cursor.execute("SELECT id, content FROM files").fetchall()
except Exception as exc:
log.warning("Unable to backfill merkle hashes (files table missing?): %s", exc)
return
now = time.time()
inserts: list[tuple[int, str, float]] = []
for row in rows:
file_id = int(row[0])
content = row[1]
if content is None:
continue
try:
digest = hashlib.sha256(str(content).encode("utf-8", errors="ignore")).hexdigest()
inserts.append((file_id, digest, now))
except Exception:
continue
if not inserts:
return
log.info("Backfilling %d file hashes...", len(inserts))
cursor.executemany(
"""
INSERT INTO merkle_hashes(file_id, sha256, updated_at)
VALUES(?, ?, ?)
ON CONFLICT(file_id) DO UPDATE SET
sha256=excluded.sha256,
updated_at=excluded.updated_at
""",
inserts,
)
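
The migration backfills per-file hashes but leaves merkle_state empty. One simple scheme for deriving the root, hashing the per-file digests in a stable order, is sketched below; the scheme the indexer actually uses may differ:

import hashlib
import time

def compute_root_hash(db_conn) -> str:
    cursor = db_conn.cursor()
    rows = cursor.execute(
        "SELECT sha256 FROM merkle_hashes ORDER BY file_id"
    ).fetchall()
    # Fold the per-file digests into a single root digest.
    h = hashlib.sha256()
    for (digest,) in rows:
        h.update(digest.encode("ascii"))
    root = h.hexdigest()
    cursor.execute(
        """
        INSERT INTO merkle_state(id, root_hash, updated_at) VALUES(1, ?, ?)
        ON CONFLICT(id) DO UPDATE SET
            root_hash=excluded.root_hash, updated_at=excluded.updated_at
        """,
        (root, time.time()),
    )
    return root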

View File

@@ -0,0 +1,103 @@
"""
Migration 009: Add SPLADE sparse retrieval tables.
This migration introduces SPLADE (SParse Lexical AnD Expansion) support:
- splade_metadata: Model configuration (model name, vocab size, ONNX path)
- splade_posting_list: Inverted index mapping token_id -> (chunk_id, weight)
The SPLADE tables are designed for efficient sparse vector retrieval:
- Token-based lookup for query expansion
- Chunk-based deletion for index maintenance
- Maintains backward compatibility with existing FTS tables
"""
import logging
from sqlite3 import Connection
log = logging.getLogger(__name__)
def upgrade(db_conn: Connection) -> None:
"""
Adds SPLADE tables for sparse retrieval.
Creates:
- splade_metadata: Stores model configuration and ONNX path
- splade_posting_list: Inverted index with token_id -> (chunk_id, weight) mappings
- Indexes for efficient token-based and chunk-based lookups
Args:
db_conn: The SQLite database connection.
"""
cursor = db_conn.cursor()
log.info("Creating splade_metadata table...")
cursor.execute(
"""
CREATE TABLE IF NOT EXISTS splade_metadata (
id INTEGER PRIMARY KEY DEFAULT 1,
model_name TEXT NOT NULL,
vocab_size INTEGER NOT NULL,
onnx_path TEXT,
created_at REAL
)
"""
)
log.info("Creating splade_posting_list table...")
cursor.execute(
"""
CREATE TABLE IF NOT EXISTS splade_posting_list (
token_id INTEGER NOT NULL,
chunk_id INTEGER NOT NULL,
weight REAL NOT NULL,
PRIMARY KEY (token_id, chunk_id),
FOREIGN KEY (chunk_id) REFERENCES semantic_chunks(id) ON DELETE CASCADE
)
"""
)
log.info("Creating indexes for splade_posting_list...")
# Index for efficient chunk-based lookups (deletion, updates)
cursor.execute(
"""
CREATE INDEX IF NOT EXISTS idx_splade_by_chunk
ON splade_posting_list(chunk_id)
"""
)
# Index for efficient term-based retrieval
cursor.execute(
"""
CREATE INDEX IF NOT EXISTS idx_splade_by_token
ON splade_posting_list(token_id)
"""
)
log.info("Migration 009 completed successfully")
def downgrade(db_conn: Connection) -> None:
"""
Removes SPLADE tables.
Drops:
- splade_posting_list (and associated indexes)
- splade_metadata
Args:
db_conn: The SQLite database connection.
"""
cursor = db_conn.cursor()
log.info("Dropping SPLADE indexes...")
cursor.execute("DROP INDEX IF EXISTS idx_splade_by_chunk")
cursor.execute("DROP INDEX IF EXISTS idx_splade_by_token")
log.info("Dropping splade_posting_list table...")
cursor.execute("DROP TABLE IF EXISTS splade_posting_list")
log.info("Dropping splade_metadata table...")
cursor.execute("DROP TABLE IF EXISTS splade_metadata")
log.info("Migration 009 downgrade completed successfully")

View File

@@ -0,0 +1,162 @@
"""
Migration 010: Add multi-vector storage support for cascade retrieval.
This migration introduces the chunks table with multi-vector support:
- chunks: Stores code chunks with multiple embedding types
- embedding: Original embedding for backward compatibility
- embedding_binary: 256-dim binary vector for coarse ranking (fast)
- embedding_dense: 2048-dim dense vector for fine ranking (precise)
The multi-vector architecture enables cascade retrieval:
1. First stage: Fast binary vector search for candidate retrieval
2. Second stage: Dense vector reranking for precision
"""
import logging
from sqlite3 import Connection
log = logging.getLogger(__name__)
def upgrade(db_conn: Connection) -> None:
"""
Adds chunks table with multi-vector embedding columns.
Creates:
- chunks: Table for storing code chunks with multiple embedding types
- idx_chunks_file_path: Index for efficient file-based lookups
Also migrates existing chunks tables by adding new columns if needed.
Args:
db_conn: The SQLite database connection.
"""
cursor = db_conn.cursor()
# Check if chunks table already exists
table_exists = cursor.execute(
"SELECT name FROM sqlite_master WHERE type='table' AND name='chunks'"
).fetchone()
if table_exists:
# Migrate existing table - add new columns if missing
log.info("chunks table exists, checking for missing columns...")
col_info = cursor.execute("PRAGMA table_info(chunks)").fetchall()
existing_columns = {row[1] for row in col_info}
if "embedding_binary" not in existing_columns:
log.info("Adding embedding_binary column to chunks table...")
cursor.execute(
"ALTER TABLE chunks ADD COLUMN embedding_binary BLOB"
)
if "embedding_dense" not in existing_columns:
log.info("Adding embedding_dense column to chunks table...")
cursor.execute(
"ALTER TABLE chunks ADD COLUMN embedding_dense BLOB"
)
else:
# Create new table with all columns
log.info("Creating chunks table with multi-vector support...")
cursor.execute(
"""
CREATE TABLE chunks (
id INTEGER PRIMARY KEY AUTOINCREMENT,
file_path TEXT NOT NULL,
content TEXT NOT NULL,
embedding BLOB,
embedding_binary BLOB,
embedding_dense BLOB,
metadata TEXT,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
"""
)
# Create index for file-based lookups
log.info("Creating index for chunks table...")
cursor.execute(
"""
CREATE INDEX IF NOT EXISTS idx_chunks_file_path
ON chunks(file_path)
"""
)
log.info("Migration 010 completed successfully")
def downgrade(db_conn: Connection) -> None:
"""
Removes multi-vector columns from chunks table.
Note: This does not drop the chunks table entirely to preserve data.
Only the new columns added by this migration are removed.
Args:
db_conn: The SQLite database connection.
"""
cursor = db_conn.cursor()
log.info("Removing multi-vector columns from chunks table...")
# SQLite doesn't support DROP COLUMN directly in older versions
# We need to recreate the table without the columns
# Check if chunks table exists
table_exists = cursor.execute(
"SELECT name FROM sqlite_master WHERE type='table' AND name='chunks'"
).fetchone()
if not table_exists:
log.info("chunks table does not exist, nothing to downgrade")
return
# Check if the columns exist before trying to remove them
col_info = cursor.execute("PRAGMA table_info(chunks)").fetchall()
existing_columns = {row[1] for row in col_info}
needs_migration = (
"embedding_binary" in existing_columns or
"embedding_dense" in existing_columns
)
if not needs_migration:
log.info("Multi-vector columns not present, nothing to remove")
return
# Recreate table without the new columns
log.info("Recreating chunks table without multi-vector columns...")
cursor.execute(
"""
CREATE TABLE chunks_backup (
id INTEGER PRIMARY KEY AUTOINCREMENT,
file_path TEXT NOT NULL,
content TEXT NOT NULL,
embedding BLOB,
metadata TEXT,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
"""
)
cursor.execute(
"""
INSERT INTO chunks_backup (id, file_path, content, embedding, metadata, created_at)
SELECT id, file_path, content, embedding, metadata, created_at FROM chunks
"""
)
cursor.execute("DROP TABLE chunks")
cursor.execute("ALTER TABLE chunks_backup RENAME TO chunks")
# Recreate index
cursor.execute(
"""
CREATE INDEX IF NOT EXISTS idx_chunks_file_path
ON chunks(file_path)
"""
)
log.info("Migration 010 downgrade completed successfully")