Files
Claude-Code-Workflow/codex-lens-v2/src/codexlens_search/search/fts.py
catlog22 0f02b75be1 Enhance search functionality and indexing pipeline
- Updated `cmd_search` to include line numbers and content in search results.
- Modified `IndexingPipeline` to handle start and end line numbers for chunks.
- Enhanced `FTSEngine` to support storing line metadata in the database.
- Improved `SearchPipeline` to return line numbers and full content in search results.
- Added unit tests for bridge, FTS delete operations, metadata store, and watcher functionality.
- Introduced a `.gitignore` file to exclude specific directories.
2026-03-17 14:55:27 +08:00

134 lines
4.8 KiB
Python

from __future__ import annotations
import sqlite3
from pathlib import Path
class FTSEngine:
    """Full-text index over document chunks, backed by SQLite FTS5.

    Chunk text is stored in the ``docs`` FTS5 virtual table and addressed by
    rowid. The file path and line span of each chunk are stored in the
    companion ``docs_meta`` table under the same id.
    """

    def __init__(self, db_path: str | Path) -> None:
        """Open (or create) the database at *db_path* and ensure the schema.

        Args:
            db_path: Location of the SQLite database file.
        """
        self._conn = sqlite3.connect(str(db_path), check_same_thread=False)
        self._conn.execute(
            "CREATE VIRTUAL TABLE IF NOT EXISTS docs "
            "USING fts5(content, tokenize='porter unicode61')"
        )
        self._conn.execute(
            "CREATE TABLE IF NOT EXISTS docs_meta "
            "(id INTEGER PRIMARY KEY, path TEXT, "
            "start_line INTEGER DEFAULT 0, end_line INTEGER DEFAULT 0)"
        )
        self._conn.commit()
        self._migrate_line_columns()

    def __enter__(self) -> "FTSEngine":
        """Return the engine itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        """Close the connection when leaving a ``with`` block."""
        self.close()

    def close(self) -> None:
        """Close the underlying SQLite connection."""
        self._conn.close()

    def _migrate_line_columns(self) -> None:
        """Add start_line/end_line columns if missing (for pre-existing DBs)."""
        # PRAGMA table_info yields one row per column; index 1 is the name.
        existing = {
            row[1]
            for row in self._conn.execute("PRAGMA table_info(docs_meta)").fetchall()
        }
        for col in ("start_line", "end_line"):
            if col not in existing:
                self._conn.execute(
                    f"ALTER TABLE docs_meta ADD COLUMN {col} INTEGER DEFAULT 0"
                )
        self._conn.commit()

    def add_documents(self, docs: list[tuple]) -> None:
        """Add documents in batch, replacing any rows that share an id.

        Args:
            docs: Tuples of ``(id, path, content)`` or
                ``(id, path, content, start_line, end_line)``. Tuples with
                fewer than five items get a line span of ``(0, 0)``.
        """
        if not docs:
            return
        meta_rows = []
        fts_rows = []
        for doc in docs:
            doc_id, path, content = doc[0], doc[1], doc[2]
            if len(doc) >= 5:
                start_line, end_line = doc[3], doc[4]
            else:
                start_line, end_line = 0, 0
            meta_rows.append((doc_id, path, start_line, end_line))
            fts_rows.append((doc_id, content))
        # One transaction for both tables: the connection context manager
        # commits on success and rolls back on error, so docs and docs_meta
        # cannot be left out of sync by a failure halfway through.
        with self._conn:
            self._conn.executemany(
                "INSERT OR REPLACE INTO docs_meta (id, path, start_line, end_line) "
                "VALUES (?, ?, ?, ?)",
                meta_rows,
            )
            self._conn.executemany(
                "INSERT OR REPLACE INTO docs (rowid, content) VALUES (?, ?)",
                fts_rows,
            )

    def exact_search(self, query: str, top_k: int = 50) -> list[tuple[int, float]]:
        """Run an FTS5 MATCH query.

        Args:
            query: Text passed to FTS5 unchanged, so FTS5 query syntax applies.
            top_k: Maximum number of rows to return.

        Returns:
            ``(id, score)`` pairs, best match first, where a higher score is
            better. A query that SQLite rejects with ``OperationalError``
            yields an empty list.
        """
        try:
            rows = self._conn.execute(
                "SELECT rowid, bm25(docs) AS score FROM docs "
                "WHERE docs MATCH ? ORDER BY score LIMIT ?",
                (query, top_k),
            ).fetchall()
        except sqlite3.OperationalError:
            return []
        # bm25 in SQLite FTS5 returns negative values (lower = better match)
        # Negate so higher is better
        return [(int(row[0]), -float(row[1])) for row in rows]

    def fuzzy_search(self, query: str, top_k: int = 50) -> list[tuple[int, float]]:
        """Prefix search: every whitespace-separated token is matched as a prefix.

        Args:
            query: Free text; split on whitespace into tokens.
            top_k: Maximum number of rows to return.

        Returns:
            ``(id, score)`` pairs, best match first, where a higher score is
            better. An empty or whitespace-only query, or a query SQLite
            rejects with ``OperationalError``, yields an empty list.
        """
        tokens = query.split()
        if not tokens:
            return []
        # Quote each token as an FTS5 string (embedded quotes doubled) before
        # adding the prefix marker. Unquoted, characters such as '.', '-', ':'
        # or '(' are parsed as query syntax and make the whole search fail.
        prefix_query = " ".join(
            '"' + token.replace('"', '""') + '"*' for token in tokens
        )
        try:
            rows = self._conn.execute(
                "SELECT rowid, bm25(docs) AS score FROM docs "
                "WHERE docs MATCH ? ORDER BY score LIMIT ?",
                (prefix_query, top_k),
            ).fetchall()
        except sqlite3.OperationalError:
            return []
        # Same sign flip as exact_search: raw bm25 is lower-is-better.
        return [(int(row[0]), -float(row[1])) for row in rows]

    def get_content(self, doc_id: int) -> str:
        """Return the stored content for *doc_id*, or ``""`` if it is unknown."""
        row = self._conn.execute(
            "SELECT content FROM docs WHERE rowid = ?", (doc_id,)
        ).fetchone()
        return row[0] if row else ""

    def get_chunk_ids_by_path(self, path: str) -> list[int]:
        """Return all doc IDs associated with a given file path."""
        rows = self._conn.execute(
            "SELECT id FROM docs_meta WHERE path = ?", (path,)
        ).fetchall()
        return [row[0] for row in rows]

    def delete_by_path(self, path: str) -> int:
        """Delete all docs and docs_meta rows for a given file path.

        Returns:
            The number of deleted documents (``0`` when the path is unknown).
        """
        ids = self.get_chunk_ids_by_path(path)
        if not ids:
            return 0
        # Delete one id per statement instead of a single IN (?, ?, ...) list,
        # so the number of chunks is not capped by SQLite's limit on bound
        # parameters per statement.
        id_params = [(doc_id,) for doc_id in ids]
        # Single transaction: both tables change together or not at all.
        with self._conn:
            self._conn.executemany("DELETE FROM docs WHERE rowid = ?", id_params)
            self._conn.executemany("DELETE FROM docs_meta WHERE id = ?", id_params)
        return len(ids)

    def get_doc_meta(self, doc_id: int) -> tuple[str, int, int]:
        """Return (path, start_line, end_line) for a doc_id.

        An unknown id yields ``("", 0, 0)``; NULL line values are reported as 0.
        """
        row = self._conn.execute(
            "SELECT path, start_line, end_line FROM docs_meta WHERE id = ?",
            (doc_id,),
        ).fetchone()
        if row:
            return row[0], row[1] or 0, row[2] or 0
        return "", 0, 0