mirror of
https://github.com/catlog22/Claude-Code-Workflow.git
synced 2026-03-21 19:08:17 +08:00
- Updated `cmd_search` to include line numbers and content in search results.
- Modified `IndexingPipeline` to handle start and end line numbers for chunks.
- Enhanced `FTSEngine` to support storing line metadata in the database.
- Improved `SearchPipeline` to return line numbers and full content in search results.
- Added unit tests for bridge, FTS delete operations, metadata store, and watcher functionality.
- Introduced a `.gitignore` file to exclude specific directories.
134 lines
4.8 KiB
Python
134 lines
4.8 KiB
Python
from __future__ import annotations
|
|
|
|
import sqlite3
|
|
from pathlib import Path
|
|
|
|
|
|
class FTSEngine:
    """SQLite FTS5 full-text index with per-document path/line metadata.

    Two tables are kept side by side, joined by document id:

    * ``docs``      -- FTS5 virtual table holding the searchable content
                       (porter-stemmed, unicode61-tokenized).
    * ``docs_meta`` -- plain table mapping the same id to the source file
                       path and the chunk's start/end line numbers.
    """

    def __init__(self, db_path: str | Path) -> None:
        """Open (or create) the index database at *db_path*.

        ``check_same_thread=False`` lets the connection be shared across
        threads; callers are responsible for serializing writes.
        """
        self._conn = sqlite3.connect(str(db_path), check_same_thread=False)
        self._conn.execute(
            "CREATE VIRTUAL TABLE IF NOT EXISTS docs "
            "USING fts5(content, tokenize='porter unicode61')"
        )
        self._conn.execute(
            "CREATE TABLE IF NOT EXISTS docs_meta "
            "(id INTEGER PRIMARY KEY, path TEXT, "
            "start_line INTEGER DEFAULT 0, end_line INTEGER DEFAULT 0)"
        )
        self._conn.commit()
        self._migrate_line_columns()

    def close(self) -> None:
        """Release the underlying database connection.

        Safe to call multiple times; further engine operations after
        closing raise sqlite3.ProgrammingError.
        """
        self._conn.close()

    def __enter__(self) -> "FTSEngine":
        """Allow ``with FTSEngine(path) as eng:`` usage."""
        return self

    def __exit__(self, *exc_info) -> None:
        """Close the connection on context-manager exit."""
        self.close()

    def _migrate_line_columns(self) -> None:
        """Add start_line/end_line columns if missing (for pre-existing DBs)."""
        cols = {
            row[1]  # index 1 of PRAGMA table_info rows is the column name
            for row in self._conn.execute("PRAGMA table_info(docs_meta)").fetchall()
        }
        for col in ("start_line", "end_line"):
            if col not in cols:
                self._conn.execute(
                    f"ALTER TABLE docs_meta ADD COLUMN {col} INTEGER DEFAULT 0"
                )
        self._conn.commit()

    def add_documents(self, docs: list[tuple]) -> None:
        """Add documents in batch (INSERT OR REPLACE, so re-adding updates).

        docs: list of (id, path, content) or (id, path, content, start_line, end_line).
        Rows without line numbers default both to 0.
        """
        if not docs:
            return
        meta_rows: list[tuple] = []
        fts_rows: list[tuple] = []
        for doc in docs:
            if len(doc) >= 5:
                doc_id, path, content, sl, el = doc[:5]
            else:
                doc_id, path, content = doc[:3]
                sl, el = 0, 0
            meta_rows.append((doc_id, path, sl, el))
            fts_rows.append((doc_id, content))
        self._conn.executemany(
            "INSERT OR REPLACE INTO docs_meta (id, path, start_line, end_line) "
            "VALUES (?, ?, ?, ?)",
            meta_rows,
        )
        self._conn.executemany(
            "INSERT OR REPLACE INTO docs (rowid, content) VALUES (?, ?)",
            fts_rows,
        )
        self._conn.commit()

    def exact_search(self, query: str, top_k: int = 50) -> list[tuple[int, float]]:
        """FTS5 MATCH query, return (id, bm25_score) sorted by score descending.

        *query* is passed verbatim to FTS5, so full MATCH syntax is
        available (AND/OR/NEAR, quoted phrases). A syntactically invalid
        query yields an empty result instead of raising.
        """
        try:
            rows = self._conn.execute(
                "SELECT rowid, bm25(docs) AS score FROM docs "
                "WHERE docs MATCH ? ORDER BY score LIMIT ?",
                (query, top_k),
            ).fetchall()
        except sqlite3.OperationalError:
            # Malformed MATCH expression -- treat as "no results".
            return []
        # bm25 in SQLite FTS5 returns negative values (lower = better match);
        # negate so higher is better.
        return [(int(row[0]), -float(row[1])) for row in rows]

    def fuzzy_search(self, query: str, top_k: int = 50) -> list[tuple[int, float]]:
        """Prefix search: each token + '*', return (id, score) sorted descending.

        Each token is double-quoted before the ``*`` so that characters
        special to the FTS5 query grammar (e.g. '-', '.', '(') do not abort
        the whole search with a syntax error; for plain alphanumeric tokens
        the quoted form matches exactly the same documents as before.
        """
        tokens = query.strip().split()
        if not tokens:
            return []
        # FTS5 string quoting: embedded double quotes are escaped by doubling.
        prefix_query = " ".join(
            '"{}"*'.format(t.replace('"', '""')) for t in tokens
        )
        try:
            rows = self._conn.execute(
                "SELECT rowid, bm25(docs) AS score FROM docs "
                "WHERE docs MATCH ? ORDER BY score LIMIT ?",
                (prefix_query, top_k),
            ).fetchall()
        except sqlite3.OperationalError:
            return []
        return [(int(row[0]), -float(row[1])) for row in rows]

    def get_content(self, doc_id: int) -> str:
        """Return the stored content for *doc_id*, or '' if unknown."""
        row = self._conn.execute(
            "SELECT content FROM docs WHERE rowid = ?", (doc_id,)
        ).fetchone()
        return row[0] if row else ""

    def get_chunk_ids_by_path(self, path: str) -> list[int]:
        """Return all doc IDs associated with a given file path."""
        rows = self._conn.execute(
            "SELECT id FROM docs_meta WHERE path = ?", (path,)
        ).fetchall()
        return [r[0] for r in rows]

    def delete_by_path(self, path: str) -> int:
        """Delete all docs and docs_meta rows for a given file path.

        Returns the number of deleted documents (0 if the path is unknown).
        """
        ids = self.get_chunk_ids_by_path(path)
        if not ids:
            return 0
        placeholders = ",".join("?" for _ in ids)
        self._conn.execute(
            f"DELETE FROM docs WHERE rowid IN ({placeholders})", ids
        )
        self._conn.execute(
            f"DELETE FROM docs_meta WHERE id IN ({placeholders})", ids
        )
        self._conn.commit()
        return len(ids)

    def get_doc_meta(self, doc_id: int) -> tuple[str, int, int]:
        """Return (path, start_line, end_line) for a doc_id; ('', 0, 0) if unknown."""
        row = self._conn.execute(
            "SELECT path, start_line, end_line FROM docs_meta WHERE id = ?",
            (doc_id,),
        ).fetchone()
        if row:
            # Coerce NULL line values (rows migrated from older DBs) to 0.
            return row[0], row[1] or 0, row[2] or 0
        return "", 0, 0