Enhance search functionality and indexing pipeline

- Updated `cmd_search` to include line numbers and content in search results.
- Modified `IndexingPipeline` to handle start and end line numbers for chunks.
- Enhanced `FTSEngine` to support storing line metadata in the database.
- Improved `SearchPipeline` to return line numbers and full content in search results.
- Added unit tests for bridge, FTS delete operations, metadata store, and watcher functionality.
- Introduced a `.gitignore` file to exclude specific directories.
This commit is contained in:
catlog22
2026-03-17 14:55:27 +08:00
parent bfe5426b7e
commit 0f02b75be1
25 changed files with 2014 additions and 1482 deletions

View File

@@ -129,7 +129,14 @@ def cmd_search(args: argparse.Namespace) -> None:
results = search.search(args.query, top_k=args.top_k)
_json_output([
{"path": r.path, "score": r.score, "snippet": r.snippet}
{
"path": r.path,
"score": r.score,
"line": r.line,
"end_line": r.end_line,
"snippet": r.snippet,
"content": r.content,
}
for r in results
])

View File

@@ -146,14 +146,16 @@ class IndexingPipeline:
batch_ids = []
batch_texts = []
batch_paths = []
for chunk_text, path in file_chunks:
batch_lines: list[tuple[int, int]] = []
for chunk_text, path, sl, el in file_chunks:
batch_ids.append(chunk_id)
batch_texts.append(chunk_text)
batch_paths.append(path)
batch_lines.append((sl, el))
chunk_id += 1
chunks_created += len(batch_ids)
embed_queue.put((batch_ids, batch_texts, batch_paths))
embed_queue.put((batch_ids, batch_texts, batch_paths, batch_lines))
# Signal embed worker: no more data
embed_queue.put(_SENTINEL)
@@ -203,12 +205,12 @@ class IndexingPipeline:
if item is _SENTINEL:
break
batch_ids, batch_texts, batch_paths = item
batch_ids, batch_texts, batch_paths, batch_lines = item
try:
vecs = self._embedder.embed_batch(batch_texts)
vec_array = np.array(vecs, dtype=np.float32)
id_array = np.array(batch_ids, dtype=np.int64)
out_q.put((id_array, vec_array, batch_texts, batch_paths))
out_q.put((id_array, vec_array, batch_texts, batch_paths, batch_lines))
except Exception as exc:
logger.error("Embed worker error: %s", exc)
on_error(exc)
@@ -221,19 +223,20 @@ class IndexingPipeline:
in_q: queue.Queue,
on_error: callable,
) -> None:
"""Stage 3: Pull (ids, vecs, texts, paths), write to stores."""
"""Stage 3: Pull (ids, vecs, texts, paths, lines), write to stores."""
while True:
item = in_q.get()
if item is _SENTINEL:
break
id_array, vec_array, texts, paths = item
id_array, vec_array, texts, paths, line_ranges = item
try:
self._binary_store.add(id_array, vec_array)
self._ann_index.add(id_array, vec_array)
fts_docs = [
(int(id_array[i]), paths[i], texts[i])
(int(id_array[i]), paths[i], texts[i],
line_ranges[i][0], line_ranges[i][1])
for i in range(len(id_array))
]
self._fts.add_documents(fts_docs)
@@ -251,32 +254,39 @@ class IndexingPipeline:
path: str,
max_chars: int,
overlap: int,
) -> list[tuple[str, str]]:
) -> list[tuple[str, str, int, int]]:
"""Split file text into overlapping chunks.
Returns list of (chunk_text, path) tuples.
Returns list of (chunk_text, path, start_line, end_line) tuples.
Line numbers are 1-based.
"""
if not text.strip():
return []
chunks: list[tuple[str, str]] = []
chunks: list[tuple[str, str, int, int]] = []
lines = text.splitlines(keepends=True)
current: list[str] = []
current_len = 0
chunk_start_line = 1 # 1-based
lines_consumed = 0
for line in lines:
lines_consumed += 1
if current_len + len(line) > max_chars and current:
chunk = "".join(current)
chunks.append((chunk, path))
end_line = lines_consumed - 1
chunks.append((chunk, path, chunk_start_line, end_line))
# overlap: keep last N characters
tail = "".join(current)[-overlap:]
tail = chunk[-overlap:] if overlap else ""
tail_newlines = tail.count("\n")
chunk_start_line = max(1, end_line - tail_newlines + 1)
current = [tail] if tail else []
current_len = len(tail)
current.append(line)
current_len += len(line)
if current:
chunks.append(("".join(current), path))
chunks.append(("".join(current), path, chunk_start_line, lines_consumed))
return chunks
@@ -370,10 +380,12 @@ class IndexingPipeline:
batch_ids = []
batch_texts = []
batch_paths = []
for i, (chunk_text, path) in enumerate(file_chunks):
batch_lines: list[tuple[int, int]] = []
for i, (chunk_text, path, sl, el) in enumerate(file_chunks):
batch_ids.append(start_id + i)
batch_texts.append(chunk_text)
batch_paths.append(path)
batch_lines.append((sl, el))
# Embed synchronously
vecs = self._embedder.embed_batch(batch_texts)
@@ -384,7 +396,8 @@ class IndexingPipeline:
self._binary_store.add(id_array, vec_array)
self._ann_index.add(id_array, vec_array)
fts_docs = [
(batch_ids[i], batch_paths[i], batch_texts[i])
(batch_ids[i], batch_paths[i], batch_texts[i],
batch_lines[i][0], batch_lines[i][1])
for i in range(len(batch_ids))
]
self._fts.add_documents(fts_docs)

View File

@@ -13,21 +13,50 @@ class FTSEngine:
)
self._conn.execute(
"CREATE TABLE IF NOT EXISTS docs_meta "
"(id INTEGER PRIMARY KEY, path TEXT)"
"(id INTEGER PRIMARY KEY, path TEXT, "
"start_line INTEGER DEFAULT 0, end_line INTEGER DEFAULT 0)"
)
self._conn.commit()
self._migrate_line_columns()
def add_documents(self, docs: list[tuple[int, str, str]]) -> None:
"""Add documents in batch. docs: list of (id, path, content)."""
def _migrate_line_columns(self) -> None:
    """Ensure docs_meta has start_line and end_line columns.

    Databases created before line metadata existed lack these columns;
    each missing one is added as INTEGER DEFAULT 0. Columns that are
    already present are left untouched, so repeated calls are harmless.
    """
    table_info = self._conn.execute("PRAGMA table_info(docs_meta)").fetchall()
    # Index 1 of each table_info row is read as the column name.
    existing = {info[1] for info in table_info}
    for col in ("start_line", "end_line"):
        if col in existing:
            continue
        self._conn.execute(
            f"ALTER TABLE docs_meta ADD COLUMN {col} INTEGER DEFAULT 0"
        )
    # Commit unconditionally, even when nothing had to be added.
    self._conn.commit()
def add_documents(self, docs: list[tuple]) -> None:
    """Add documents in batch.

    docs: list of (id, path, content) or
    (id, path, content, start_line, end_line). Tuples with fewer than
    five elements are stored with start_line and end_line set to 0.

    Rows are written to docs_meta and docs with INSERT OR REPLACE, so an
    existing id is overwritten. Both writes happen in one transaction:
    if either statement raises, neither table keeps the new rows.
    """
    if not docs:
        return
    meta_rows = []
    fts_rows = []
    for doc in docs:
        doc_id, path, content = doc[0], doc[1], doc[2]
        if len(doc) >= 5:
            sl, el = doc[3], doc[4]
        else:
            # Legacy 3-tuples carry no line information.
            sl, el = 0, 0
        meta_rows.append((doc_id, path, sl, el))
        fts_rows.append((doc_id, content))
    # The connection context manager commits on success and rolls back on
    # an exception, so a failure in the second insert cannot leave
    # docs_meta rows pending without their matching docs rows.
    with self._conn:
        self._conn.executemany(
            "INSERT OR REPLACE INTO docs_meta (id, path, start_line, end_line) "
            "VALUES (?, ?, ?, ?)",
            meta_rows,
        )
        self._conn.executemany(
            "INSERT OR REPLACE INTO docs (rowid, content) VALUES (?, ?)",
            fts_rows,
        )
@@ -92,3 +121,13 @@ class FTSEngine:
)
self._conn.commit()
return len(ids)
def get_doc_meta(self, doc_id: int) -> tuple[str, int, int]:
    """Return (path, start_line, end_line) for a doc_id.

    Returns ("", 0, 0) when docs_meta has no row for doc_id. NULL values
    in any of the three columns are normalised ("" for path, 0 for the
    line numbers) so the result always matches the declared types.
    """
    row = self._conn.execute(
        "SELECT path, start_line, end_line FROM docs_meta WHERE id = ?",
        (doc_id,),
    ).fetchone()
    if row is None:
        return "", 0, 0
    path, start_line, end_line = row
    # The columns have no NOT NULL constraint in the CREATE TABLE, and rows
    # migrated from older databases may hold NULLs; coerce each to its
    # empty value.
    return path or "", start_line or 0, end_line or 0

View File

@@ -28,6 +28,9 @@ class SearchResult:
path: str
score: float
snippet: str = ""
line: int = 0
end_line: int = 0
content: str = ""
class SearchPipeline:
@@ -162,15 +165,17 @@ class SearchPipeline:
results: list[SearchResult] = []
for doc_id, score in ranked[:final_top_k]:
path = self._fts._conn.execute(
"SELECT path FROM docs_meta WHERE id = ?", (doc_id,)
).fetchone()
path, start_line, end_line = self._fts.get_doc_meta(doc_id)
full_content = self._fts.get_content(doc_id)
results.append(
SearchResult(
id=doc_id,
path=path[0] if path else "",
path=path,
score=float(score),
snippet=self._fts.get_content(doc_id)[:200],
snippet=full_content[:200],
line=start_line,
end_line=end_line,
content=full_content,
)
)
return results