mirror of
https://github.com/catlog22/Claude-Code-Workflow.git
synced 2026-03-18 18:48:48 +08:00
Enhance search functionality and indexing pipeline

- Updated `cmd_search` to include line numbers and content in search results.
- Modified `IndexingPipeline` to handle start and end line numbers for chunks.
- Enhanced `FTSEngine` to support storing line metadata in the database.
- Improved `SearchPipeline` to return line numbers and full content in search results.
- Added unit tests for bridge, FTS delete operations, metadata store, and watcher functionality.
- Introduced a `.gitignore` file to exclude specific directories.
This commit is contained in:
@@ -129,7 +129,14 @@ def cmd_search(args: argparse.Namespace) -> None:
|
||||
|
||||
results = search.search(args.query, top_k=args.top_k)
|
||||
_json_output([
|
||||
{"path": r.path, "score": r.score, "snippet": r.snippet}
|
||||
{
|
||||
"path": r.path,
|
||||
"score": r.score,
|
||||
"line": r.line,
|
||||
"end_line": r.end_line,
|
||||
"snippet": r.snippet,
|
||||
"content": r.content,
|
||||
}
|
||||
for r in results
|
||||
])
|
||||
|
||||
|
||||
@@ -146,14 +146,16 @@ class IndexingPipeline:
|
||||
batch_ids = []
|
||||
batch_texts = []
|
||||
batch_paths = []
|
||||
for chunk_text, path in file_chunks:
|
||||
batch_lines: list[tuple[int, int]] = []
|
||||
for chunk_text, path, sl, el in file_chunks:
|
||||
batch_ids.append(chunk_id)
|
||||
batch_texts.append(chunk_text)
|
||||
batch_paths.append(path)
|
||||
batch_lines.append((sl, el))
|
||||
chunk_id += 1
|
||||
|
||||
chunks_created += len(batch_ids)
|
||||
embed_queue.put((batch_ids, batch_texts, batch_paths))
|
||||
embed_queue.put((batch_ids, batch_texts, batch_paths, batch_lines))
|
||||
|
||||
# Signal embed worker: no more data
|
||||
embed_queue.put(_SENTINEL)
|
||||
@@ -203,12 +205,12 @@ class IndexingPipeline:
|
||||
if item is _SENTINEL:
|
||||
break
|
||||
|
||||
batch_ids, batch_texts, batch_paths = item
|
||||
batch_ids, batch_texts, batch_paths, batch_lines = item
|
||||
try:
|
||||
vecs = self._embedder.embed_batch(batch_texts)
|
||||
vec_array = np.array(vecs, dtype=np.float32)
|
||||
id_array = np.array(batch_ids, dtype=np.int64)
|
||||
out_q.put((id_array, vec_array, batch_texts, batch_paths))
|
||||
out_q.put((id_array, vec_array, batch_texts, batch_paths, batch_lines))
|
||||
except Exception as exc:
|
||||
logger.error("Embed worker error: %s", exc)
|
||||
on_error(exc)
|
||||
@@ -221,19 +223,20 @@ class IndexingPipeline:
|
||||
in_q: queue.Queue,
|
||||
on_error: callable,
|
||||
) -> None:
|
||||
"""Stage 3: Pull (ids, vecs, texts, paths), write to stores."""
|
||||
"""Stage 3: Pull (ids, vecs, texts, paths, lines), write to stores."""
|
||||
while True:
|
||||
item = in_q.get()
|
||||
if item is _SENTINEL:
|
||||
break
|
||||
|
||||
id_array, vec_array, texts, paths = item
|
||||
id_array, vec_array, texts, paths, line_ranges = item
|
||||
try:
|
||||
self._binary_store.add(id_array, vec_array)
|
||||
self._ann_index.add(id_array, vec_array)
|
||||
|
||||
fts_docs = [
|
||||
(int(id_array[i]), paths[i], texts[i])
|
||||
(int(id_array[i]), paths[i], texts[i],
|
||||
line_ranges[i][0], line_ranges[i][1])
|
||||
for i in range(len(id_array))
|
||||
]
|
||||
self._fts.add_documents(fts_docs)
|
||||
@@ -251,32 +254,39 @@ class IndexingPipeline:
|
||||
path: str,
|
||||
max_chars: int,
|
||||
overlap: int,
|
||||
) -> list[tuple[str, str]]:
|
||||
) -> list[tuple[str, str, int, int]]:
|
||||
"""Split file text into overlapping chunks.
|
||||
|
||||
Returns list of (chunk_text, path) tuples.
|
||||
Returns list of (chunk_text, path, start_line, end_line) tuples.
|
||||
Line numbers are 1-based.
|
||||
"""
|
||||
if not text.strip():
|
||||
return []
|
||||
|
||||
chunks: list[tuple[str, str]] = []
|
||||
chunks: list[tuple[str, str, int, int]] = []
|
||||
lines = text.splitlines(keepends=True)
|
||||
current: list[str] = []
|
||||
current_len = 0
|
||||
chunk_start_line = 1 # 1-based
|
||||
lines_consumed = 0
|
||||
|
||||
for line in lines:
|
||||
lines_consumed += 1
|
||||
if current_len + len(line) > max_chars and current:
|
||||
chunk = "".join(current)
|
||||
chunks.append((chunk, path))
|
||||
end_line = lines_consumed - 1
|
||||
chunks.append((chunk, path, chunk_start_line, end_line))
|
||||
# overlap: keep last N characters
|
||||
tail = "".join(current)[-overlap:]
|
||||
tail = chunk[-overlap:] if overlap else ""
|
||||
tail_newlines = tail.count("\n")
|
||||
chunk_start_line = max(1, end_line - tail_newlines + 1)
|
||||
current = [tail] if tail else []
|
||||
current_len = len(tail)
|
||||
current.append(line)
|
||||
current_len += len(line)
|
||||
|
||||
if current:
|
||||
chunks.append(("".join(current), path))
|
||||
chunks.append(("".join(current), path, chunk_start_line, lines_consumed))
|
||||
|
||||
return chunks
|
||||
|
||||
@@ -370,10 +380,12 @@ class IndexingPipeline:
|
||||
batch_ids = []
|
||||
batch_texts = []
|
||||
batch_paths = []
|
||||
for i, (chunk_text, path) in enumerate(file_chunks):
|
||||
batch_lines: list[tuple[int, int]] = []
|
||||
for i, (chunk_text, path, sl, el) in enumerate(file_chunks):
|
||||
batch_ids.append(start_id + i)
|
||||
batch_texts.append(chunk_text)
|
||||
batch_paths.append(path)
|
||||
batch_lines.append((sl, el))
|
||||
|
||||
# Embed synchronously
|
||||
vecs = self._embedder.embed_batch(batch_texts)
|
||||
@@ -384,7 +396,8 @@ class IndexingPipeline:
|
||||
self._binary_store.add(id_array, vec_array)
|
||||
self._ann_index.add(id_array, vec_array)
|
||||
fts_docs = [
|
||||
(batch_ids[i], batch_paths[i], batch_texts[i])
|
||||
(batch_ids[i], batch_paths[i], batch_texts[i],
|
||||
batch_lines[i][0], batch_lines[i][1])
|
||||
for i in range(len(batch_ids))
|
||||
]
|
||||
self._fts.add_documents(fts_docs)
|
||||
|
||||
@@ -13,21 +13,50 @@ class FTSEngine:
|
||||
)
|
||||
self._conn.execute(
|
||||
"CREATE TABLE IF NOT EXISTS docs_meta "
|
||||
"(id INTEGER PRIMARY KEY, path TEXT)"
|
||||
"(id INTEGER PRIMARY KEY, path TEXT, "
|
||||
"start_line INTEGER DEFAULT 0, end_line INTEGER DEFAULT 0)"
|
||||
)
|
||||
self._conn.commit()
|
||||
self._migrate_line_columns()
|
||||
|
||||
def add_documents(self, docs: list[tuple[int, str, str]]) -> None:
|
||||
"""Add documents in batch. docs: list of (id, path, content)."""
|
||||
def _migrate_line_columns(self) -> None:
|
||||
"""Add start_line/end_line columns if missing (for pre-existing DBs)."""
|
||||
cols = {
|
||||
row[1]
|
||||
for row in self._conn.execute("PRAGMA table_info(docs_meta)").fetchall()
|
||||
}
|
||||
for col in ("start_line", "end_line"):
|
||||
if col not in cols:
|
||||
self._conn.execute(
|
||||
f"ALTER TABLE docs_meta ADD COLUMN {col} INTEGER DEFAULT 0"
|
||||
)
|
||||
self._conn.commit()
|
||||
|
||||
def add_documents(self, docs: list[tuple]) -> None:
|
||||
"""Add documents in batch.
|
||||
|
||||
docs: list of (id, path, content) or (id, path, content, start_line, end_line).
|
||||
"""
|
||||
if not docs:
|
||||
return
|
||||
meta_rows = []
|
||||
fts_rows = []
|
||||
for doc in docs:
|
||||
if len(doc) >= 5:
|
||||
doc_id, path, content, sl, el = doc[0], doc[1], doc[2], doc[3], doc[4]
|
||||
else:
|
||||
doc_id, path, content = doc[0], doc[1], doc[2]
|
||||
sl, el = 0, 0
|
||||
meta_rows.append((doc_id, path, sl, el))
|
||||
fts_rows.append((doc_id, content))
|
||||
self._conn.executemany(
|
||||
"INSERT OR REPLACE INTO docs_meta (id, path) VALUES (?, ?)",
|
||||
[(doc_id, path) for doc_id, path, content in docs],
|
||||
"INSERT OR REPLACE INTO docs_meta (id, path, start_line, end_line) "
|
||||
"VALUES (?, ?, ?, ?)",
|
||||
meta_rows,
|
||||
)
|
||||
self._conn.executemany(
|
||||
"INSERT OR REPLACE INTO docs (rowid, content) VALUES (?, ?)",
|
||||
[(doc_id, content) for doc_id, path, content in docs],
|
||||
fts_rows,
|
||||
)
|
||||
self._conn.commit()
|
||||
|
||||
@@ -92,3 +121,13 @@ class FTSEngine:
|
||||
)
|
||||
self._conn.commit()
|
||||
return len(ids)
|
||||
|
||||
def get_doc_meta(self, doc_id: int) -> tuple[str, int, int]:
|
||||
"""Return (path, start_line, end_line) for a doc_id."""
|
||||
row = self._conn.execute(
|
||||
"SELECT path, start_line, end_line FROM docs_meta WHERE id = ?",
|
||||
(doc_id,),
|
||||
).fetchone()
|
||||
if row:
|
||||
return row[0], row[1] or 0, row[2] or 0
|
||||
return "", 0, 0
|
||||
|
||||
@@ -28,6 +28,9 @@ class SearchResult:
|
||||
path: str
|
||||
score: float
|
||||
snippet: str = ""
|
||||
line: int = 0
|
||||
end_line: int = 0
|
||||
content: str = ""
|
||||
|
||||
|
||||
class SearchPipeline:
|
||||
@@ -162,15 +165,17 @@ class SearchPipeline:
|
||||
|
||||
results: list[SearchResult] = []
|
||||
for doc_id, score in ranked[:final_top_k]:
|
||||
path = self._fts._conn.execute(
|
||||
"SELECT path FROM docs_meta WHERE id = ?", (doc_id,)
|
||||
).fetchone()
|
||||
path, start_line, end_line = self._fts.get_doc_meta(doc_id)
|
||||
full_content = self._fts.get_content(doc_id)
|
||||
results.append(
|
||||
SearchResult(
|
||||
id=doc_id,
|
||||
path=path[0] if path else "",
|
||||
path=path,
|
||||
score=float(score),
|
||||
snippet=self._fts.get_content(doc_id)[:200],
|
||||
snippet=full_content[:200],
|
||||
line=start_line,
|
||||
end_line=end_line,
|
||||
content=full_content,
|
||||
)
|
||||
)
|
||||
return results
|
||||
|
||||
Reference in New Issue
Block a user