Enhance search functionality and indexing pipeline

- Updated `cmd_search` to include line numbers and content in search results.
- Modified `IndexingPipeline` to handle start and end line numbers for chunks.
- Enhanced `FTSEngine` to support storing line metadata in the database.
- Improved `SearchPipeline` to return line numbers and full content in search results.
- Added unit tests for bridge, FTS delete operations, metadata store, and watcher functionality.
- Introduced a `.gitignore` file to exclude specific directories.
2026-03-18 18:48:48 +08:00 · 2026-03-17 14:55:27 +08:00
parent bfe5426b7e
commit 0f02b75be1
25 changed files with 2014 additions and 1482 deletions
--- a/codex-lens-v2/src/codexlens_search/bridge.py
+++ b/codex-lens-v2/src/codexlens_search/bridge.py
@@ -129,7 +129,14 @@ def cmd_search(args: argparse.Namespace) -> None:

    results = search.search(args.query, top_k=args.top_k)
    _json_output([
-        {"path": r.path, "score": r.score, "snippet": r.snippet}
+        {
+            "path": r.path,
+            "score": r.score,
+            "line": r.line,
+            "end_line": r.end_line,
+            "snippet": r.snippet,
+            "content": r.content,
+        }
        for r in results
    ])

--- a/codex-lens-v2/src/codexlens_search/indexing/pipeline.py
+++ b/codex-lens-v2/src/codexlens_search/indexing/pipeline.py
@@ -146,14 +146,16 @@ class IndexingPipeline:
            batch_ids = []
            batch_texts = []
            batch_paths = []
-            for chunk_text, path in file_chunks:
+            batch_lines: list[tuple[int, int]] = []
+            for chunk_text, path, sl, el in file_chunks:
                batch_ids.append(chunk_id)
                batch_texts.append(chunk_text)
                batch_paths.append(path)
+                batch_lines.append((sl, el))
                chunk_id += 1

            chunks_created += len(batch_ids)
-            embed_queue.put((batch_ids, batch_texts, batch_paths))
+            embed_queue.put((batch_ids, batch_texts, batch_paths, batch_lines))

        # Signal embed worker: no more data
        embed_queue.put(_SENTINEL)
@@ -203,12 +205,12 @@ class IndexingPipeline:
                if item is _SENTINEL:
                    break

-                batch_ids, batch_texts, batch_paths = item
+                batch_ids, batch_texts, batch_paths, batch_lines = item
                try:
                    vecs = self._embedder.embed_batch(batch_texts)
                    vec_array = np.array(vecs, dtype=np.float32)
                    id_array = np.array(batch_ids, dtype=np.int64)
-                    out_q.put((id_array, vec_array, batch_texts, batch_paths))
+                    out_q.put((id_array, vec_array, batch_texts, batch_paths, batch_lines))
                except Exception as exc:
                    logger.error("Embed worker error: %s", exc)
                    on_error(exc)
@@ -221,19 +223,20 @@ class IndexingPipeline:
        in_q: queue.Queue,
        on_error: callable,
    ) -> None:
-        """Stage 3: Pull (ids, vecs, texts, paths), write to stores."""
+        """Stage 3: Pull (ids, vecs, texts, paths, lines), write to stores."""
        while True:
            item = in_q.get()
            if item is _SENTINEL:
                break

-            id_array, vec_array, texts, paths = item
+            id_array, vec_array, texts, paths, line_ranges = item
            try:
                self._binary_store.add(id_array, vec_array)
                self._ann_index.add(id_array, vec_array)

                fts_docs = [
-                    (int(id_array[i]), paths[i], texts[i])
+                    (int(id_array[i]), paths[i], texts[i],
+                     line_ranges[i][0], line_ranges[i][1])
                    for i in range(len(id_array))
                ]
                self._fts.add_documents(fts_docs)
@@ -251,32 +254,39 @@ class IndexingPipeline:
        path: str,
        max_chars: int,
        overlap: int,
-    ) -> list[tuple[str, str]]:
+    ) -> list[tuple[str, str, int, int]]:
        """Split file text into overlapping chunks.

-        Returns list of (chunk_text, path) tuples.
+        Returns list of (chunk_text, path, start_line, end_line) tuples.
+        Line numbers are 1-based.
        """
        if not text.strip():
            return []

-        chunks: list[tuple[str, str]] = []
+        chunks: list[tuple[str, str, int, int]] = []
        lines = text.splitlines(keepends=True)
        current: list[str] = []
        current_len = 0
+        chunk_start_line = 1  # 1-based
+        lines_consumed = 0

        for line in lines:
+            lines_consumed += 1
            if current_len + len(line) > max_chars and current:
                chunk = "".join(current)
-                chunks.append((chunk, path))
+                end_line = lines_consumed - 1
+                chunks.append((chunk, path, chunk_start_line, end_line))
                # overlap: keep last N characters
-                tail = "".join(current)[-overlap:]
+                tail = chunk[-overlap:] if overlap else ""
+                tail_newlines = tail.count("\n")
+                chunk_start_line = max(1, end_line - tail_newlines + 1)
                current = [tail] if tail else []
                current_len = len(tail)
            current.append(line)
            current_len += len(line)

        if current:
-            chunks.append(("".join(current), path))
+            chunks.append(("".join(current), path, chunk_start_line, lines_consumed))

        return chunks

@@ -370,10 +380,12 @@ class IndexingPipeline:
        batch_ids = []
        batch_texts = []
        batch_paths = []
-        for i, (chunk_text, path) in enumerate(file_chunks):
+        batch_lines: list[tuple[int, int]] = []
+        for i, (chunk_text, path, sl, el) in enumerate(file_chunks):
            batch_ids.append(start_id + i)
            batch_texts.append(chunk_text)
            batch_paths.append(path)
+            batch_lines.append((sl, el))

        # Embed synchronously
        vecs = self._embedder.embed_batch(batch_texts)
@@ -384,7 +396,8 @@ class IndexingPipeline:
        self._binary_store.add(id_array, vec_array)
        self._ann_index.add(id_array, vec_array)
        fts_docs = [
-            (batch_ids[i], batch_paths[i], batch_texts[i])
+            (batch_ids[i], batch_paths[i], batch_texts[i],
+             batch_lines[i][0], batch_lines[i][1])
            for i in range(len(batch_ids))
        ]
        self._fts.add_documents(fts_docs)
--- a/codex-lens-v2/src/codexlens_search/search/fts.py
+++ b/codex-lens-v2/src/codexlens_search/search/fts.py
@@ -13,21 +13,50 @@ class FTSEngine:
        )
        self._conn.execute(
            "CREATE TABLE IF NOT EXISTS docs_meta "
-            "(id INTEGER PRIMARY KEY, path TEXT)"
+            "(id INTEGER PRIMARY KEY, path TEXT, "
+            "start_line INTEGER DEFAULT 0, end_line INTEGER DEFAULT 0)"
        )
        self._conn.commit()
+        self._migrate_line_columns()

-    def add_documents(self, docs: list[tuple[int, str, str]]) -> None:
-        """Add documents in batch. docs: list of (id, path, content)."""
+    def _migrate_line_columns(self) -> None:
+        """Add start_line/end_line columns if missing (for pre-existing DBs)."""
+        cols = {
+            row[1]
+            for row in self._conn.execute("PRAGMA table_info(docs_meta)").fetchall()
+        }
+        for col in ("start_line", "end_line"):
+            if col not in cols:
+                self._conn.execute(
+                    f"ALTER TABLE docs_meta ADD COLUMN {col} INTEGER DEFAULT 0"
+                )
+        self._conn.commit()
+
+    def add_documents(self, docs: list[tuple]) -> None:
+        """Add documents in batch.
+
+        docs: list of (id, path, content) or (id, path, content, start_line, end_line).
+        """
        if not docs:
            return
+        meta_rows = []
+        fts_rows = []
+        for doc in docs:
+            if len(doc) >= 5:
+                doc_id, path, content, sl, el = doc[0], doc[1], doc[2], doc[3], doc[4]
+            else:
+                doc_id, path, content = doc[0], doc[1], doc[2]
+                sl, el = 0, 0
+            meta_rows.append((doc_id, path, sl, el))
+            fts_rows.append((doc_id, content))
        self._conn.executemany(
-            "INSERT OR REPLACE INTO docs_meta (id, path) VALUES (?, ?)",
-            [(doc_id, path) for doc_id, path, content in docs],
+            "INSERT OR REPLACE INTO docs_meta (id, path, start_line, end_line) "
+            "VALUES (?, ?, ?, ?)",
+            meta_rows,
        )
        self._conn.executemany(
            "INSERT OR REPLACE INTO docs (rowid, content) VALUES (?, ?)",
-            [(doc_id, content) for doc_id, path, content in docs],
+            fts_rows,
        )
        self._conn.commit()

@@ -92,3 +121,13 @@ class FTSEngine:
        )
        self._conn.commit()
        return len(ids)
+
+    def get_doc_meta(self, doc_id: int) -> tuple[str, int, int]:
+        """Return (path, start_line, end_line) for a doc_id."""
+        row = self._conn.execute(
+            "SELECT path, start_line, end_line FROM docs_meta WHERE id = ?",
+            (doc_id,),
+        ).fetchone()
+        if row:
+            return row[0], row[1] or 0, row[2] or 0
+        return "", 0, 0
--- a/codex-lens-v2/src/codexlens_search/search/pipeline.py
+++ b/codex-lens-v2/src/codexlens_search/search/pipeline.py
@@ -28,6 +28,9 @@ class SearchResult:
    path: str
    score: float
    snippet: str = ""
+    line: int = 0
+    end_line: int = 0
+    content: str = ""


 class SearchPipeline:
@@ -162,15 +165,17 @@ class SearchPipeline:

        results: list[SearchResult] = []
        for doc_id, score in ranked[:final_top_k]:
-            path = self._fts._conn.execute(
-                "SELECT path FROM docs_meta WHERE id = ?", (doc_id,)
-            ).fetchone()
+            path, start_line, end_line = self._fts.get_doc_meta(doc_id)
+            full_content = self._fts.get_content(doc_id)
            results.append(
                SearchResult(
                    id=doc_id,
-                    path=path[0] if path else "",
+                    path=path,
                    score=float(score),
-                    snippet=self._fts.get_content(doc_id)[:200],
+                    snippet=full_content[:200],
+                    line=start_line,
+                    end_line=end_line,
+                    content=full_content,
                )
            )
        return results