"""Three-stage parallel indexing pipeline: chunk -> embed -> index.

Uses threading.Thread with queue.Queue for producer-consumer handoff.
The GIL is acceptable because embedding (onnxruntime) releases it in C extensions.
"""
from __future__ import annotations

import hashlib
import logging
import queue
import threading
import time
from collections.abc import Callable
from dataclasses import dataclass
from pathlib import Path

import numpy as np

from codexlens_search.config import Config
from codexlens_search.core.binary import BinaryStore
from codexlens_search.core.index import ANNIndex
from codexlens_search.embed.base import BaseEmbedder
from codexlens_search.indexing.metadata import MetadataStore
from codexlens_search.search.fts import FTSEngine

logger = logging.getLogger(__name__)

# Sentinel value to signal worker shutdown
_SENTINEL = None

# Defaults for chunking (can be overridden via index_files kwargs)
_DEFAULT_MAX_CHUNK_CHARS = 800
_DEFAULT_CHUNK_OVERLAP = 100


@dataclass
class IndexStats:
    """Statistics returned after indexing completes."""

    files_processed: int = 0
    chunks_created: int = 0
    duration_seconds: float = 0.0


class IndexingPipeline:
    """Parallel 3-stage indexing pipeline with queue-based handoff.

    Stage 1 (main thread): Read files, chunk text, push to embed_queue.
    Stage 2 (embed worker): Pull text batches, call embed_batch(),
        push vectors to index_queue.
    Stage 3 (index worker): Pull vectors+ids, call BinaryStore.add(),
        ANNIndex.add(), FTS.add_documents().

    After all stages complete, save() is called on BinaryStore and ANNIndex
    exactly once.
    """

    def __init__(
        self,
        embedder: BaseEmbedder,
        binary_store: BinaryStore,
        ann_index: ANNIndex,
        fts: FTSEngine,
        config: Config,
        metadata: MetadataStore | None = None,
    ) -> None:
        """Store the collaborators; no I/O or thread creation happens here.

        Args:
            embedder: Provides embed_batch(texts) for stage 2.
            binary_store: Vector store written by stage 3.
            ann_index: ANN index written by stage 3.
            fts: Full-text engine written by stage 3.
            config: Pipeline configuration (stored, not read in this module).
            metadata: Optional store; required only by the incremental API
                (index_file, remove_file, sync, compact).
        """
        self._embedder = embedder
        self._binary_store = binary_store
        self._ann_index = ann_index
        self._fts = fts
        self._config = config
        self._metadata = metadata

    # ------------------------------------------------------------------
    # Shared helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _rel_path(fpath: Path, root: Path | None) -> str:
        """Return the identifier used for *fpath* in the index.

        Raises:
            ValueError: If *root* is given and *fpath* is not located under it.
        """
        return str(fpath.relative_to(root)) if root else str(fpath)

    def _write_batch(
        self,
        id_array: np.ndarray,
        vec_array: np.ndarray,
        texts: list[str],
        paths: list[str],
        line_ranges: list[tuple[int, int]],
    ) -> None:
        """Write one embedded batch to BinaryStore, ANNIndex and FTS, in that order."""
        self._binary_store.add(id_array, vec_array)
        self._ann_index.add(id_array, vec_array)
        # tolist() converts the int64 ids to plain Python ints for the FTS rows.
        fts_docs = [
            (cid, path, text, start, end)
            for cid, path, text, (start, end) in zip(
                id_array.tolist(), paths, texts, line_ranges
            )
        ]
        self._fts.add_documents(fts_docs)

    # ------------------------------------------------------------------
    # Bulk API
    # ------------------------------------------------------------------

    def index_files(
        self,
        files: list[Path],
        *,
        root: Path | None = None,
        max_chunk_chars: int = _DEFAULT_MAX_CHUNK_CHARS,
        chunk_overlap: int = _DEFAULT_CHUNK_OVERLAP,
        max_file_size: int = 50_000,
    ) -> IndexStats:
        """Run the 3-stage pipeline on the given files.

        Chunk IDs are assigned sequentially starting at 0 on every call, and
        the MetadataStore is neither consulted nor updated by this method.

        Args:
            files: List of file paths to index.
            root: Optional root for computing relative paths. If None,
                str(path) is used as the file's identifier.
            max_chunk_chars: Maximum characters per chunk.
            chunk_overlap: Character overlap between consecutive chunks.
            max_file_size: Skip files larger than this (bytes).

        Returns:
            IndexStats with counts and timing.

        Raises:
            ValueError: If *root* is given and a file is not located under it.
            Exception: The first error recorded by a worker thread, re-raised
                after the stores have been saved.
        """
        if not files:
            return IndexStats()

        t0 = time.monotonic()

        # Small bounded queues give back-pressure between the stages.
        embed_queue: queue.Queue = queue.Queue(maxsize=4)
        index_queue: queue.Queue = queue.Queue(maxsize=4)

        # Track errors from workers
        worker_errors: list[Exception] = []
        error_lock = threading.Lock()

        def _record_error(exc: Exception) -> None:
            with error_lock:
                worker_errors.append(exc)

        # --- Start workers ---
        embed_thread = threading.Thread(
            target=self._embed_worker,
            args=(embed_queue, index_queue, _record_error),
            daemon=True,
            name="indexing-embed",
        )
        index_thread = threading.Thread(
            target=self._index_worker,
            args=(index_queue, _record_error),
            daemon=True,
            name="indexing-index",
        )
        embed_thread.start()
        index_thread.start()

        # --- Stage 1: chunk files (main thread) ---
        chunk_id = 0
        files_processed = 0
        chunks_created = 0

        try:
            for fpath in files:
                # Unreadable or oversized files are skipped on purpose (best effort).
                try:
                    if fpath.stat().st_size > max_file_size:
                        continue
                    text = fpath.read_text(encoding="utf-8", errors="replace")
                except Exception as exc:
                    logger.debug("Skipping %s: %s", fpath, exc)
                    continue

                rel_path = self._rel_path(fpath, root)
                file_chunks = self._chunk_text(
                    text, rel_path, max_chunk_chars, chunk_overlap
                )
                if not file_chunks:
                    continue

                files_processed += 1

                # Assign sequential IDs and push one batch per file.
                n_chunks = len(file_chunks)
                batch_ids = list(range(chunk_id, chunk_id + n_chunks))
                batch_texts = [chunk[0] for chunk in file_chunks]
                batch_paths = [chunk[1] for chunk in file_chunks]
                batch_lines: list[tuple[int, int]] = [
                    (chunk[2], chunk[3]) for chunk in file_chunks
                ]
                chunk_id += n_chunks
                chunks_created += n_chunks

                embed_queue.put((batch_ids, batch_texts, batch_paths, batch_lines))
        finally:
            # Always shut the workers down, even when stage 1 raised; otherwise
            # both threads would stay blocked on Queue.get() forever.
            embed_queue.put(_SENTINEL)
            embed_thread.join()
            index_thread.join()

        # --- Final flush ---
        self._binary_store.save()
        self._ann_index.save()

        duration = time.monotonic() - t0
        stats = IndexStats(
            files_processed=files_processed,
            chunks_created=chunks_created,
            duration_seconds=round(duration, 2),
        )

        logger.info(
            "Indexing complete: %d files, %d chunks in %.1fs",
            stats.files_processed,
            stats.chunks_created,
            stats.duration_seconds,
        )

        # Raise first worker error if any occurred
        if worker_errors:
            raise worker_errors[0]

        return stats

    # ------------------------------------------------------------------
    # Workers
    # ------------------------------------------------------------------

    def _embed_worker(
        self,
        in_q: queue.Queue,
        out_q: queue.Queue,
        on_error: Callable[[Exception], None],
    ) -> None:
        """Stage 2: Pull chunk batches, embed, push (ids, vecs, docs) to index queue.

        A failing batch is reported through *on_error* and dropped; the worker
        keeps consuming so the producer never blocks on a full queue.
        """
        try:
            while True:
                item = in_q.get()
                if item is _SENTINEL:
                    break

                batch_ids, batch_texts, batch_paths, batch_lines = item
                try:
                    vecs = self._embedder.embed_batch(batch_texts)
                    vec_array = np.array(vecs, dtype=np.float32)
                    id_array = np.array(batch_ids, dtype=np.int64)
                    out_q.put(
                        (id_array, vec_array, batch_texts, batch_paths, batch_lines)
                    )
                except Exception as exc:
                    logger.exception("Embed worker error: %s", exc)
                    on_error(exc)
        finally:
            # Signal index worker: no more data
            out_q.put(_SENTINEL)

    def _index_worker(
        self,
        in_q: queue.Queue,
        on_error: Callable[[Exception], None],
    ) -> None:
        """Stage 3: Pull (ids, vecs, texts, paths, lines), write to stores.

        A failing batch is reported through *on_error*; the worker keeps
        draining the queue until the sentinel arrives.
        """
        while True:
            item = in_q.get()
            if item is _SENTINEL:
                break

            try:
                self._write_batch(*item)
            except Exception as exc:
                logger.exception("Index worker error: %s", exc)
                on_error(exc)

    # ------------------------------------------------------------------
    # Chunking
    # ------------------------------------------------------------------

    @staticmethod
    def _chunk_text(
        text: str,
        path: str,
        max_chars: int,
        overlap: int,
    ) -> list[tuple[str, str, int, int]]:
        """Split file text into overlapping chunks.

        Text is split on line boundaries only: a single line longer than
        *max_chars* is emitted whole, and a chunk that starts with an overlap
        tail may exceed *max_chars*. A negative *overlap* is treated as 0.

        Returns list of (chunk_text, path, start_line, end_line) tuples.
        Line numbers are 1-based.
        """
        if not text.strip():
            return []

        # chunk[-overlap:] with a negative overlap would keep almost the whole
        # chunk as "tail", so clamp it.
        overlap = max(0, overlap)

        chunks: list[tuple[str, str, int, int]] = []
        current: list[str] = []
        current_len = 0
        chunk_start_line = 1  # 1-based
        line_no = 0

        for line_no, line in enumerate(text.splitlines(keepends=True), start=1):
            if current and current_len + len(line) > max_chars:
                chunk = "".join(current)
                end_line = line_no - 1
                chunks.append((chunk, path, chunk_start_line, end_line))

                # overlap: keep last N characters
                tail = chunk[-overlap:] if overlap else ""
                # Each newline in the tail is one line (possibly partial at the
                # front) that the next chunk shares with this one.
                chunk_start_line = max(1, end_line - tail.count("\n") + 1)

                current = [tail] if tail else []
                current_len = len(tail)

            current.append(line)
            current_len += len(line)

        if current:
            chunks.append(("".join(current), path, chunk_start_line, line_no))

        return chunks

    # ------------------------------------------------------------------
    # Incremental API
    # ------------------------------------------------------------------

    @staticmethod
    def _content_hash(text: str) -> str:
        """Compute SHA-256 hex digest of file content."""
        return hashlib.sha256(text.encode("utf-8", errors="replace")).hexdigest()

    def _require_metadata(self) -> MetadataStore:
        """Return metadata store or raise if not configured."""
        if self._metadata is None:
            raise RuntimeError(
                "MetadataStore is required for incremental indexing. "
                "Pass metadata= to IndexingPipeline.__init__."
            )
        return self._metadata

    def _next_chunk_id(self) -> int:
        """Return the next available chunk ID from MetadataStore."""
        meta = self._require_metadata()
        return meta.max_chunk_id() + 1

    def index_file(
        self,
        file_path: Path,
        *,
        root: Path | None = None,
        force: bool = False,
        max_chunk_chars: int = _DEFAULT_MAX_CHUNK_CHARS,
        chunk_overlap: int = _DEFAULT_CHUNK_OVERLAP,
        max_file_size: int = 50_000,
    ) -> IndexStats:
        """Index a single file incrementally.

        Skips files that have not changed (same content_hash) unless
        *force* is True.

        Args:
            file_path: Path to the file to index.
            root: Optional root for computing relative path identifiers.
            force: Re-index even if content hash has not changed.
            max_chunk_chars: Maximum characters per chunk.
            chunk_overlap: Character overlap between consecutive chunks.
            max_file_size: Skip files larger than this (bytes).

        Returns:
            IndexStats with counts and timing.

        Raises:
            RuntimeError: If no MetadataStore was configured.
            ValueError: If *root* is given and *file_path* is not under it.
        """
        meta = self._require_metadata()
        t0 = time.monotonic()

        def _elapsed() -> float:
            return round(time.monotonic() - t0, 2)

        # Read file. stat() is captured once so the mtime registered below
        # cannot fail (or disagree) if the file changes after it was read.
        try:
            file_stat = file_path.stat()
            if file_stat.st_size > max_file_size:
                logger.debug("Skipping %s: exceeds max_file_size", file_path)
                return IndexStats(duration_seconds=_elapsed())
            text = file_path.read_text(encoding="utf-8", errors="replace")
        except Exception as exc:
            logger.debug("Skipping %s: %s", file_path, exc)
            return IndexStats(duration_seconds=_elapsed())

        content_hash = self._content_hash(text)
        rel_path = self._rel_path(file_path, root)

        # Check if update is needed
        if not force and not meta.file_needs_update(rel_path, content_hash):
            logger.debug("Skipping %s: unchanged", rel_path)
            return IndexStats(duration_seconds=_elapsed())

        # Chunk and embed BEFORE touching existing index data, so a failure in
        # the embedder leaves the previously indexed version intact.
        file_chunks = self._chunk_text(text, rel_path, max_chunk_chars, chunk_overlap)
        batch_texts = [chunk[0] for chunk in file_chunks]
        batch_paths = [chunk[1] for chunk in file_chunks]
        batch_lines: list[tuple[int, int]] = [
            (chunk[2], chunk[3]) for chunk in file_chunks
        ]
        vec_array = None
        if file_chunks:
            vecs = self._embedder.embed_batch(batch_texts)
            vec_array = np.array(vecs, dtype=np.float32)

        # If file was previously indexed, remove old data first
        if meta.get_file_hash(rel_path) is not None:
            meta.mark_file_deleted(rel_path)
            self._fts.delete_by_path(rel_path)

        if not file_chunks:
            # Register file with no chunks
            meta.register_file(rel_path, content_hash, file_stat.st_mtime)
            return IndexStats(files_processed=1, duration_seconds=_elapsed())

        # Assign chunk IDs (after the tombstoning above, as before)
        start_id = self._next_chunk_id()
        batch_ids = list(range(start_id, start_id + len(file_chunks)))
        id_array = np.array(batch_ids, dtype=np.int64)

        # Index: write to stores
        self._write_batch(id_array, vec_array, batch_texts, batch_paths, batch_lines)

        # Register in metadata
        meta.register_file(rel_path, content_hash, file_stat.st_mtime)
        chunk_id_hashes = [
            (cid, self._content_hash(chunk_text))
            for cid, chunk_text in zip(batch_ids, batch_texts)
        ]
        meta.register_chunks(rel_path, chunk_id_hashes)

        # Flush stores
        self._binary_store.save()
        self._ann_index.save()

        stats = IndexStats(
            files_processed=1,
            chunks_created=len(batch_ids),
            duration_seconds=_elapsed(),
        )
        logger.info(
            "Indexed file %s: %d chunks in %.2fs",
            rel_path,
            stats.chunks_created,
            stats.duration_seconds,
        )
        return stats

    def remove_file(self, file_path: str) -> None:
        """Mark a file as deleted via tombstone strategy.

        Calls MetadataStore.mark_file_deleted() for the file and removes the
        file's FTS entries. BinaryStore and ANNIndex are not modified here.

        Args:
            file_path: The relative path identifier of the file to remove.

        Raises:
            RuntimeError: If no MetadataStore was configured.
        """
        meta = self._require_metadata()
        count = meta.mark_file_deleted(file_path)
        fts_count = self._fts.delete_by_path(file_path)
        logger.info(
            "Removed file %s: %d chunks tombstoned, %d FTS entries deleted",
            file_path,
            count,
            fts_count,
        )

    def sync(
        self,
        file_paths: list[Path],
        *,
        root: Path | None = None,
        max_chunk_chars: int = _DEFAULT_MAX_CHUNK_CHARS,
        chunk_overlap: int = _DEFAULT_CHUNK_OVERLAP,
        max_file_size: int = 50_000,
    ) -> IndexStats:
        """Reconcile index state against a current file list.

        Files known to the MetadataStore but absent from *file_paths* are
        removed; every listed file is passed to index_file(), which skips
        unchanged content.

        Args:
            file_paths: Current list of files that should be indexed.
            root: Optional root for computing relative path identifiers.
            max_chunk_chars: Maximum characters per chunk.
            chunk_overlap: Character overlap between consecutive chunks.
            max_file_size: Skip files larger than this (bytes).

        Returns:
            Aggregated IndexStats for all operations.

        Raises:
            RuntimeError: If no MetadataStore was configured.
            ValueError: If *root* is given and a file is not located under it.
        """
        meta = self._require_metadata()
        t0 = time.monotonic()

        # Map identifier -> path; a later duplicate identifier wins.
        current_rel_paths: dict[str, Path] = {
            self._rel_path(fpath, root): fpath for fpath in file_paths
        }

        # Get known files from metadata
        known_files = meta.get_all_files()  # {rel_path: content_hash}

        # Detect removed files (sorted only to make the log order stable)
        removed = set(known_files) - set(current_rel_paths)
        for rel in sorted(removed):
            self.remove_file(rel)

        # Index new and changed files
        total_files = 0
        total_chunks = 0
        for fpath in current_rel_paths.values():
            stats = self.index_file(
                fpath,
                root=root,
                max_chunk_chars=max_chunk_chars,
                chunk_overlap=chunk_overlap,
                max_file_size=max_file_size,
            )
            total_files += stats.files_processed
            total_chunks += stats.chunks_created

        duration = time.monotonic() - t0
        result = IndexStats(
            files_processed=total_files,
            chunks_created=total_chunks,
            duration_seconds=round(duration, 2),
        )
        logger.info(
            "Sync complete: %d files indexed, %d chunks created, "
            "%d files removed in %.1fs",
            result.files_processed,
            result.chunks_created,
            len(removed),
            result.duration_seconds,
        )
        return result

    def compact(self) -> None:
        """Rebuild the BinaryStore without tombstoned chunk IDs.

        Obtains the deleted IDs from MetadataStore.compact_deleted()
        (presumably this also clears the tombstone table -- confirm against
        MetadataStore), filters them out of the BinaryStore and saves it.

        The ANNIndex is NOT rebuilt here: the original float32 vectors are not
        retained (BinaryStore only has quantized data), so stale ANN entries
        remain until the caller re-runs index_files() on all files.

        Raises:
            RuntimeError: If no MetadataStore was configured.
        """
        meta = self._require_metadata()
        deleted_ids = meta.compact_deleted()
        if not deleted_ids:
            logger.debug("Compact: no deleted IDs, nothing to do")
            return

        logger.info(
            "Compact: rebuilding indexes, excluding %d deleted IDs", len(deleted_ids)
        )

        # Rebuild BinaryStore: read current data, filter, replace.
        # NOTE(review): this reaches into BinaryStore's private attributes;
        # a public rebuild/filter method on BinaryStore would be safer.
        store = self._binary_store
        if store._count > 0:
            active_ids = store._ids[: store._count]
            active_matrix = store._matrix[: store._count]

            mask = ~np.isin(active_ids, list(deleted_ids))
            # Boolean-mask indexing copies, so the kept data survives the reset.
            kept_ids = active_ids[mask]
            kept_matrix = active_matrix[mask]

            # Reset store
            store._count = 0
            store._matrix = None
            store._ids = None

            n_kept = len(kept_ids)
            if n_kept > 0:
                store._ensure_capacity(n_kept)
                store._matrix[:n_kept] = kept_matrix
                store._ids[:n_kept] = kept_ids
                store._count = n_kept

            store.save()

        logger.info(
            "Compact: BinaryStore rebuilt (%d entries kept). "
            "Note: ANNIndex retains stale entries; run full re-index for clean ANN state.",
            self._binary_store._count,
        )