fix: 修复向量索引进度显示过早完成的问题

问题：FTS 索引完成后立即显示 100%，但嵌入生成仍在后台运行。

修复：
- codex-lens.ts: 将 "Indexed X files" 阶段从 complete 改为 fts_complete (60%)
- codex-lens.ts: 添加嵌入批次和 Finalizing index 阶段解析
- embedding_manager.py: 使用 bulk_insert() 模式延迟 ANN 索引构建
- embedding_manager.py: 添加 "Finalizing index" 进度回调

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-14 02:42:04 +08:00 · 2025-12-21 20:55:45 +08:00
parent 5849f751bc
commit 2871950ab8
2 changed files with 121 additions and 89 deletions
--- a/ccw/src/tools/codex-lens.ts
+++ b/ccw/src/tools/codex-lens.ts
@@ -414,17 +414,42 @@ function parseProgressLine(line: string): ProgressInfo | null {
    return { stage: 'complete', message: 'Finalizing...', percent: 95 };
  }

-  // Parse indexed count: "Indexed X files"
+  // Parse indexed count: "Indexed X files" - FTS complete, but embeddings may follow
  const indexedMatch = line.match(/Indexed (\d+) files/i);
  if (indexedMatch) {
    return {
-      stage: 'complete',
-      message: `Indexed ${indexedMatch[1]} files`,
-      percent: 100,
+      stage: 'fts_complete',  // Not 'complete' - embeddings generation may still be pending
+      message: `Indexed ${indexedMatch[1]} files, generating embeddings...`,
+      percent: 60,  // FTS done, embeddings starting
      filesProcessed: parseInt(indexedMatch[1], 10),
    };
  }

+  // Parse embedding batch progress: "Batch X: N files, M chunks"
+  const batchMatch = line.match(/Batch (\d+):\s*(\d+) files,\s*(\d+) chunks/i);
+  if (batchMatch) {
+    return {
+      stage: 'embeddings',
+      message: `Embedding batch ${batchMatch[1]}: ${batchMatch[3]} chunks`,
+      percent: 70,  // Stay at 70% during embedding batches
+    };
+  }
+
+  // Parse embedding progress with file count
+  const embedProgressMatch = line.match(/Processing (\d+) files/i);
+  if (embedProgressMatch && line.toLowerCase().includes('embed')) {
+    return {
+      stage: 'embeddings',
+      message: `Processing ${embedProgressMatch[1]} files for embeddings`,
+      percent: 75,
+    };
+  }
+
+  // Parse finalizing ANN index
+  if (line.includes('Finalizing index') || line.includes('Building ANN')) {
+    return { stage: 'finalizing', message: 'Finalizing vector index...', percent: 90 };
+  }
+
  return null;
 }

--- a/codex-lens/src/codexlens/cli/embedding_manager.py
+++ b/codex-lens/src/codexlens/cli/embedding_manager.py
@@ -222,6 +222,9 @@ def generate_embeddings(

    try:
        with VectorStore(index_path) as vector_store:
+            # Use bulk insert mode for efficient batch ANN index building
+            # This defers ANN updates until end_bulk_insert() is called
+            with vector_store.bulk_insert():
                with sqlite3.connect(index_path) as conn:
                    conn.row_factory = sqlite3.Row
                    path_column = _get_path_column(conn)
@@ -304,7 +307,7 @@ def generate_embeddings(
                        for (chunk, _), embedding in zip(batch_chunks_with_paths, batch_embeddings):
                            chunk.embedding = embedding

-                    # Step 4: Store this batch to database immediately (releases memory)
+                        # Step 4: Store this batch to database (ANN update deferred in bulk_insert mode)
                        try:
                            vector_store.add_chunks_batch(batch_chunks_with_paths)
                            total_chunks_created += batch_chunk_count
@@ -317,6 +320,10 @@ def generate_embeddings(
                        del batch_chunks_with_paths, batch_embeddings
                        gc.collect()

+                # Notify before ANN index finalization (happens when bulk_insert context exits)
+                if progress_callback:
+                    progress_callback(f"Finalizing index... Building ANN index for {total_chunks_created} chunks")
+
    except Exception as e:
        return {"success": False, "error": f"Failed to read or process files: {str(e)}"}