refactor: remove graph indexing, fix memory leaks, optimize embedding generation

Main changes:

1. Remove the graph indexing feature
   - Delete graph_analyzer.py and related migration files
   - Remove the CLI graph command and the --enrich flag
   - Clean up the graph query methods in chain_search.py (370 lines)
   - Delete related test files

2. Fix memory issues in embedding generation (see the sketch after this item)
   - Refactor generate_embeddings.py to use streaming batch processing
   - Switch to embedding_manager's memory-safe implementation
   - Trim the file from 548 lines to 259 (a 52.7% reduction)
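
   A minimal sketch of the streaming pattern, assuming a plain files table and
   an illustrative batch size (the real constants and the chunk/embed/store
   steps are in the generate_embeddings diff below):

   # Stream rows with fetchmany() instead of fetchall(), so only one batch
   # of files is resident in memory at a time.
   import sqlite3

   FILE_BATCH_SIZE = 64  # illustrative; the real value lives in generate_embeddings.py

   def iter_file_batches(index_path: str):
       with sqlite3.connect(index_path) as conn:
           conn.row_factory = sqlite3.Row
           cursor = conn.execute("SELECT path, content, language FROM files")
           while True:
               batch = cursor.fetchmany(FILE_BATCH_SIZE)
               if not batch:
                   break
               yield batch  # caller chunks, embeds, stores, then drops the batch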

3. Fix memory leaks (see the sketch after this item)
   - chain_search.py: quick_search now manages ChainSearchEngine with a with statement
   - embedding_manager.py: manage VectorStore with a with statement
   - vector_store.py: add a memory warning for brute-force search
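
   Both with-statement fixes follow the standard context-manager pattern; a
   minimal sketch, assuming VectorStore implements __enter__/__exit__ (the
   import path is illustrative; add_chunks_batch is the method used in the
   diff below):

   # Acquire the store in a with statement so it is closed even when an
   # exception escapes mid-operation, instead of leaking the open handle.
   from codexlens.storage.vector_store import VectorStore  # assumed module path

   def store_batch(index_path, chunks_with_paths):
       with VectorStore(index_path) as vector_store:  # closed on success or error
           vector_store.add_chunks_batch(chunks_with_paths)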

4. Code cleanup
   - Remove the token_count and symbol_type fields from the Symbol model
   - Clean up related test cases

Tests: 760 passed, 7 skipped

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Author: catlog22
Date:   2025-12-21 16:22:03 +08:00
Parent: 15d5890861
Commit: 3e9a309079

19 changed files with 165 additions and 3909 deletions


@@ -268,7 +268,6 @@ def search(
     files_only: bool = typer.Option(False, "--files-only", "-f", help="Return only file paths without content snippets."),
     mode: str = typer.Option("auto", "--mode", "-m", help="Search mode: auto, exact, fuzzy, hybrid, vector, pure-vector."),
     weights: Optional[str] = typer.Option(None, "--weights", help="Custom RRF weights as 'exact,fuzzy,vector' (e.g., '0.5,0.3,0.2')."),
-    enrich: bool = typer.Option(False, "--enrich", help="Enrich results with code graph relationships (calls, imports)."),
     json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
     verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."),
 ) -> None:
@@ -423,30 +422,10 @@ def search(
                     for r in result.results
                 ]
 
-                # Enrich results with relationship data if requested
-                enriched = False
-                if enrich:
-                    try:
-                        from codexlens.search.enrichment import RelationshipEnricher
-
-                        # Find index path for the search path
-                        project_record = registry.find_by_source_path(str(search_path))
-                        if project_record:
-                            index_path = Path(project_record["index_root"]) / "_index.db"
-                            if index_path.exists():
-                                with RelationshipEnricher(index_path) as enricher:
-                                    results_list = enricher.enrich(results_list, limit=limit)
-                                    enriched = True
-                    except Exception as e:
-                        # Enrichment failure should not break search
-                        if verbose:
-                            console.print(f"[yellow]Warning: Enrichment failed: {e}[/yellow]")
-
                 payload = {
                     "query": query,
                     "mode": actual_mode,
                     "count": len(results_list),
-                    "enriched": enriched,
                     "results": results_list,
                     "stats": {
                         "dirs_searched": result.stats.dirs_searched,
@@ -458,8 +437,7 @@ def search(
             print_json(success=True, result=payload)
         else:
             render_search_results(result.results, verbose=verbose)
-            enrich_status = " | [green]Enriched[/green]" if enriched else ""
-            console.print(f"[dim]Mode: {actual_mode} | Searched {result.stats.dirs_searched} directories in {result.stats.time_ms:.1f}ms{enrich_status}[/dim]")
+            console.print(f"[dim]Mode: {actual_mode} | Searched {result.stats.dirs_searched} directories in {result.stats.time_ms:.1f}ms[/dim]")
     except SearchError as exc:
         if json_mode:
@@ -1376,103 +1354,6 @@ def clean(
         raise typer.Exit(code=1)
 
 
-@app.command()
-def graph(
-    query_type: str = typer.Argument(..., help="Query type: callers, callees, or inheritance"),
-    symbol: str = typer.Argument(..., help="Symbol name to query"),
-    path: Path = typer.Option(Path("."), "--path", "-p", help="Directory to search from."),
-    limit: int = typer.Option(50, "--limit", "-n", min=1, max=500, help="Max results."),
-    depth: int = typer.Option(-1, "--depth", "-d", help="Search depth (-1 = unlimited)."),
-    json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
-    verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."),
-) -> None:
-    """Query semantic graph for code relationships.
-
-    Supported query types:
-    - callers: Find all functions/methods that call the given symbol
-    - callees: Find all functions/methods called by the given symbol
-    - inheritance: Find inheritance relationships for the given class
-
-    Examples:
-        codex-lens graph callers my_function
-        codex-lens graph callees MyClass.method --path src/
-        codex-lens graph inheritance BaseClass
-    """
-    _configure_logging(verbose)
-    search_path = path.expanduser().resolve()
-
-    # Validate query type
-    valid_types = ["callers", "callees", "inheritance"]
-    if query_type not in valid_types:
-        if json_mode:
-            print_json(success=False, error=f"Invalid query type: {query_type}. Must be one of: {', '.join(valid_types)}")
-        else:
-            console.print(f"[red]Invalid query type:[/red] {query_type}")
-            console.print(f"[dim]Valid types: {', '.join(valid_types)}[/dim]")
-        raise typer.Exit(code=1)
-
-    registry: RegistryStore | None = None
-    try:
-        registry = RegistryStore()
-        registry.initialize()
-        mapper = PathMapper()
-        engine = ChainSearchEngine(registry, mapper)
-        options = SearchOptions(depth=depth, total_limit=limit)
-
-        # Execute graph query based on type
-        if query_type == "callers":
-            results = engine.search_callers(symbol, search_path, options=options)
-            result_type = "callers"
-        elif query_type == "callees":
-            results = engine.search_callees(symbol, search_path, options=options)
-            result_type = "callees"
-        else:  # inheritance
-            results = engine.search_inheritance(symbol, search_path, options=options)
-            result_type = "inheritance"
-
-        payload = {
-            "query_type": query_type,
-            "symbol": symbol,
-            "count": len(results),
-            "relationships": results
-        }
-
-        if json_mode:
-            print_json(success=True, result=payload)
-        else:
-            from .output import render_graph_results
-            render_graph_results(results, query_type=query_type, symbol=symbol)
-
-    except SearchError as exc:
-        if json_mode:
-            print_json(success=False, error=f"Graph search error: {exc}")
-        else:
-            console.print(f"[red]Graph query failed (search):[/red] {exc}")
-        raise typer.Exit(code=1)
-    except StorageError as exc:
-        if json_mode:
-            print_json(success=False, error=f"Storage error: {exc}")
-        else:
-            console.print(f"[red]Graph query failed (storage):[/red] {exc}")
-        raise typer.Exit(code=1)
-    except CodexLensError as exc:
-        if json_mode:
-            print_json(success=False, error=str(exc))
-        else:
-            console.print(f"[red]Graph query failed:[/red] {exc}")
-        raise typer.Exit(code=1)
-    except Exception as exc:
-        if json_mode:
-            print_json(success=False, error=f"Unexpected error: {exc}")
-        else:
-            console.print(f"[red]Graph query failed (unexpected):[/red] {exc}")
-        raise typer.Exit(code=1)
-    finally:
-        if registry is not None:
-            registry.close()
-
-
 @app.command("semantic-list")
 def semantic_list(
     path: Path = typer.Option(Path("."), "--path", "-p", help="Project path to list metadata from."),


@@ -194,7 +194,6 @@ def generate_embeddings(
     try:
         # Use cached embedder (singleton) for performance
        embedder = get_embedder(profile=model_profile)
-        vector_store = VectorStore(index_path)
         chunker = Chunker(config=ChunkConfig(max_chunk_size=chunk_size))
 
         if progress_callback:
@@ -217,85 +216,86 @@ def generate_embeddings(
     EMBEDDING_BATCH_SIZE = 8  # jina-embeddings-v2-base-code needs small batches
 
     try:
-        with sqlite3.connect(index_path) as conn:
-            conn.row_factory = sqlite3.Row
-            path_column = _get_path_column(conn)
+        with VectorStore(index_path) as vector_store:
+            with sqlite3.connect(index_path) as conn:
+                conn.row_factory = sqlite3.Row
+                path_column = _get_path_column(conn)
 
-            # Get total file count for progress reporting
-            total_files = conn.execute("SELECT COUNT(*) FROM files").fetchone()[0]
-            if total_files == 0:
-                return {"success": False, "error": "No files found in index"}
+                # Get total file count for progress reporting
+                total_files = conn.execute("SELECT COUNT(*) FROM files").fetchone()[0]
+                if total_files == 0:
+                    return {"success": False, "error": "No files found in index"}
 
-            if progress_callback:
-                progress_callback(f"Processing {total_files} files in batches of {FILE_BATCH_SIZE}...")
+                if progress_callback:
+                    progress_callback(f"Processing {total_files} files in batches of {FILE_BATCH_SIZE}...")
 
-            cursor = conn.execute(f"SELECT {path_column}, content, language FROM files")
-            batch_number = 0
+                cursor = conn.execute(f"SELECT {path_column}, content, language FROM files")
+                batch_number = 0
 
-            while True:
-                # Fetch a batch of files (streaming, not fetchall)
-                file_batch = cursor.fetchmany(FILE_BATCH_SIZE)
-                if not file_batch:
-                    break
+                while True:
+                    # Fetch a batch of files (streaming, not fetchall)
+                    file_batch = cursor.fetchmany(FILE_BATCH_SIZE)
+                    if not file_batch:
+                        break
 
-                batch_number += 1
-                batch_chunks_with_paths = []
-                files_in_batch_with_chunks = set()
+                    batch_number += 1
+                    batch_chunks_with_paths = []
+                    files_in_batch_with_chunks = set()
 
-                # Step 1: Chunking for the current file batch
-                for file_row in file_batch:
-                    file_path = file_row[path_column]
-                    content = file_row["content"]
-                    language = file_row["language"] or "python"
-                    try:
-                        chunks = chunker.chunk_sliding_window(
-                            content,
-                            file_path=file_path,
-                            language=language
-                        )
-                        if chunks:
-                            for chunk in chunks:
-                                batch_chunks_with_paths.append((chunk, file_path))
-                            files_in_batch_with_chunks.add(file_path)
-                    except Exception as e:
-                        logger.error(f"Failed to chunk {file_path}: {e}")
-                        failed_files.append((file_path, str(e)))
+                    # Step 1: Chunking for the current file batch
+                    for file_row in file_batch:
+                        file_path = file_row[path_column]
+                        content = file_row["content"]
+                        language = file_row["language"] or "python"
+                        try:
+                            chunks = chunker.chunk_sliding_window(
+                                content,
+                                file_path=file_path,
+                                language=language
+                            )
+                            if chunks:
+                                for chunk in chunks:
+                                    batch_chunks_with_paths.append((chunk, file_path))
+                                files_in_batch_with_chunks.add(file_path)
+                        except Exception as e:
+                            logger.error(f"Failed to chunk {file_path}: {e}")
+                            failed_files.append((file_path, str(e)))
 
-                if not batch_chunks_with_paths:
-                    continue
+                    if not batch_chunks_with_paths:
+                        continue
 
-                batch_chunk_count = len(batch_chunks_with_paths)
-                if progress_callback:
-                    progress_callback(f" Batch {batch_number}: {len(file_batch)} files, {batch_chunk_count} chunks")
+                    batch_chunk_count = len(batch_chunks_with_paths)
+                    if progress_callback:
+                        progress_callback(f" Batch {batch_number}: {len(file_batch)} files, {batch_chunk_count} chunks")
 
-                # Step 2: Generate embeddings for this batch
-                batch_embeddings = []
-                try:
-                    for i in range(0, batch_chunk_count, EMBEDDING_BATCH_SIZE):
-                        batch_end = min(i + EMBEDDING_BATCH_SIZE, batch_chunk_count)
-                        batch_contents = [chunk.content for chunk, _ in batch_chunks_with_paths[i:batch_end]]
-                        embeddings = embedder.embed(batch_contents)
-                        batch_embeddings.extend(embeddings)
-                except Exception as e:
-                    logger.error(f"Failed to generate embeddings for batch {batch_number}: {str(e)}")
-                    failed_files.extend([(file_row[path_column], str(e)) for file_row in file_batch])
-                    continue
+                    # Step 2: Generate embeddings for this batch
+                    batch_embeddings = []
+                    try:
+                        for i in range(0, batch_chunk_count, EMBEDDING_BATCH_SIZE):
+                            batch_end = min(i + EMBEDDING_BATCH_SIZE, batch_chunk_count)
+                            batch_contents = [chunk.content for chunk, _ in batch_chunks_with_paths[i:batch_end]]
+                            embeddings = embedder.embed(batch_contents)
+                            batch_embeddings.extend(embeddings)
+                    except Exception as e:
+                        logger.error(f"Failed to generate embeddings for batch {batch_number}: {str(e)}")
+                        failed_files.extend([(file_row[path_column], str(e)) for file_row in file_batch])
+                        continue
 
-                # Step 3: Assign embeddings to chunks
-                for (chunk, _), embedding in zip(batch_chunks_with_paths, batch_embeddings):
-                    chunk.embedding = embedding
+                    # Step 3: Assign embeddings to chunks
+                    for (chunk, _), embedding in zip(batch_chunks_with_paths, batch_embeddings):
+                        chunk.embedding = embedding
 
-                # Step 4: Store this batch to database immediately (releases memory)
-                try:
-                    vector_store.add_chunks_batch(batch_chunks_with_paths)
-                    total_chunks_created += batch_chunk_count
-                    total_files_processed += len(files_in_batch_with_chunks)
-                except Exception as e:
-                    logger.error(f"Failed to store batch {batch_number}: {str(e)}")
-                    failed_files.extend([(file_row[path_column], str(e)) for file_row in file_batch])
+                    # Step 4: Store this batch to database immediately (releases memory)
+                    try:
+                        vector_store.add_chunks_batch(batch_chunks_with_paths)
+                        total_chunks_created += batch_chunk_count
+                        total_files_processed += len(files_in_batch_with_chunks)
+                    except Exception as e:
+                        logger.error(f"Failed to store batch {batch_number}: {str(e)}")
+                        failed_files.extend([(file_row[path_column], str(e)) for file_row in file_batch])
 
-                # Memory is released here as batch_chunks_with_paths and batch_embeddings go out of scope
+                    # Memory is released here as batch_chunks_with_paths and batch_embeddings go out of scope
 
     except Exception as e:
         return {"success": False, "error": f"Failed to read or process files: {str(e)}"}


@@ -122,68 +122,3 @@ def render_file_inspect(path: str, language: str, symbols: Iterable[Symbol]) ->
     console.print(header)
     render_symbols(list(symbols), title="Discovered Symbols")
-
-
-def render_graph_results(results: list[dict[str, Any]], *, query_type: str, symbol: str) -> None:
-    """Render semantic graph query results.
-
-    Args:
-        results: List of relationship dicts
-        query_type: Type of query (callers, callees, inheritance)
-        symbol: Symbol name that was queried
-    """
-    if not results:
-        console.print(f"[yellow]No {query_type} found for symbol:[/yellow] {symbol}")
-        return
-
-    title_map = {
-        "callers": f"Callers of '{symbol}' ({len(results)} found)",
-        "callees": f"Callees of '{symbol}' ({len(results)} found)",
-        "inheritance": f"Inheritance relationships for '{symbol}' ({len(results)} found)"
-    }
-    table = Table(title=title_map.get(query_type, f"Graph Results ({len(results)})"))
-
-    if query_type == "callers":
-        table.add_column("Caller", style="green")
-        table.add_column("File", style="cyan", no_wrap=False, max_width=40)
-        table.add_column("Line", justify="right", style="yellow")
-        table.add_column("Type", style="dim")
-        for rel in results:
-            table.add_row(
-                rel.get("source_symbol", "-"),
-                rel.get("source_file", "-"),
-                str(rel.get("source_line", "-")),
-                rel.get("relationship_type", "-")
-            )
-    elif query_type == "callees":
-        table.add_column("Target", style="green")
-        table.add_column("File", style="cyan", no_wrap=False, max_width=40)
-        table.add_column("Line", justify="right", style="yellow")
-        table.add_column("Type", style="dim")
-        for rel in results:
-            table.add_row(
-                rel.get("target_symbol", "-"),
-                rel.get("target_file", "-") if rel.get("target_file") else rel.get("source_file", "-"),
-                str(rel.get("source_line", "-")),
-                rel.get("relationship_type", "-")
-            )
-    else:  # inheritance
-        table.add_column("Derived Class", style="green")
-        table.add_column("Base Class", style="magenta")
-        table.add_column("File", style="cyan", no_wrap=False, max_width=40)
-        table.add_column("Line", justify="right", style="yellow")
-        for rel in results:
-            table.add_row(
-                rel.get("source_symbol", "-"),
-                rel.get("target_symbol", "-"),
-                rel.get("source_file", "-"),
-                str(rel.get("source_line", "-"))
-            )
-
-    console.print(table)