refactor: remove graph indexing, fix memory leaks, optimize embedding generation

Main changes:

1. Remove the graph indexing feature
   - Delete graph_analyzer.py and related migration files
   - Remove the CLI graph command and the --enrich flag
   - Clean up the graph query methods in chain_search.py (370 lines)
   - Delete related test files

2. Fix memory issues in embedding generation (see the sketch after this item)
   - Refactor generate_embeddings.py to use streaming batch processing
   - Switch to embedding_manager's memory-safe implementation
   - Trim the file from 548 lines to 259 (a 52.7% reduction)
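
   A minimal sketch of the streaming pattern, assuming a plain files table and
   an illustrative batch size (the real constants and the chunk/embed/store
   steps are in the generate_embeddings diff below):

   # Stream rows with fetchmany() instead of fetchall(), so only one batch
   # of files is resident in memory at a time.
   import sqlite3

   FILE_BATCH_SIZE = 64  # illustrative; the real value lives in generate_embeddings.py

   def iter_file_batches(index_path: str):
       with sqlite3.connect(index_path) as conn:
           conn.row_factory = sqlite3.Row
           cursor = conn.execute("SELECT path, content, language FROM files")
           while True:
               batch = cursor.fetchmany(FILE_BATCH_SIZE)
               if not batch:
                   break
               yield batch  # caller chunks, embeds, stores, then drops the batch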

3. Fix memory leaks (see the sketch after this item)
   - chain_search.py: quick_search now manages ChainSearchEngine with a with statement
   - embedding_manager.py: manage VectorStore with a with statement
   - vector_store.py: add a memory warning for brute-force search
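
   Both with-statement fixes follow the standard context-manager pattern; a
   minimal sketch, assuming VectorStore implements __enter__/__exit__ (the
   import path is illustrative; add_chunks_batch is the method used in the
   diff below):

   # Acquire the store in a with statement so it is closed even when an
   # exception escapes mid-operation, instead of leaking the open handle.
   from codexlens.storage.vector_store import VectorStore  # assumed module path

   def store_batch(index_path, chunks_with_paths):
       with VectorStore(index_path) as vector_store:  # closed on success or error
           vector_store.add_chunks_batch(chunks_with_paths)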

4. Code cleanup
   - Remove the token_count and symbol_type fields from the Symbol model
   - Clean up related test cases

Tests: 760 passed, 7 skipped

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Author: catlog22
Date:   2025-12-21 16:22:03 +08:00
Parent: 15d5890861
Commit: 3e9a309079

19 changed files with 165 additions and 3909 deletions


@@ -268,7 +268,6 @@ def search(
     files_only: bool = typer.Option(False, "--files-only", "-f", help="Return only file paths without content snippets."),
     mode: str = typer.Option("auto", "--mode", "-m", help="Search mode: auto, exact, fuzzy, hybrid, vector, pure-vector."),
     weights: Optional[str] = typer.Option(None, "--weights", help="Custom RRF weights as 'exact,fuzzy,vector' (e.g., '0.5,0.3,0.2')."),
-    enrich: bool = typer.Option(False, "--enrich", help="Enrich results with code graph relationships (calls, imports)."),
     json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
     verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."),
 ) -> None:
@@ -423,30 +422,10 @@ def search(
                     for r in result.results
                 ]
 
-                # Enrich results with relationship data if requested
-                enriched = False
-                if enrich:
-                    try:
-                        from codexlens.search.enrichment import RelationshipEnricher
-
-                        # Find index path for the search path
-                        project_record = registry.find_by_source_path(str(search_path))
-                        if project_record:
-                            index_path = Path(project_record["index_root"]) / "_index.db"
-                            if index_path.exists():
-                                with RelationshipEnricher(index_path) as enricher:
-                                    results_list = enricher.enrich(results_list, limit=limit)
-                                    enriched = True
-                    except Exception as e:
-                        # Enrichment failure should not break search
-                        if verbose:
-                            console.print(f"[yellow]Warning: Enrichment failed: {e}[/yellow]")
-
                 payload = {
                     "query": query,
                     "mode": actual_mode,
                     "count": len(results_list),
-                    "enriched": enriched,
                     "results": results_list,
                     "stats": {
                         "dirs_searched": result.stats.dirs_searched,
@@ -458,8 +437,7 @@ def search(
             print_json(success=True, result=payload)
         else:
             render_search_results(result.results, verbose=verbose)
-            enrich_status = " | [green]Enriched[/green]" if enriched else ""
-            console.print(f"[dim]Mode: {actual_mode} | Searched {result.stats.dirs_searched} directories in {result.stats.time_ms:.1f}ms{enrich_status}[/dim]")
+            console.print(f"[dim]Mode: {actual_mode} | Searched {result.stats.dirs_searched} directories in {result.stats.time_ms:.1f}ms[/dim]")
     except SearchError as exc:
         if json_mode:
@@ -1376,103 +1354,6 @@ def clean(
         raise typer.Exit(code=1)
 
 
-@app.command()
-def graph(
-    query_type: str = typer.Argument(..., help="Query type: callers, callees, or inheritance"),
-    symbol: str = typer.Argument(..., help="Symbol name to query"),
-    path: Path = typer.Option(Path("."), "--path", "-p", help="Directory to search from."),
-    limit: int = typer.Option(50, "--limit", "-n", min=1, max=500, help="Max results."),
-    depth: int = typer.Option(-1, "--depth", "-d", help="Search depth (-1 = unlimited)."),
-    json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
-    verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."),
-) -> None:
-    """Query semantic graph for code relationships.
-
-    Supported query types:
-    - callers: Find all functions/methods that call the given symbol
-    - callees: Find all functions/methods called by the given symbol
-    - inheritance: Find inheritance relationships for the given class
-
-    Examples:
-        codex-lens graph callers my_function
-        codex-lens graph callees MyClass.method --path src/
-        codex-lens graph inheritance BaseClass
-    """
-    _configure_logging(verbose)
-    search_path = path.expanduser().resolve()
-
-    # Validate query type
-    valid_types = ["callers", "callees", "inheritance"]
-    if query_type not in valid_types:
-        if json_mode:
-            print_json(success=False, error=f"Invalid query type: {query_type}. Must be one of: {', '.join(valid_types)}")
-        else:
-            console.print(f"[red]Invalid query type:[/red] {query_type}")
-            console.print(f"[dim]Valid types: {', '.join(valid_types)}[/dim]")
-        raise typer.Exit(code=1)
-
-    registry: RegistryStore | None = None
-    try:
-        registry = RegistryStore()
-        registry.initialize()
-        mapper = PathMapper()
-        engine = ChainSearchEngine(registry, mapper)
-        options = SearchOptions(depth=depth, total_limit=limit)
-
-        # Execute graph query based on type
-        if query_type == "callers":
-            results = engine.search_callers(symbol, search_path, options=options)
-            result_type = "callers"
-        elif query_type == "callees":
-            results = engine.search_callees(symbol, search_path, options=options)
-            result_type = "callees"
-        else:  # inheritance
-            results = engine.search_inheritance(symbol, search_path, options=options)
-            result_type = "inheritance"
-
-        payload = {
-            "query_type": query_type,
-            "symbol": symbol,
-            "count": len(results),
-            "relationships": results
-        }
-
-        if json_mode:
-            print_json(success=True, result=payload)
-        else:
-            from .output import render_graph_results
-            render_graph_results(results, query_type=query_type, symbol=symbol)
-
-    except SearchError as exc:
-        if json_mode:
-            print_json(success=False, error=f"Graph search error: {exc}")
-        else:
-            console.print(f"[red]Graph query failed (search):[/red] {exc}")
-        raise typer.Exit(code=1)
-    except StorageError as exc:
-        if json_mode:
-            print_json(success=False, error=f"Storage error: {exc}")
-        else:
-            console.print(f"[red]Graph query failed (storage):[/red] {exc}")
-        raise typer.Exit(code=1)
-    except CodexLensError as exc:
-        if json_mode:
-            print_json(success=False, error=str(exc))
-        else:
-            console.print(f"[red]Graph query failed:[/red] {exc}")
-        raise typer.Exit(code=1)
-    except Exception as exc:
-        if json_mode:
-            print_json(success=False, error=f"Unexpected error: {exc}")
-        else:
-            console.print(f"[red]Graph query failed (unexpected):[/red] {exc}")
-        raise typer.Exit(code=1)
-    finally:
-        if registry is not None:
-            registry.close()
-
-
 @app.command("semantic-list")
 def semantic_list(
     path: Path = typer.Option(Path("."), "--path", "-p", help="Project path to list metadata from."),


@@ -194,7 +194,6 @@ def generate_embeddings(
     try:
         # Use cached embedder (singleton) for performance
        embedder = get_embedder(profile=model_profile)
-        vector_store = VectorStore(index_path)
         chunker = Chunker(config=ChunkConfig(max_chunk_size=chunk_size))
 
         if progress_callback:
@@ -217,85 +216,86 @@ def generate_embeddings(
     EMBEDDING_BATCH_SIZE = 8  # jina-embeddings-v2-base-code needs small batches
 
     try:
-        with sqlite3.connect(index_path) as conn:
-            conn.row_factory = sqlite3.Row
-            path_column = _get_path_column(conn)
+        with VectorStore(index_path) as vector_store:
+            with sqlite3.connect(index_path) as conn:
+                conn.row_factory = sqlite3.Row
+                path_column = _get_path_column(conn)
 
-            # Get total file count for progress reporting
-            total_files = conn.execute("SELECT COUNT(*) FROM files").fetchone()[0]
-            if total_files == 0:
-                return {"success": False, "error": "No files found in index"}
+                # Get total file count for progress reporting
+                total_files = conn.execute("SELECT COUNT(*) FROM files").fetchone()[0]
+                if total_files == 0:
+                    return {"success": False, "error": "No files found in index"}
 
-            if progress_callback:
-                progress_callback(f"Processing {total_files} files in batches of {FILE_BATCH_SIZE}...")
+                if progress_callback:
+                    progress_callback(f"Processing {total_files} files in batches of {FILE_BATCH_SIZE}...")
 
-            cursor = conn.execute(f"SELECT {path_column}, content, language FROM files")
-            batch_number = 0
+                cursor = conn.execute(f"SELECT {path_column}, content, language FROM files")
+                batch_number = 0
 
-            while True:
-                # Fetch a batch of files (streaming, not fetchall)
-                file_batch = cursor.fetchmany(FILE_BATCH_SIZE)
-                if not file_batch:
-                    break
+                while True:
+                    # Fetch a batch of files (streaming, not fetchall)
+                    file_batch = cursor.fetchmany(FILE_BATCH_SIZE)
+                    if not file_batch:
+                        break
 
-                batch_number += 1
-                batch_chunks_with_paths = []
-                files_in_batch_with_chunks = set()
+                    batch_number += 1
+                    batch_chunks_with_paths = []
+                    files_in_batch_with_chunks = set()
 
-                # Step 1: Chunking for the current file batch
-                for file_row in file_batch:
-                    file_path = file_row[path_column]
-                    content = file_row["content"]
-                    language = file_row["language"] or "python"
-                    try:
-                        chunks = chunker.chunk_sliding_window(
-                            content,
-                            file_path=file_path,
-                            language=language
-                        )
-                        if chunks:
-                            for chunk in chunks:
-                                batch_chunks_with_paths.append((chunk, file_path))
-                            files_in_batch_with_chunks.add(file_path)
-                    except Exception as e:
-                        logger.error(f"Failed to chunk {file_path}: {e}")
-                        failed_files.append((file_path, str(e)))
+                    # Step 1: Chunking for the current file batch
+                    for file_row in file_batch:
+                        file_path = file_row[path_column]
+                        content = file_row["content"]
+                        language = file_row["language"] or "python"
+                        try:
+                            chunks = chunker.chunk_sliding_window(
+                                content,
+                                file_path=file_path,
+                                language=language
+                            )
+                            if chunks:
+                                for chunk in chunks:
+                                    batch_chunks_with_paths.append((chunk, file_path))
+                                files_in_batch_with_chunks.add(file_path)
+                        except Exception as e:
+                            logger.error(f"Failed to chunk {file_path}: {e}")
+                            failed_files.append((file_path, str(e)))
 
-                if not batch_chunks_with_paths:
-                    continue
+                    if not batch_chunks_with_paths:
+                        continue
 
-                batch_chunk_count = len(batch_chunks_with_paths)
-                if progress_callback:
-                    progress_callback(f" Batch {batch_number}: {len(file_batch)} files, {batch_chunk_count} chunks")
+                    batch_chunk_count = len(batch_chunks_with_paths)
+                    if progress_callback:
+                        progress_callback(f" Batch {batch_number}: {len(file_batch)} files, {batch_chunk_count} chunks")
 
-                # Step 2: Generate embeddings for this batch
-                batch_embeddings = []
-                try:
-                    for i in range(0, batch_chunk_count, EMBEDDING_BATCH_SIZE):
-                        batch_end = min(i + EMBEDDING_BATCH_SIZE, batch_chunk_count)
-                        batch_contents = [chunk.content for chunk, _ in batch_chunks_with_paths[i:batch_end]]
-                        embeddings = embedder.embed(batch_contents)
-                        batch_embeddings.extend(embeddings)
-                except Exception as e:
-                    logger.error(f"Failed to generate embeddings for batch {batch_number}: {str(e)}")
-                    failed_files.extend([(file_row[path_column], str(e)) for file_row in file_batch])
-                    continue
+                    # Step 2: Generate embeddings for this batch
+                    batch_embeddings = []
+                    try:
+                        for i in range(0, batch_chunk_count, EMBEDDING_BATCH_SIZE):
+                            batch_end = min(i + EMBEDDING_BATCH_SIZE, batch_chunk_count)
+                            batch_contents = [chunk.content for chunk, _ in batch_chunks_with_paths[i:batch_end]]
+                            embeddings = embedder.embed(batch_contents)
+                            batch_embeddings.extend(embeddings)
+                    except Exception as e:
+                        logger.error(f"Failed to generate embeddings for batch {batch_number}: {str(e)}")
+                        failed_files.extend([(file_row[path_column], str(e)) for file_row in file_batch])
+                        continue
 
-                # Step 3: Assign embeddings to chunks
-                for (chunk, _), embedding in zip(batch_chunks_with_paths, batch_embeddings):
-                    chunk.embedding = embedding
+                    # Step 3: Assign embeddings to chunks
+                    for (chunk, _), embedding in zip(batch_chunks_with_paths, batch_embeddings):
+                        chunk.embedding = embedding
 
-                # Step 4: Store this batch to database immediately (releases memory)
-                try:
-                    vector_store.add_chunks_batch(batch_chunks_with_paths)
-                    total_chunks_created += batch_chunk_count
-                    total_files_processed += len(files_in_batch_with_chunks)
-                except Exception as e:
-                    logger.error(f"Failed to store batch {batch_number}: {str(e)}")
-                    failed_files.extend([(file_row[path_column], str(e)) for file_row in file_batch])
+                    # Step 4: Store this batch to database immediately (releases memory)
+                    try:
+                        vector_store.add_chunks_batch(batch_chunks_with_paths)
+                        total_chunks_created += batch_chunk_count
+                        total_files_processed += len(files_in_batch_with_chunks)
+                    except Exception as e:
+                        logger.error(f"Failed to store batch {batch_number}: {str(e)}")
+                        failed_files.extend([(file_row[path_column], str(e)) for file_row in file_batch])
 
-                # Memory is released here as batch_chunks_with_paths and batch_embeddings go out of scope
+                    # Memory is released here as batch_chunks_with_paths and batch_embeddings go out of scope
 
     except Exception as e:
         return {"success": False, "error": f"Failed to read or process files: {str(e)}"}


@@ -122,68 +122,3 @@ def render_file_inspect(path: str, language: str, symbols: Iterable[Symbol]) ->
     console.print(header)
     render_symbols(list(symbols), title="Discovered Symbols")
-
-
-def render_graph_results(results: list[dict[str, Any]], *, query_type: str, symbol: str) -> None:
-    """Render semantic graph query results.
-
-    Args:
-        results: List of relationship dicts
-        query_type: Type of query (callers, callees, inheritance)
-        symbol: Symbol name that was queried
-    """
-    if not results:
-        console.print(f"[yellow]No {query_type} found for symbol:[/yellow] {symbol}")
-        return
-
-    title_map = {
-        "callers": f"Callers of '{symbol}' ({len(results)} found)",
-        "callees": f"Callees of '{symbol}' ({len(results)} found)",
-        "inheritance": f"Inheritance relationships for '{symbol}' ({len(results)} found)"
-    }
-    table = Table(title=title_map.get(query_type, f"Graph Results ({len(results)})"))
-
-    if query_type == "callers":
-        table.add_column("Caller", style="green")
-        table.add_column("File", style="cyan", no_wrap=False, max_width=40)
-        table.add_column("Line", justify="right", style="yellow")
-        table.add_column("Type", style="dim")
-        for rel in results:
-            table.add_row(
-                rel.get("source_symbol", "-"),
-                rel.get("source_file", "-"),
-                str(rel.get("source_line", "-")),
-                rel.get("relationship_type", "-")
-            )
-    elif query_type == "callees":
-        table.add_column("Target", style="green")
-        table.add_column("File", style="cyan", no_wrap=False, max_width=40)
-        table.add_column("Line", justify="right", style="yellow")
-        table.add_column("Type", style="dim")
-        for rel in results:
-            table.add_row(
-                rel.get("target_symbol", "-"),
-                rel.get("target_file", "-") if rel.get("target_file") else rel.get("source_file", "-"),
-                str(rel.get("source_line", "-")),
-                rel.get("relationship_type", "-")
-            )
-    else:  # inheritance
-        table.add_column("Derived Class", style="green")
-        table.add_column("Base Class", style="magenta")
-        table.add_column("File", style="cyan", no_wrap=False, max_width=40)
-        table.add_column("Line", justify="right", style="yellow")
-        for rel in results:
-            table.add_row(
-                rel.get("source_symbol", "-"),
-                rel.get("target_symbol", "-"),
-                rel.get("source_file", "-"),
-                str(rel.get("source_line", "-"))
-            )
-
-    console.print(table)