refactor: 移除图索引功能,修复内存泄露,优化嵌入生成

主要更改:

1. 移除图索引功能 (graph indexing)
   - 删除 graph_analyzer.py 及相关迁移文件
   - 移除 CLI 的 graph 命令和 --enrich 标志
   - 清理 chain_search.py 中的图查询方法 (370行)
   - 删除相关测试文件

2. 修复嵌入生成内存问题
   - 重构 generate_embeddings.py 使用流式批处理
   - 改用 embedding_manager 的内存安全实现
   - 文件从 548 行精简到 259 行 (减少 52.7%)

3. 修复内存泄露
   - chain_search.py: quick_search 使用 with 语句管理 ChainSearchEngine
   - embedding_manager.py: 使用 with 语句管理 VectorStore
   - vector_store.py: 添加暴力搜索内存警告

4. 代码清理
   - 移除 Symbol 模型的 token_count 和 symbol_type 字段
   - 清理相关测试用例

测试: 760 passed, 7 skipped

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
catlog22
2025-12-21 16:22:03 +08:00
parent 15d5890861
commit 3e9a309079
19 changed files with 165 additions and 3909 deletions

View File

@@ -17,7 +17,7 @@ from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
from codexlens.entities import CodeRelationship, SearchResult, Symbol
from codexlens.entities import SearchResult, Symbol
from codexlens.errors import StorageError
@@ -237,116 +237,6 @@ class DirIndexStore:
conn.rollback()
raise StorageError(f"Failed to add file {name}: {exc}") from exc
def add_relationships(
    self,
    file_path: str | Path,
    relationships: List[CodeRelationship],
) -> int:
    """Persist the code relationships extracted from one source file.

    Relationships already stored for symbols of the file are deleted and
    replaced by the supplied ones.

    Args:
        file_path: Path to the source file
        relationships: CodeRelationship objects to store

    Returns:
        Number of relationships inserted; 0 when ``relationships`` is empty
        or the resolved path has no row in ``files``

    Raises:
        StorageError: If a database operation fails
    """
    if not relationships:
        return 0

    with self._lock:
        conn = self._get_connection()
        file_path_str = str(Path(file_path).resolve())

        try:
            file_row = conn.execute(
                "SELECT id FROM files WHERE full_path=?", (file_path_str,)
            ).fetchone()
            if not file_row:
                return 0
            file_id = int(file_row["id"])

            # Start from a clean slate for every symbol owned by this file.
            conn.execute(
                """
                DELETE FROM code_relationships
                WHERE source_symbol_id IN (
                    SELECT id FROM symbols WHERE file_id=?
                )
                """,
                (file_id,),
            )

            pending_rows = []
            unresolved_sources = []
            for rel in relationships:
                # Relationship sources may be qualified ("MyClass.my_method")
                # while the symbols table stores the simple name ("my_method").
                simple_name = rel.source_symbol.rpartition(".")[2]

                # Prefer the symbol whose line span encloses the source line.
                match = conn.execute(
                    """
                    SELECT id FROM symbols
                    WHERE file_id=? AND name=? AND start_line<=? AND end_line>=?
                    LIMIT 1
                    """,
                    (file_id, simple_name, rel.source_line, rel.source_line),
                ).fetchone()
                if not match:
                    # Fall back to any symbol of that name in the file.
                    match = conn.execute(
                        "SELECT id FROM symbols WHERE file_id=? AND name=? LIMIT 1",
                        (file_id, simple_name),
                    ).fetchone()

                if not match:
                    unresolved_sources.append(rel.source_symbol)
                    continue

                pending_rows.append((
                    int(match["id"]),
                    rel.target_symbol,
                    rel.relationship_type,
                    rel.source_line,
                    rel.target_file,
                ))

            # Report every relationship whose source symbol could not be found.
            if unresolved_sources:
                self.logger.warning(
                    "Failed to find source symbol IDs for %d relationships in %s: %s",
                    len(unresolved_sources),
                    file_path_str,
                    ", ".join(set(unresolved_sources))
                )

            if pending_rows:
                conn.executemany(
                    """
                    INSERT INTO code_relationships(
                        source_symbol_id, target_qualified_name, relationship_type,
                        source_line, target_file
                    )
                    VALUES(?, ?, ?, ?, ?)
                    """,
                    pending_rows,
                )

            conn.commit()
            return len(pending_rows)

        except sqlite3.DatabaseError as exc:
            conn.rollback()
            raise StorageError(f"Failed to add relationships: {exc}") from exc
def add_files_batch(
self, files: List[Tuple[str, Path, str, str, Optional[List[Symbol]]]]
) -> int:

View File

@@ -16,7 +16,6 @@ from typing import Dict, List, Optional, Set
from codexlens.config import Config
from codexlens.parsers.factory import ParserFactory
from codexlens.semantic.graph_analyzer import GraphAnalyzer
from codexlens.storage.dir_index import DirIndexStore
from codexlens.storage.path_mapper import PathMapper
from codexlens.storage.registry import ProjectInfo, RegistryStore
@@ -525,16 +524,6 @@ class IndexTreeBuilder:
symbols=indexed_file.symbols,
)
# Extract and store code relationships for graph visualization
if language_id in {"python", "javascript", "typescript"}:
graph_analyzer = GraphAnalyzer(language_id)
if graph_analyzer.is_available():
relationships = graph_analyzer.analyze_with_symbols(
text, file_path, indexed_file.symbols
)
if relationships:
store.add_relationships(file_path, relationships)
files_count += 1
symbols_count += len(indexed_file.symbols)
@@ -742,16 +731,6 @@ def _build_dir_worker(args: tuple) -> DirBuildResult:
symbols=indexed_file.symbols,
)
# Extract and store code relationships for graph visualization
if language_id in {"python", "javascript", "typescript"}:
graph_analyzer = GraphAnalyzer(language_id)
if graph_analyzer.is_available():
relationships = graph_analyzer.analyze_with_symbols(
text, item, indexed_file.symbols
)
if relationships:
store.add_relationships(item, relationships)
files_count += 1
symbols_count += len(indexed_file.symbols)

View File

@@ -1,57 +0,0 @@
"""
Migration 003: Add code relationships storage.
This migration introduces the `code_relationships` table to store semantic
relationships between code symbols (function calls, inheritance, imports).
This enables graph-based code navigation and dependency analysis.
"""
import logging
from sqlite3 import Connection
log = logging.getLogger(__name__)


def upgrade(db_conn: Connection):
    """
    Create the `code_relationships` table and its lookup indexes.

    - The table references `symbols (id)` with ON DELETE CASCADE
    - The target of a relationship is stored by qualified name
      (`target_qualified_name`), not as a symbol id
    - Every statement uses IF NOT EXISTS, so re-running is harmless
    - No commit is issued here

    Args:
        db_conn: The SQLite database connection.
    """
    create_table_sql = """
        CREATE TABLE IF NOT EXISTS code_relationships (
            id INTEGER PRIMARY KEY,
            source_symbol_id INTEGER NOT NULL,
            target_qualified_name TEXT NOT NULL,
            relationship_type TEXT NOT NULL,
            source_line INTEGER NOT NULL,
            target_file TEXT,
            FOREIGN KEY (source_symbol_id) REFERENCES symbols (id) ON DELETE CASCADE
        )
        """
    # One index per column that the table is created to be searched by.
    index_statements = (
        "CREATE INDEX IF NOT EXISTS idx_relationships_source ON code_relationships (source_symbol_id)",
        "CREATE INDEX IF NOT EXISTS idx_relationships_target ON code_relationships (target_qualified_name)",
        "CREATE INDEX IF NOT EXISTS idx_relationships_type ON code_relationships (relationship_type)",
        "CREATE INDEX IF NOT EXISTS idx_relationships_source_line ON code_relationships (source_line)",
    )

    cursor = db_conn.cursor()

    log.info("Creating 'code_relationships' table...")
    cursor.execute(create_table_sql)

    log.info("Creating indexes for code_relationships...")
    for statement in index_statements:
        cursor.execute(statement)

    log.info("Finished creating code_relationships table and indexes.")

View File

@@ -10,7 +10,7 @@ from dataclasses import asdict
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Tuple
from codexlens.entities import CodeRelationship, IndexedFile, SearchResult, Symbol
from codexlens.entities import IndexedFile, SearchResult, Symbol
from codexlens.errors import StorageError
@@ -420,167 +420,6 @@ class SQLiteStore:
}
def add_relationships(self, file_path: str | Path, relationships: List[CodeRelationship]) -> None:
    """Replace the stored code relationships of one indexed file.

    Relationships whose source symbol cannot be located in the file are
    silently left out.

    Args:
        file_path: Path to the file containing the relationships
        relationships: List of CodeRelationship objects to store

    Raises:
        StorageError: If the resolved path has no row in ``files``
    """
    if not relationships:
        return

    with self._lock:
        conn = self._get_connection()
        resolved_path = str(Path(file_path).resolve())

        file_row = conn.execute(
            "SELECT id FROM files WHERE path=?", (resolved_path,)
        ).fetchone()
        if not file_row:
            raise StorageError(f"File not found in index: {file_path}")
        file_id = int(file_row["id"])

        # Drop whatever was recorded earlier for symbols of this file.
        conn.execute(
            """
            DELETE FROM code_relationships
            WHERE source_symbol_id IN (
                SELECT id FROM symbols WHERE file_id=?
            )
            """,
            (file_id,)
        )

        pending_rows = []
        for rel in relationships:
            # Pick the narrowest same-named symbol that spans the source line.
            owner = conn.execute(
                """
                SELECT id FROM symbols
                WHERE file_id=? AND name=? AND start_line <= ? AND end_line >= ?
                ORDER BY (end_line - start_line) ASC
                LIMIT 1
                """,
                (file_id, rel.source_symbol, rel.source_line, rel.source_line)
            ).fetchone()
            if not owner:
                continue

            pending_rows.append((
                int(owner["id"]),
                rel.target_symbol,
                rel.relationship_type,
                rel.source_line,
                rel.target_file
            ))

        if pending_rows:
            conn.executemany(
                """
                INSERT INTO code_relationships(
                    source_symbol_id, target_qualified_name, relationship_type,
                    source_line, target_file
                )
                VALUES(?, ?, ?, ?, ?)
                """,
                pending_rows
            )

        conn.commit()
def query_relationships_by_target(
    self, target_name: str, *, limit: int = 100
) -> List[Dict[str, Any]]:
    """Query relationships by target symbol name (find all callers).

    The match on ``target_qualified_name`` is exact. Results are ordered by
    source file path, then by source line.

    Args:
        target_name: Name of the target symbol
        limit: Maximum number of results

    Returns:
        List of dicts with the keys ``source_symbol``, ``target_symbol``,
        ``relationship_type``, ``source_line``, ``source_file`` and
        ``target_file``
    """
    with self._lock:
        conn = self._get_connection()
        # The files table is addressed through its `path` column, exactly as
        # add_relationships and query_relationships_by_source do; the former
        # `f.full_path` reference did not match that column name.
        rows = conn.execute(
            """
            SELECT
                s.name AS source_symbol,
                r.target_qualified_name,
                r.relationship_type,
                r.source_line,
                f.path AS source_file,
                r.target_file
            FROM code_relationships r
            JOIN symbols s ON r.source_symbol_id = s.id
            JOIN files f ON s.file_id = f.id
            WHERE r.target_qualified_name = ?
            ORDER BY f.path, r.source_line
            LIMIT ?
            """,
            (target_name, limit)
        ).fetchall()

        return [
            {
                "source_symbol": row["source_symbol"],
                "target_symbol": row["target_qualified_name"],
                "relationship_type": row["relationship_type"],
                "source_line": row["source_line"],
                "source_file": row["source_file"],
                "target_file": row["target_file"],
            }
            for row in rows
        ]
def query_relationships_by_source(
self, source_symbol: str, source_file: str | Path, *, limit: int = 100
) -> List[Dict[str, Any]]:
"""Query relationships by source symbol (find what a symbol calls).
Args:
source_symbol: Name of the source symbol
source_file: File path containing the source symbol
limit: Maximum number of results
Returns:
List of dicts containing relationship info
"""
with self._lock:
conn = self._get_connection()
resolved_path = str(Path(source_file).resolve())
rows = conn.execute(
"""
SELECT
s.name AS source_symbol,
r.target_qualified_name,
r.relationship_type,
r.source_line,
f.path AS source_file,
r.target_file
FROM code_relationships r
JOIN symbols s ON r.source_symbol_id = s.id
JOIN files f ON s.file_id = f.id
WHERE s.name = ? AND f.path = ?
ORDER BY r.source_line
LIMIT ?
""",
(source_symbol, resolved_path, limit)
).fetchall()
return [
{
"source_symbol": row["source_symbol"],
"target_symbol": row["target_qualified_name"],
"relationship_type": row["relationship_type"],
"source_line": row["source_line"],
"source_file": row["source_file"],
"target_file": row["target_file"],
}
for row in rows
]
def _connect(self) -> sqlite3.Connection:
    """Legacy method for backward compatibility.

    Returns:
        Whatever ``self._get_connection()`` returns; this method adds no
        behavior of its own.
    """
    return self._get_connection()