Mirror of https://github.com/catlog22/Claude-Code-Workflow.git (synced 2026-02-05 01:50:27 +08:00)

feat: Enhance embedding generation to track current index path and improve metadata retrieval
@@ -17,8 +17,8 @@ Usage:
     # Use specific embedding model
     python generate_embeddings.py /path/to/_index.db --model code
 
-    # Generate embeddings for all indexes in a directory
-    python generate_embeddings.py --scan ~/.codexlens/indexes
+    # Generate centralized embeddings for all indexes in a directory
+    python generate_embeddings.py --centralized ~/.codexlens/indexes
 
     # Force regeneration
     python generate_embeddings.py /path/to/_index.db --force
@@ -27,8 +27,8 @@ Usage:
 import argparse
 import logging
 import sys
+import warnings
 from pathlib import Path
-from typing import List
 
 # Configure logging
 logging.basicConfig(
@@ -42,7 +42,7 @@ logger = logging.getLogger(__name__)
 try:
     from codexlens.cli.embedding_manager import (
         generate_embeddings,
-        generate_embeddings_recursive,
+        generate_dense_embeddings_centralized,
     )
     from codexlens.semantic import SEMANTIC_AVAILABLE
 except ImportError as exc:
@@ -135,13 +135,20 @@ def main():
     parser.add_argument(
         "index_path",
         type=Path,
-        help="Path to _index.db file or directory to scan"
+        help="Path to _index.db file or directory for centralized mode"
+    )
+
+    parser.add_argument(
+        "--centralized",
+        "-c",
+        action="store_true",
+        help="Use centralized vector storage (single HNSW index at project root)"
     )
 
     parser.add_argument(
         "--scan",
         action="store_true",
-        help="Scan directory tree for all _index.db files"
+        help="(Deprecated) Use --centralized instead"
     )
 
     parser.add_argument(
@@ -203,14 +210,25 @@ def main():
         logger.error(f"Path not found: {index_path}")
         sys.exit(1)
 
-    # Determine if scanning or single file
-    if args.scan or index_path.is_dir():
-        # Scan mode - use recursive implementation
+    # Handle deprecated --scan flag
+    use_centralized = args.centralized
+    if args.scan:
+        warnings.warn(
+            "--scan is deprecated, use --centralized instead",
+            DeprecationWarning
+        )
+        logger.warning("--scan is deprecated. Use --centralized instead.")
+        use_centralized = True
+
+    # Determine if using centralized mode or single file
+    if use_centralized or index_path.is_dir():
+        # Centralized mode - single HNSW index at project root
         if index_path.is_file():
-            logger.error("--scan requires a directory path")
+            logger.error("--centralized requires a directory path")
             sys.exit(1)
 
-        result = generate_embeddings_recursive(
+        logger.info(f"Generating centralized embeddings for: {index_path}")
+        result = generate_dense_embeddings_centralized(
             index_root=index_path,
             model_profile=args.model,
             force=args.force,
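The deprecation shim above maps the old flag onto the new one before any work happens. A minimal, self-contained sketch of the same pattern, assuming nothing beyond argparse and the warnings module (the parser here is illustrative, not the script's full argument set):

import argparse
import warnings

def parse_args(argv=None):
    """Illustrative parser: --scan survives as a deprecated alias for --centralized."""
    parser = argparse.ArgumentParser()
    parser.add_argument("index_path")
    parser.add_argument("--centralized", "-c", action="store_true")
    parser.add_argument("--scan", action="store_true")
    args = parser.parse_args(argv)
    if args.scan:
        # Warn once, then behave exactly as if --centralized had been passed.
        warnings.warn("--scan is deprecated, use --centralized instead", DeprecationWarning)
        args.centralized = True
    return args

# Both spellings end up in centralized mode.
assert parse_args(["~/.codexlens/indexes", "--scan"]).centralized
assert parse_args(["~/.codexlens/indexes", "--centralized"]).centralized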
@@ -225,13 +243,14 @@ def main():
         # Log summary
         data = result["result"]
         logger.info(f"\n{'='*60}")
-        logger.info("BATCH PROCESSING COMPLETE")
+        logger.info("CENTRALIZED EMBEDDING COMPLETE")
         logger.info(f"{'='*60}")
-        logger.info(f"Indexes processed: {data['indexes_successful']}/{data['indexes_processed']}")
-        logger.info(f"Total chunks created: {data['total_chunks_created']}")
-        logger.info(f"Total files processed: {data['total_files_processed']}")
-        if data['total_files_failed'] > 0:
-            logger.warning(f"Total files failed: {data['total_files_failed']}")
+        logger.info(f"Total chunks created: {data['chunks_created']}")
+        logger.info(f"Total files processed: {data['files_processed']}")
+        if data.get('files_failed', 0) > 0:
+            logger.warning(f"Total files failed: {data['files_failed']}")
+        logger.info(f"Central index: {data.get('central_index_path', 'N/A')}")
+        logger.info(f"Time: {data.get('elapsed_time', 0):.1f}s")
 
     else:
         # Single index mode
@@ -250,7 +269,7 @@ def main():
         logger.error(f"Failed: {result.get('error', 'Unknown error')}")
         sys.exit(1)
 
     logger.info("\n✓ Embeddings generation complete!")
     logger.info("\nYou can now use vector search:")
     logger.info("  codexlens search 'your query' --mode pure-vector")
 
@@ -1990,12 +1990,6 @@ def embeddings_generate(
         "--chunk-size",
         help="Maximum chunk size in characters.",
     ),
-    recursive: bool = typer.Option(
-        False,
-        "--recursive",
-        "-r",
-        help="Recursively process all _index.db files in directory tree.",
-    ),
    max_workers: int = typer.Option(
        1,
        "--max-workers",
@@ -2049,7 +2043,6 @@ def embeddings_generate(
 
     from codexlens.cli.embedding_manager import (
         generate_embeddings,
-        generate_embeddings_recursive,
         generate_dense_embeddings_centralized,
         scan_for_model_conflicts,
         check_global_model_lock,
@@ -2070,25 +2063,21 @@ def embeddings_generate(
     # Resolve path
     target_path = path.expanduser().resolve()
 
-    # Determine if we should use recursive mode
-    use_recursive = False
+    # Determine index path or root for centralized mode
     index_path = None
     index_root = None
 
     if target_path.is_file() and target_path.name == "_index.db":
         # Direct index file
         index_path = target_path
-        if recursive:
-            # Use parent directory for recursive processing
-            use_recursive = True
-            index_root = target_path.parent
+        index_root = target_path.parent
     elif target_path.is_dir():
-        if recursive:
-            # Recursive mode: process all _index.db files in directory tree
-            use_recursive = True
+        # Directory: Try to find index for this project
+        if centralized:
+            # Centralized mode uses directory as root
             index_root = target_path
         else:
-            # Non-recursive: Try to find index for this project
+            # Single index mode: find the specific index
             registry = RegistryStore()
             try:
                 registry.initialize()
@@ -2099,6 +2088,7 @@ def embeddings_generate(
                     console.print(f"[red]Error:[/red] No index found for {target_path}")
                     console.print("Run 'codexlens init' first to create an index")
                     raise typer.Exit(code=1)
+                index_root = index_path.parent
             finally:
                 registry.close()
     else:
@@ -2115,9 +2105,6 @@ def embeddings_generate(
         effective_root = index_root if index_root else (index_path.parent if index_path else target_path)
         console.print(f"Index root: [dim]{effective_root}[/dim]")
         console.print(f"Mode: [green]Centralized[/green]")
-    elif use_recursive:
-        console.print(f"Index root: [dim]{index_root}[/dim]")
-        console.print(f"Mode: [yellow]Recursive[/yellow]")
     else:
         console.print(f"Index: [dim]{index_path}[/dim]")
     console.print(f"Backend: [cyan]{backend}[/cyan]")
@@ -2154,7 +2141,7 @@ def embeddings_generate(
     # Pre-check for model conflicts (only if not forcing)
     if not force:
         # Determine the index root for conflict scanning
-        scan_root = index_root if use_recursive else (index_path.parent if index_path else None)
+        scan_root = index_root if index_root else (index_path.parent if index_path else None)
 
         if scan_root:
             conflict_result = scan_for_model_conflicts(scan_root, backend, model)
@@ -2208,16 +2195,6 @@ def embeddings_generate(
             progress_callback=progress_update,
             max_workers=max_workers,
         )
-    elif use_recursive:
-        result = generate_embeddings_recursive(
-            index_root,
-            embedding_backend=backend,
-            model_profile=model,
-            force=force,
-            chunk_size=chunk_size,
-            progress_callback=progress_update,
-            max_workers=max_workers,
-        )
     else:
         result = generate_embeddings(
             index_path,
@@ -2257,7 +2234,7 @@ def embeddings_generate(
     if centralized:
         # Centralized mode output
        elapsed = data.get("elapsed_time", 0)
        console.print(f"[green]✓[/green] Centralized embeddings generated successfully!")
        console.print(f" Model: {data.get('model_name', model)}")
        console.print(f" Chunks created: {data['chunks_created']:,}")
        console.print(f" Files processed: {data['files_processed']}")
@@ -2265,32 +2242,11 @@ def embeddings_generate(
            console.print(f" [yellow]Files failed: {data['files_failed']}[/yellow]")
        console.print(f" Central index: {data.get('central_index_path', 'N/A')}")
        console.print(f" Time: {elapsed:.1f}s")
-    elif use_recursive:
-        # Recursive mode output
-        console.print(f"[green]✓[/green] Recursive embeddings generation complete!")
-        console.print(f" Indexes processed: {data['indexes_processed']}")
-        console.print(f" Indexes successful: {data['indexes_successful']}")
-        if data['indexes_failed'] > 0:
-            console.print(f" [yellow]Indexes failed: {data['indexes_failed']}[/yellow]")
-        console.print(f" Total chunks created: {data['total_chunks_created']:,}")
-        console.print(f" Total files processed: {data['total_files_processed']}")
-        if data['total_files_failed'] > 0:
-            console.print(f" [yellow]Total files failed: {data['total_files_failed']}[/yellow]")
-        console.print(f" Model profile: {data['model_profile']}")
-
-        # Show details if verbose
-        if verbose and data.get('details'):
-            console.print("\n[dim]Index details:[/dim]")
-            for detail in data['details']:
-                status_icon = "[green]✓[/green]" if detail['success'] else "[red]✗[/red]"
-                console.print(f" {status_icon} {detail['path']}")
-                if not detail['success'] and detail.get('error'):
-                    console.print(f" [dim]Error: {detail['error']}[/dim]")
     else:
         # Single index mode output
         elapsed = data["elapsed_time"]
 
        console.print(f"[green]✓[/green] Embeddings generated successfully!")
        console.print(f" Model: {data['model_name']}")
        console.print(f" Chunks created: {data['chunks_created']:,}")
        console.print(f" Files processed: {data['files_processed']}")
@@ -848,8 +848,10 @@ def generate_embeddings(
     }
 
 
-def discover_all_index_dbs(index_root: Path) -> List[Path]:
-    """Recursively find all _index.db files in an index tree.
+def _discover_index_dbs_internal(index_root: Path) -> List[Path]:
+    """Internal helper to find all _index.db files (no deprecation warning).
+
+    Used internally by generate_dense_embeddings_centralized.
 
     Args:
         index_root: Root directory to scan for _index.db files
@@ -863,6 +865,30 @@ def discover_all_index_dbs(index_root: Path) -> List[Path]:
     return sorted(index_root.rglob("_index.db"))
 
 
+def discover_all_index_dbs(index_root: Path) -> List[Path]:
+    """Recursively find all _index.db files in an index tree.
+
+    .. deprecated::
+        This function is deprecated. Use centralized indexing with
+        ``generate_dense_embeddings_centralized`` instead, which handles
+        index discovery internally.
+
+    Args:
+        index_root: Root directory to scan for _index.db files
+
+    Returns:
+        Sorted list of paths to _index.db files
+    """
+    import warnings
+    warnings.warn(
+        "discover_all_index_dbs is deprecated. Use centralized indexing with "
+        "generate_dense_embeddings_centralized instead.",
+        DeprecationWarning,
+        stacklevel=2
+    )
+    return _discover_index_dbs_internal(index_root)
+
+
 def find_all_indexes(scan_dir: Path) -> List[Path]:
     """Find all _index.db files in directory tree.
 
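Because DeprecationWarning is filtered out by default outside __main__ and most test runners, callers of the old name only see this message when their warning filters allow it. A hypothetical caller-side check (the import path is an assumption, not confirmed by this diff; a local stand-in is used so the snippet runs on its own):

import warnings
from pathlib import Path

# Assumed import location; adjust to wherever the project actually exposes it.
# from codexlens.cli.embedding_manager import discover_all_index_dbs

def discover_all_index_dbs(index_root: Path):
    """Local stand-in mirroring the deprecated wrapper in the hunk above."""
    warnings.warn(
        "discover_all_index_dbs is deprecated. Use centralized indexing with "
        "generate_dense_embeddings_centralized instead.",
        DeprecationWarning,
        stacklevel=2,
    )
    return sorted(index_root.rglob("_index.db"))

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always", DeprecationWarning)
    discover_all_index_dbs(Path("."))
assert any(issubclass(w.category, DeprecationWarning) for w in caught)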
@@ -896,6 +922,11 @@ def generate_embeddings_recursive(
 ) -> Dict[str, any]:
     """Generate embeddings for all index databases in a project recursively.
 
+    .. deprecated::
+        This function is deprecated. Use ``generate_dense_embeddings_centralized``
+        instead, which creates a single centralized vector index for the entire project
+        rather than per-directory indexes.
+
     Args:
         index_root: Root index directory containing _index.db files
         embedding_backend: Embedding backend to use (fastembed or litellm).
@@ -921,6 +952,14 @@ def generate_embeddings_recursive(
     Returns:
         Aggregated result dictionary with generation statistics
     """
+    import warnings
+    warnings.warn(
+        "generate_embeddings_recursive is deprecated. Use "
+        "generate_dense_embeddings_centralized instead for centralized indexing.",
+        DeprecationWarning,
+        stacklevel=2
+    )
+
     # Get defaults from config if not specified
     (default_backend, default_model, default_gpu,
      default_endpoints, default_strategy, default_cooldown) = _get_embedding_defaults()
@@ -951,8 +990,8 @@ def generate_embeddings_recursive(
     else:
         max_workers = 1
 
-    # Discover all _index.db files
-    index_files = discover_all_index_dbs(index_root)
+    # Discover all _index.db files (using internal helper to avoid double deprecation warning)
+    index_files = _discover_index_dbs_internal(index_root)
 
     if not index_files:
         return {
@@ -1120,7 +1159,7 @@ def generate_dense_embeddings_centralized(
         return {"success": False, "error": backend_error or "Embedding backend not available"}
 
     # Discover all _index.db files
-    index_files = discover_all_index_dbs(index_root)
+    index_files = _discover_index_dbs_internal(index_root)
 
     if not index_files:
         return {
@@ -1197,6 +1236,8 @@ def generate_dense_embeddings_centralized(
     # Track chunk ID to file_path mapping for metadata
     chunk_id_to_info: Dict[int, Dict[str, Any]] = {}
     next_chunk_id = 1
+    # Track current index_path for source_index_db field
+    current_index_path: Optional[str] = None
 
     for idx, index_path in enumerate(index_files, 1):
         if progress_callback:
@@ -1206,6 +1247,9 @@ def generate_dense_embeddings_centralized(
                 rel_path = index_path
             progress_callback(f"Processing {idx}/{len(index_files)}: {rel_path}")
 
+        # Track current index_path for source_index_db
+        current_index_path = str(index_path)
+
         try:
             with sqlite3.connect(index_path) as conn:
                 conn.row_factory = sqlite3.Row
@@ -1250,6 +1294,7 @@ def generate_dense_embeddings_centralized(
                         "content": chunk.content,
                         "metadata": chunk.metadata,
                         "category": get_file_category(file_path) or "code",
+                        "source_index_db": current_index_path,
                     }
                     total_chunks_created += 1
 
@@ -1303,7 +1348,7 @@ def generate_dense_embeddings_centralized(
                 "end_line": metadata.get("end_line"),
                 "category": info.get("category"),
                 "metadata": metadata,
-                "source_index_db": None,  # Not tracked per-chunk currently
+                "source_index_db": info.get("source_index_db"),
             })
 
         meta_store.add_chunks(chunks_to_store)
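Taken together, the two metadata hunks mean each chunk now remembers which _index.db it was read from, and that value reaches the chunk_metadata rows instead of a hard-coded None. A toy sketch of that flow, with simplified stand-ins for the real chunking and store APIs:

from pathlib import Path
from typing import Any, Dict, List

def collect_chunks(index_files: List[Path]) -> Dict[int, Dict[str, Any]]:
    """Simplified collection loop: remember the source _index.db for every chunk."""
    chunk_id_to_info: Dict[int, Dict[str, Any]] = {}
    next_chunk_id = 1
    for index_path in index_files:
        current_index_path = str(index_path)  # tracked per index, as in the hunk above
        for content in ("chunk a", "chunk b"):  # stand-in for real chunking
            chunk_id_to_info[next_chunk_id] = {
                "content": content,
                "source_index_db": current_index_path,
            }
            next_chunk_id += 1
    return chunk_id_to_info

def to_rows(chunk_id_to_info: Dict[int, Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Rows handed to the metadata store; source_index_db is no longer always None."""
    return [
        {"chunk_id": cid, **info, "source_index_db": info.get("source_index_db")}
        for cid, info in chunk_id_to_info.items()
    ]

rows = to_rows(collect_chunks([Path("proj/a/_index.db"), Path("proj/b/_index.db")]))
print(rows[0]["source_index_db"])  # e.g. proj/a/_index.db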
@@ -1348,7 +1393,7 @@ def get_embeddings_status(index_root: Path) -> Dict[str, any]:
     Returns:
         Aggregated status with coverage statistics, model info, and timestamps
     """
-    index_files = discover_all_index_dbs(index_root)
+    index_files = _discover_index_dbs_internal(index_root)
 
     if not index_files:
         return {
@@ -1517,7 +1562,7 @@ def scan_for_model_conflicts(
         - conflicts: List of conflicting index paths with their configs
         - indexes_with_embeddings: Count of indexes that have embeddings
     """
-    index_files = discover_all_index_dbs(index_root)
+    index_files = _discover_index_dbs_internal(index_root)
 
     if not index_files:
         return {
@@ -693,8 +693,10 @@ class HybridSearchEngine:
                 vectors_meta_path, chunk_ids, score_map, category
             )
         except Exception as e:
-            self.logger.debug(
-                "Centralized metadata lookup failed, falling back: %s", e
+            self.logger.warning(
+                "Centralized metadata lookup failed, falling back to legacy traversal: %s. "
+                "Consider regenerating embeddings with: codexlens embeddings-generate --centralized",
+                e
             )
 
             # Fallback: traverse _index.db files (legacy path)
@@ -282,10 +282,27 @@ def get_optimal_providers(use_gpu: bool = True, with_device_options: bool = Fals
         return ["CPUExecutionProvider"]
 
     gpu_info = detect_gpu()
 
+    # Check if GPU was requested but not available - log warning
+    if not gpu_info.gpu_available:
+        try:
+            import onnxruntime as ort
+            available_providers = ort.get_available_providers()
+        except ImportError:
+            available_providers = []
+        logger.warning(
+            "GPU acceleration was requested, but no supported GPU provider (CUDA, DirectML) "
+            f"was found. Available providers: {available_providers}. Falling back to CPU."
+        )
+    else:
+        # Log which GPU provider is being used
+        gpu_providers = [p for p in gpu_info.onnx_providers if p != "CPUExecutionProvider"]
+        if gpu_providers:
+            logger.info(f"Using {gpu_providers[0]} for ONNX GPU acceleration")
+
     if not with_device_options:
         return gpu_info.onnx_providers
 
     # Build providers with device_id options for GPU providers
     device_id = get_selected_device_id()
     providers = []
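For orientation, this is roughly how a provider list like the one get_optimal_providers returns is consumed downstream; the model path, the helper name, and the filtering step are illustrative assumptions rather than code from this repository:

import onnxruntime as ort

def build_session(model_path: str, use_gpu: bool = True) -> ort.InferenceSession:
    """Sketch: request GPU providers only when the installed onnxruntime exposes them."""
    wanted = ["CUDAExecutionProvider", "DmlExecutionProvider"] if use_gpu else []
    available = ort.get_available_providers()
    providers = [p for p in wanted if p in available] + ["CPUExecutionProvider"]
    session = ort.InferenceSession(model_path, providers=providers)
    # get_providers() reports what was actually activated, which is the information
    # the new warning/info logging in the hunk above surfaces to the user.
    print("Active providers:", session.get_providers())
    return session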
@@ -232,55 +232,55 @@ class VectorMetadataStore:
         if not chunk_ids:
             return []
 
-        with self._lock:
+        # No lock needed for reads: WAL mode + thread-local connections ensure safety
         conn = self._get_connection()
         try:
             placeholders = ",".join("?" * len(chunk_ids))
 
             if category:
                 query = f'''
                     SELECT chunk_id, file_path, content, start_line, end_line,
                            category, metadata, source_index_db
                     FROM chunk_metadata
                     WHERE chunk_id IN ({placeholders}) AND category = ?
                 '''
                 params = list(chunk_ids) + [category]
             else:
                 query = f'''
                     SELECT chunk_id, file_path, content, start_line, end_line,
                            category, metadata, source_index_db
                     FROM chunk_metadata
                     WHERE chunk_id IN ({placeholders})
                 '''
                 params = list(chunk_ids)
 
             rows = conn.execute(query, params).fetchall()
 
             results = []
             for row in rows:
                 metadata = None
                 if row["metadata"]:
                     try:
                         metadata = json.loads(row["metadata"])
                     except json.JSONDecodeError:
                         metadata = {}
 
                 results.append({
                     "chunk_id": row["chunk_id"],
                     "file_path": row["file_path"],
                     "content": row["content"],
                     "start_line": row["start_line"],
                     "end_line": row["end_line"],
                     "category": row["category"],
                     "metadata": metadata or {},
                     "source_index_db": row["source_index_db"],
                 })
 
             return results
 
         except sqlite3.Error as e:
             logger.error("Failed to get chunks by IDs: %s", e)
             return []
 
     def get_chunk_count(self) -> int:
         """Get total number of chunks in store.
@@ -288,15 +288,15 @@ class VectorMetadataStore:
         Returns:
             Total chunk count.
         """
-        with self._lock:
+        # No lock needed for reads: WAL mode + thread-local connections ensure safety
         conn = self._get_connection()
         try:
             row = conn.execute(
                 "SELECT COUNT(*) FROM chunk_metadata"
             ).fetchone()
             return row[0] if row else 0
         except sqlite3.Error:
             return 0
 
     def clear(self) -> None:
         """Clear all metadata."""
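The lock removal in these two hunks relies on SQLite's WAL journal mode plus one connection per thread, so reads need no Python-level mutex. A standalone sketch of that pattern using only the standard library (file and table names are made up for the example):

import sqlite3
import threading
from pathlib import Path

class ThreadLocalReader:
    """Each thread owns its connection; WAL lets readers run alongside a writer."""

    def __init__(self, db_path: Path):
        self.db_path = db_path
        self._local = threading.local()
        init = sqlite3.connect(db_path)
        try:
            init.execute("PRAGMA journal_mode=WAL")
            init.execute(
                "CREATE TABLE IF NOT EXISTS chunk_metadata "
                "(chunk_id INTEGER PRIMARY KEY, content TEXT)"
            )
            init.commit()
        finally:
            init.close()

    def _get_connection(self) -> sqlite3.Connection:
        conn = getattr(self._local, "conn", None)
        if conn is None:
            conn = sqlite3.connect(self.db_path)
            self._local.conn = conn
        return conn

    def get_chunk_count(self) -> int:
        # No Python-level lock: WAL readers do not block each other or the writer.
        row = self._get_connection().execute("SELECT COUNT(*) FROM chunk_metadata").fetchone()
        return row[0] if row else 0

store = ThreadLocalReader(Path("example_vectors_meta.db"))
print(store.get_chunk_count())  # 0 on a fresh database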