feat: Enhance solution management — support filtering by solution ID and brief output; improve embedding model config loading
@@ -33,6 +33,7 @@ Queue formation command using **issue-queue-agent** that analyzes all bound solu
 | Get next item | `ccw issue next --json` | `Read('queues/*.json')` |
 | Update status | `ccw issue update <id> --status ...` | Direct file edit |
 | Sync from queue | `ccw issue update --from-queue` | Direct file edit |
+| **Read solution (brief)** | `ccw issue solution <id> --brief` | `Read('solutions/*.jsonl')` |

 **Output Options**:
 - `--brief`: JSON with minimal fields (id, status, counts)
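
Reviewer note: the brief payload for `ccw issue solution <id> --brief` is documented piecemeal across this diff (the table row above, the Phase 1 docs and the TypeScript changes below). As one TypeScript sketch of the record shape — field names are taken from the diff, the exact optionality is an assumption:

```ts
// Shape of one element in the array printed by `ccw issue solution <id> --brief`,
// reconstructed from this diff; illustrative, not the repo's exported type.
interface SolutionBrief {
  solution_id: string;      // sol.id
  is_bound: boolean;        // whether this is the issue's bound solution
  task_count: number;       // sol.tasks.length
  files_touched: string[];  // deduplicated files from task modification_points
}
```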
@@ -109,14 +110,14 @@ Phase 6: Status Update & Summary
 ### Phase 1: Solution Loading & Distribution

 **Data Loading:**
-- Load `issues.jsonl` and filter issues with `status === 'planned'` and `bound_solution_id`
+- Use `ccw issue list --status planned --brief` to get planned issues with `bound_solution_id`
 - If no planned issues found → display message, suggest `/issue:plan`

-**Solution Collection** (for each planned issue):
-- Read `solutions/{issue-id}.jsonl`
-- Find bound solution by `bound_solution_id`
-- If bound solution not found → warn and skip issue
-- Extract `files_touched` from all task `modification_points`
+**Solution Brief Loading** (for each planned issue):
+```bash
+ccw issue solution <issue-id> --brief
+# Returns: [{ solution_id, is_bound, task_count, files_touched[] }]
+```

 **Build Solution Objects:**
 ```json
@@ -130,19 +131,8 @@ Phase 6: Status Update & Summary
 ```

 **Multi-Queue Distribution** (if `--queues > 1`):
-```javascript
-const numQueues = args.queues || 1;
-if (numQueues > 1) {
-  // Partition solutions to minimize cross-group file conflicts
-  const groups = partitionByFileOverlap(solutions, numQueues);
-  // groups = [[sol1, sol2], [sol3, sol4], [sol5]]
-}
-```
-
-**Partitioning Strategy:**
-- Group solutions with overlapping `files_touched` into same queue
-- Use greedy assignment: assign each solution to queue with most file overlap
-- If no overlap, assign to queue with fewest solutions (balance load)
+- Use `files_touched` from brief output for partitioning
+- Group solutions with overlapping files into same queue

 **Output:** Array of solution objects (or N arrays if multi-queue)
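
Reviewer note: the greedy strategy dropped from the doc above (most file overlap wins; otherwise the least-loaded queue) still describes what `partitionByFileOverlap` is expected to do. A minimal TypeScript sketch under those assumptions — illustrative only, not the repo's implementation:

```ts
// Hypothetical greedy partitioning: solutions sharing files land in the
// same queue; solutions with no overlap balance load across queues.
interface Sol { solution_id: string; files_touched: string[]; }

function partitionByFileOverlap(solutions: Sol[], numQueues: number): Sol[][] {
  const groups: Sol[][] = Array.from({ length: numQueues }, () => []);
  const groupFiles = Array.from({ length: numQueues }, () => new Set<string>());

  for (const sol of solutions) {
    // File overlap between this solution and each group built so far.
    const overlaps = groupFiles.map(
      files => sol.files_touched.filter(f => files.has(f)).length,
    );
    const best = Math.max(...overlaps);
    const target = best > 0
      ? overlaps.indexOf(best) // most overlap wins
      : groups.reduce((min, g, i) => (g.length < groups[min].length ? i : min), 0);
    groups[target].push(sol);
    for (const f of sol.files_touched) groupFiles[target].add(f);
  }
  return groups;
}
```

Keeping overlapping solutions in one queue serializes conflicting file edits, while non-overlapping work can run in parallel queues.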
@@ -168,10 +158,11 @@ const queueIds = numQueues === 1

 ### Input
 ${JSON.stringify(solutions)}
+// Each object: { issue_id, solution_id, task_count, files_touched[], priority }

 ### Workflow

-Step 1: Build dependency graph from solutions (nodes=solutions, edges=file conflicts)
+Step 1: Build dependency graph from solutions (nodes=solutions, edges=file conflicts via files_touched)
 Step 2: Use Gemini CLI for conflict analysis (5 types: file, API, data, dependency, architecture)
 Step 3: For high-severity conflicts without clear resolution → add to `clarifications`
 Step 4: Calculate semantic priority (base from issue priority + task_count boost)
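
Reviewer note: Step 1's graph follows directly from `files_touched` — two solutions conflict when their file sets intersect. A hypothetical TypeScript sketch of the edge construction (names illustrative):

```ts
// Build undirected conflict edges between solutions whose files_touched overlap.
type BriefSol = { solution_id: string; files_touched: string[] };

function buildConflictEdges(solutions: BriefSol[]): Array<[string, string]> {
  const edges: Array<[string, string]> = [];
  for (let i = 0; i < solutions.length; i++) {
    const files = new Set(solutions[i].files_touched);
    for (let j = i + 1; j < solutions.length; j++) {
      if (solutions[j].files_touched.some(f => files.has(f))) {
        edges.push([solutions[i].solution_id, solutions[j].solution_id]);
      }
    }
  }
  return edges;
}
```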
@@ -201,6 +192,7 @@ Step 6: Write queue JSON + update index
 - Queue Item ID format: S-1, S-2, S-3, ...
 - Use provided Queue ID (do NOT generate new)
 - `clarifications` only present if high-severity unresolved conflicts exist
+- Use `files_touched` from input (already extracted by orchestrator)

 ### Done Criteria
 - [ ] Queue JSON written with all solutions ordered
@@ -203,6 +203,7 @@ interface IssueOptions {
   executor?: string;
   priority?: string;
   solution?: string;
+  solutionId?: string; // --solution-id <id> for filtering solutions
   result?: string;
   reason?: string;
   json?: boolean;
@@ -869,16 +870,16 @@ async function createAction(options: IssueOptions): Promise<void> {
 }

 /**
- * solution - Create solution from JSON data
- * Usage: ccw issue solution <issue-id> --data '{"tasks":[...]}'
- *        echo '{"tasks":[...]}' | ccw issue solution <issue-id>
- * Output: JSON with created solution (includes auto-generated ID)
+ * solution - Create or read solutions
+ * Create: ccw issue solution <issue-id> --data '{"tasks":[...]}'
+ * Read: ccw issue solution <issue-id> [--brief] [--solution-id <id>]
+ * Brief: Returns { solution_id, files_touched[], task_count } for each solution
  */
 async function solutionAction(issueId: string | undefined, options: IssueOptions): Promise<void> {
   if (!issueId) {
     console.error(chalk.red('Issue ID required'));
-    console.error(chalk.gray('Usage: ccw issue solution <issue-id> --data \'{"tasks":[...]}\''));
-    console.error(chalk.gray('       echo \'{"tasks":[...]}\' | ccw issue solution <issue-id>'));
+    console.error(chalk.gray('Usage: ccw issue solution <issue-id> [--brief] [--solution-id <id>]'));
+    console.error(chalk.gray('       ccw issue solution <issue-id> --data \'{"tasks":[...]}\''));
     process.exit(1);
   }
@@ -893,20 +894,84 @@ async function solutionAction(issueId: string | undefined, options: IssueOptions
     }
   }

-  if (!jsonData) {
-    console.error(chalk.red('JSON data required'));
-    console.error(chalk.gray('Usage: ccw issue solution <issue-id> --data \'{"tasks":[...]}\''));
-    console.error(chalk.gray('       echo \'{"tasks":[...]}\' | ccw issue solution <issue-id>'));
-    process.exit(1);
-  }
-
-  try {
-    const data = JSON.parse(jsonData);
-    const solution = createSolution(issueId, data);
-    console.log(JSON.stringify(solution, null, 2));
-  } catch (err) {
-    console.error(chalk.red((err as Error).message));
-    process.exit(1);
+  // CREATE mode: if --data provided
+  if (jsonData) {
+    try {
+      const data = JSON.parse(jsonData);
+      const solution = createSolution(issueId, data);
+      console.log(JSON.stringify(solution, null, 2));
+    } catch (err) {
+      console.error(chalk.red((err as Error).message));
+      process.exit(1);
+    }
+    return;
+  }
+
+  // READ mode: list solutions for issue
+  const issue = findIssue(issueId);
+  if (!issue) {
+    console.error(chalk.red(`Issue "${issueId}" not found`));
+    process.exit(1);
+  }
+
+  const solutions = readSolutions(issueId);
+  if (solutions.length === 0) {
+    if (options.json || options.brief) {
+      console.log('[]');
+    } else {
+      console.log(chalk.yellow(`No solutions found for ${issueId}`));
+    }
+    return;
+  }
+
+  // Filter by solution-id if specified
+  let targetSolutions = solutions;
+  if (options.solutionId) {
+    targetSolutions = solutions.filter(s => s.id === options.solutionId);
+    if (targetSolutions.length === 0) {
+      console.error(chalk.red(`Solution "${options.solutionId}" not found`));
+      process.exit(1);
+    }
+  }
+
+  // Brief mode: extract files_touched from modification_points
+  if (options.brief) {
+    const briefSolutions = targetSolutions.map(sol => {
+      const filesTouched = new Set<string>();
+      for (const task of sol.tasks) {
+        if (task.modification_points) {
+          for (const mp of task.modification_points) {
+            if (mp.file) filesTouched.add(mp.file);
+          }
+        }
+      }
+      return {
+        solution_id: sol.id,
+        is_bound: sol.is_bound,
+        task_count: sol.tasks.length,
+        files_touched: Array.from(filesTouched)
+      };
+    });
+    console.log(JSON.stringify(briefSolutions, null, 2));
+    return;
+  }
+
+  // JSON mode: full solutions
+  if (options.json) {
+    console.log(JSON.stringify(targetSolutions, null, 2));
+    return;
+  }
+
+  // Human-readable output
+  console.log(chalk.bold.cyan(`\nSolutions for ${issueId}:\n`));
+  for (const sol of targetSolutions) {
+    const marker = sol.is_bound ? chalk.green('◉ BOUND') : chalk.gray('○');
+    console.log(`${marker} ${sol.id}`);
+    console.log(chalk.gray(`  Tasks: ${sol.tasks.length}`));
+    if (sol.description) {
+      console.log(chalk.gray(`  ${sol.description.substring(0, 80)}...`));
+    }
+    console.log();
   }
 }
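
Reviewer note: a quick sanity sketch of the brief extraction the new READ path performs, run on hypothetical data (task shape taken from the `+` side above):

```ts
// Hypothetical solution record; only the fields the brief path reads.
const sol = {
  id: 'SOL-001',
  is_bound: true,
  tasks: [
    { modification_points: [{ file: 'src/a.ts' }, { file: 'src/b.ts' }] },
    { modification_points: [{ file: 'src/a.ts' }] }, // duplicate file: deduped below
  ],
};

const filesTouched = new Set<string>();
for (const task of sol.tasks) {
  for (const mp of task.modification_points ?? []) {
    if (mp.file) filesTouched.add(mp.file);
  }
}

console.log({
  solution_id: sol.id,
  is_bound: sol.is_bound,
  task_count: sol.tasks.length,
  files_touched: Array.from(filesTouched),
});
// → { solution_id: 'SOL-001', is_bound: true, task_count: 2,
//     files_touched: ['src/a.ts', 'src/b.ts'] }
```

Note the dispatch order in the new code: `--data` (create) wins, then `--brief`, then `--json`, then the human-readable listing.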
@@ -2429,6 +2494,8 @@ export async function issueCommand(
   console.log(chalk.gray('  list [issue-id]                List issues or tasks'));
   console.log(chalk.gray('  history                        List completed issues (from history)'));
   console.log(chalk.gray('  status [issue-id]              Show detailed status'));
+  console.log(chalk.gray('  solution <id>                  List solutions for issue'));
+  console.log(chalk.gray('  solution <id> --brief          Brief: solution_id, files_touched, task_count'));
   console.log(chalk.gray('  solution <id> --data \'{...}\'   Create solution (auto-generates ID)'));
   console.log(chalk.gray('  bind <issue-id> [sol-id]       Bind solution'));
   console.log(chalk.gray('  update <issue-id> --status <s> Update issue status'));
@@ -125,8 +125,8 @@ def index_init(
    workers: Optional[int] = typer.Option(None, "--workers", "-w", min=1, help="Parallel worker processes (default: auto-detect based on CPU count)."),
    force: bool = typer.Option(False, "--force", "-f", help="Force full reindex (skip incremental mode)."),
    no_embeddings: bool = typer.Option(False, "--no-embeddings", help="Skip automatic embedding generation (if semantic deps installed)."),
-    backend: str = typer.Option("fastembed", "--backend", "-b", help="Embedding backend: fastembed (local) or litellm (remote API)."),
-    model: str = typer.Option("code", "--model", "-m", help="Embedding model: profile name for fastembed (fast/code/multilingual/balanced) or model name for litellm (e.g. text-embedding-3-small)."),
+    backend: Optional[str] = typer.Option(None, "--backend", "-b", help="Embedding backend: fastembed (local) or litellm (remote API). Defaults to settings.json config."),
+    model: Optional[str] = typer.Option(None, "--model", "-m", help="Embedding model: profile name for fastembed or model name for litellm. Defaults to settings.json config."),
    max_workers: int = typer.Option(1, "--max-workers", min=1, help="Max concurrent API calls for embedding generation. Recommended: 4-8 for litellm backend."),
    json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
    verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."),
@@ -152,6 +152,12 @@ def index_init(
     """
     _configure_logging(verbose, json_mode)
     config = Config()

+    # Fallback to settings.json config if CLI params not provided
+    config.load_settings()  # Ensure settings are loaded
+    actual_backend = backend or config.embedding_backend
+    actual_model = model or config.embedding_model
+
     languages = _parse_languages(language)
     base_path = path.expanduser().resolve()
@@ -199,15 +205,15 @@ def index_init(

     # Validate embedding backend
     valid_backends = ["fastembed", "litellm"]
-    if backend not in valid_backends:
-        error_msg = f"Invalid embedding backend: {backend}. Must be one of: {', '.join(valid_backends)}"
+    if actual_backend not in valid_backends:
+        error_msg = f"Invalid embedding backend: {actual_backend}. Must be one of: {', '.join(valid_backends)}"
         if json_mode:
             print_json(success=False, error=error_msg)
         else:
             console.print(f"[red]Error:[/red] {error_msg}")
         raise typer.Exit(code=1)

-    backend_available, backend_error = is_embedding_backend_available(backend)
+    backend_available, backend_error = is_embedding_backend_available(actual_backend)

     if backend_available:
         # Use the index root directory (not the _index.db file)
@@ -215,8 +221,8 @@ def index_init(

         if not json_mode:
             console.print("\n[bold]Generating embeddings...[/bold]")
-            console.print(f"Backend: [cyan]{backend}[/cyan]")
-            console.print(f"Model: [cyan]{model}[/cyan]")
+            console.print(f"Backend: [cyan]{actual_backend}[/cyan]")
+            console.print(f"Model: [cyan]{actual_model}[/cyan]")
         else:
             # Output progress message for JSON mode (parsed by Node.js)
             print("Generating embeddings...", flush=True)
@@ -236,8 +242,8 @@ def index_init(

         embed_result = generate_embeddings_recursive(
             index_root,
-            embedding_backend=backend,
-            model_profile=model,
+            embedding_backend=actual_backend,
+            model_profile=actual_model,
             force=False,  # Don't force regenerate during init
             chunk_size=2000,
             progress_callback=progress_update,  # Always use callback
@@ -283,7 +289,7 @@ def index_init(
         }
     else:
         if not json_mode and verbose:
-            console.print(f"[dim]Embedding backend '{backend}' not available. Skipping embeddings.[/dim]")
+            console.print(f"[dim]Embedding backend '{actual_backend}' not available. Skipping embeddings.[/dim]")
         result["embeddings"] = {
             "generated": False,
             "error": backend_error or "Embedding backend not available",
@@ -10,6 +10,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import List, Optional, Dict, Any, Literal, Tuple, TYPE_CHECKING
+import json
 import logging
 import os
 import time
@@ -1241,20 +1242,60 @@ class ChainSearchEngine:
             stats=stats
         )

-        # Step 3: Generate query dense embedding using same model as index
+        # Step 3: Find centralized HNSW index and read model config
+        from codexlens.config import VECTORS_HNSW_NAME
+        central_hnsw_path = None
+        index_root = start_index.parent
+        current_dir = index_root
+        for _ in range(10):  # Limit search depth
+            candidate = current_dir / VECTORS_HNSW_NAME
+            if candidate.exists():
+                central_hnsw_path = candidate
+                index_root = current_dir  # Update to where HNSW was found
+                break
+            parent = current_dir.parent
+            if parent == current_dir:  # Reached root
+                break
+            current_dir = parent
+
+        # Step 4: Generate query dense embedding using same model as centralized index
         # Read embedding config to match the model used during indexing
         dense_coarse_time = time.time()
         try:
             from codexlens.semantic.factory import get_embedder

-            # Get embedding settings from config
+            # Get embedding settings from centralized index config (preferred) or fallback to self._config
             embedding_backend = "litellm"  # Default to API for dense
             embedding_model = "qwen3-embedding-sf"  # Default model
             use_gpu = True

+            # Try to read model config from centralized index's embeddings_config table
+            central_index_db = index_root / "_index.db"
+            if central_index_db.exists():
+                try:
+                    from codexlens.semantic.vector_store import VectorStore
+                    with VectorStore(central_index_db) as vs:
+                        model_config = vs.get_model_config()
+                        if model_config:
+                            embedding_backend = model_config.get("backend", embedding_backend)
+                            embedding_model = model_config.get("model_name", embedding_model)
+                            self.logger.debug(
+                                "Read model config from centralized index: %s/%s",
+                                embedding_backend, embedding_model
+                            )
+                except Exception as e:
+                    self.logger.debug("Failed to read centralized model config: %s", e)
+
+            # Fallback to self._config if not read from index
             if self._config is not None:
-                embedding_backend = getattr(self._config, "embedding_backend", "litellm")
-                embedding_model = getattr(self._config, "embedding_model", "qwen3-embedding-sf")
+                if embedding_backend == "litellm" and embedding_model == "qwen3-embedding-sf":
+                    # Only use config values if we didn't read from centralized index
+                    config_backend = getattr(self._config, "embedding_backend", None)
+                    config_model = getattr(self._config, "embedding_model", None)
+                    if config_backend:
+                        embedding_backend = config_backend
+                    if config_model:
+                        embedding_model = config_model
                 use_gpu = getattr(self._config, "embedding_use_gpu", True)

             # Create embedder matching index configuration
@@ -1269,31 +1310,54 @@ class ChainSearchEngine:
             self.logger.warning(f"Failed to generate dense query embedding: {exc}")
             return self.hybrid_cascade_search(query, source_path, k, coarse_k, options)

-        # Step 4: Dense coarse search using HNSW indexes
+        # Step 5: Dense coarse search using centralized HNSW index
         coarse_candidates: List[Tuple[int, float, Path]] = []  # (chunk_id, distance, index_path)
-        index_root = index_paths[0].parent if index_paths else None

-        for index_path in index_paths:
+        if central_hnsw_path is not None:
+            # Use centralized index
             try:
-                # Load HNSW index
                 from codexlens.semantic.ann_index import ANNIndex
-                ann_index = ANNIndex(index_path, dim=query_dense.shape[0])
-                if not ann_index.load():
-                    continue
-
-                if ann_index.count() == 0:
-                    continue
-
-                # Search HNSW index
-                ids, distances = ann_index.search(query_dense, top_k=coarse_k)
-                for chunk_id, dist in zip(ids, distances):
-                    coarse_candidates.append((chunk_id, dist, index_path))
+                ann_index = ANNIndex.create_central(
+                    index_root=index_root,
+                    dim=query_dense.shape[0],
+                )
+                if ann_index.load() and ann_index.count() > 0:
+                    # Search centralized HNSW index
+                    ids, distances = ann_index.search(query_dense, top_k=coarse_k)
+                    for chunk_id, dist in zip(ids, distances):
+                        coarse_candidates.append((chunk_id, dist, index_root / "_index.db"))
+                    self.logger.debug(
+                        "Centralized dense search: %d candidates from %s",
+                        len(ids), central_hnsw_path
+                    )
             except Exception as exc:
                 self.logger.debug(
-                    "Dense search failed for %s: %s", index_path, exc
+                    "Centralized dense search failed for %s: %s", central_hnsw_path, exc
                 )

+        # Fallback: try per-directory HNSW indexes if centralized not found
+        if not coarse_candidates:
+            for index_path in index_paths:
+                try:
+                    # Load HNSW index
+                    from codexlens.semantic.ann_index import ANNIndex
+                    ann_index = ANNIndex(index_path, dim=query_dense.shape[0])
+                    if not ann_index.load():
+                        continue
+
+                    if ann_index.count() == 0:
+                        continue
+
+                    # Search HNSW index
+                    ids, distances = ann_index.search(query_dense, top_k=coarse_k)
+                    for chunk_id, dist in zip(ids, distances):
+                        coarse_candidates.append((chunk_id, dist, index_path))
+
+                except Exception as exc:
+                    self.logger.debug(
+                        "Dense search failed for %s: %s", index_path, exc
+                    )
+
         if not coarse_candidates:
             self.logger.info("No dense candidates found, falling back to hybrid cascade")
             return self.hybrid_cascade_search(query, source_path, k, coarse_k, options)
@@ -1307,7 +1371,7 @@ class ChainSearchEngine:
             len(coarse_candidates), (time.time() - dense_coarse_time) * 1000
         )

-        # Step 5: Build SearchResult objects for cross-encoder reranking
+        # Step 6: Build SearchResult objects for cross-encoder reranking
         candidates_by_index: Dict[Path, List[int]] = {}
         for chunk_id, distance, index_path in coarse_candidates:
             if index_path not in candidates_by_index:
@@ -1320,29 +1384,63 @@ class ChainSearchEngine:

         for index_path, chunk_ids in candidates_by_index.items():
             try:
-                # Query semantic_chunks table directly
-                conn = sqlite3.connect(str(index_path))
-                conn.row_factory = sqlite3.Row
-                placeholders = ",".join("?" * len(chunk_ids))
-                cursor = conn.execute(
-                    f"""
-                    SELECT id, file_path, content, metadata, category
-                    FROM semantic_chunks
-                    WHERE id IN ({placeholders})
-                    """,
-                    chunk_ids
-                )
-                chunks_data = [
-                    {
-                        "id": row["id"],
-                        "file_path": row["file_path"],
-                        "content": row["content"],
-                        "metadata": row["metadata"],
-                        "category": row["category"],
-                    }
-                    for row in cursor.fetchall()
-                ]
-                conn.close()
+                # For centralized index, use _vectors_meta.db for chunk metadata
+                # which contains file_path, content, start_line, end_line
+                if central_hnsw_path is not None and index_path == index_root / "_index.db":
+                    # Use centralized metadata from _vectors_meta.db
+                    meta_db_path = index_root / "_vectors_meta.db"
+                    if meta_db_path.exists():
+                        conn = sqlite3.connect(str(meta_db_path))
+                        conn.row_factory = sqlite3.Row
+                        placeholders = ",".join("?" * len(chunk_ids))
+                        cursor = conn.execute(
+                            f"""
+                            SELECT chunk_id, file_path, content, start_line, end_line
+                            FROM chunk_metadata
+                            WHERE chunk_id IN ({placeholders})
+                            """,
+                            chunk_ids
+                        )
+                        chunks_data = [
+                            {
+                                "id": row["chunk_id"],
+                                "file_path": row["file_path"],
+                                "content": row["content"],
+                                "metadata": json.dumps({
+                                    "start_line": row["start_line"],
+                                    "end_line": row["end_line"]
+                                }),
+                                "category": "code" if row["file_path"].endswith(('.py', '.ts', '.js', '.java', '.go', '.rs', '.cpp', '.c')) else "doc",
+                            }
+                            for row in cursor.fetchall()
+                        ]
+                        conn.close()
+                    else:
+                        chunks_data = []
+                else:
+                    # Fall back to per-directory semantic_chunks table
+                    conn = sqlite3.connect(str(index_path))
+                    conn.row_factory = sqlite3.Row
+                    placeholders = ",".join("?" * len(chunk_ids))
+                    cursor = conn.execute(
+                        f"""
+                        SELECT id, file_path, content, metadata, category
+                        FROM semantic_chunks
+                        WHERE id IN ({placeholders})
+                        """,
+                        chunk_ids
+                    )
+                    chunks_data = [
+                        {
+                            "id": row["id"],
+                            "file_path": row["file_path"],
+                            "content": row["content"],
+                            "metadata": row["metadata"],
+                            "category": row["category"],
+                        }
+                        for row in cursor.fetchall()
+                    ]
+                    conn.close()

                 for chunk in chunks_data:
                     chunk_id = chunk.get("id")
@@ -605,13 +605,20 @@ class HybridSearchEngine:
         index_root = hnsw_path.parent
         model_config = None

-        # Try to get model config from the provided index_path first
+        # Try to get model config from the centralized index root first
+        # (not the sub-directory index_path, which may have outdated config)
         try:
             from codexlens.semantic.vector_store import VectorStore
-            with VectorStore(index_path) as vs:
-                model_config = vs.get_model_config()
-        except Exception:
-            pass
+            central_index_path = index_root / "_index.db"
+            if central_index_path.exists():
+                with VectorStore(central_index_path) as vs:
+                    model_config = vs.get_model_config()
+                    self.logger.debug(
+                        "Loaded model config from centralized index: %s",
+                        model_config
+                    )
+        except Exception as e:
+            self.logger.debug("Failed to load model config from centralized index: %s", e)

         # Detect dimension from HNSW file if model config not found
         if model_config is None: