From f37189dc641be6648b752141e6e2b9d9770cd912 Mon Sep 17 00:00:00 2001 From: catlog22 Date: Tue, 17 Mar 2026 17:17:24 +0800 Subject: [PATCH] feat: add APIEmbedder for remote embedding with multi-endpoint support - Introduced APIEmbedder class to handle embeddings via a remote HTTP API. - Implemented token packing to optimize batch sizes based on token limits. - Added support for multiple API endpoints with round-robin dispatching. - Included retry logic for API calls with exponential backoff on failures. - Enhanced indexing pipeline with file exclusion checks and smart chunking strategies. - Updated tests to cover new APIEmbedder functionality and ensure robustness. --- .claude/skills/workflow-lite-execute/SKILL.md | 39 +- .claude/skills/workflow-lite-plan/SKILL.md | 13 + .../skills/workflow-lite-test-review/SKILL.md | 442 ++++++++++++++ .../src/components/codexlens/OverviewTab.tsx | 4 +- .../codexlens/SchemaFormRenderer.tsx | 27 +- .../src/components/codexlens/envVarSchema.ts | 564 ++++++++---------- ccw/frontend/src/locales/en/codexlens.json | 48 +- ccw/frontend/src/locales/zh/codexlens.json | 48 +- ccw/frontend/src/types/codexlens.ts | 2 +- .../core/routes/codexlens/config-handlers.ts | 124 ++-- ccw/src/tools/smart-search.ts | 32 +- codex-lens-v2/pyproject.toml | 3 + codex-lens-v2/src/codexlens_search/bridge.py | 136 ++++- codex-lens-v2/src/codexlens_search/config.py | 47 ++ .../src/codexlens_search/embed/__init__.py | 3 +- .../src/codexlens_search/embed/api.py | 232 +++++++ .../src/codexlens_search/indexing/pipeline.py | 167 +++++- codex-lens-v2/tests/unit/test_embed.py | 178 ++++++ 18 files changed, 1633 insertions(+), 476 deletions(-) create mode 100644 .claude/skills/workflow-lite-test-review/SKILL.md create mode 100644 codex-lens-v2/src/codexlens_search/embed/api.py diff --git a/.claude/skills/workflow-lite-execute/SKILL.md b/.claude/skills/workflow-lite-execute/SKILL.md index ccc19430..de943dcf 100644 --- a/.claude/skills/workflow-lite-execute/SKILL.md +++ 
b/.claude/skills/workflow-lite-execute/SKILL.md @@ -412,17 +412,44 @@ if (hasUnresolvedIssues(reviewResult)) { **Artifact Substitution**: Replace `@{plan.json}` → `@${executionContext.session.artifacts.plan}`, `[@{exploration.json}]` → exploration files from artifacts (if exists). -### Step 5: Auto-Sync Project State +### Step 5: Chain to Test Review & Post-Completion -**Trigger**: After all executions complete (regardless of code review) +> **Note**: Spec sync (session:sync) is handled by lite-test-review's TR-Phase 5, not here. This avoids duplicate sync and ensures test fix changes are also captured. -**Operation**: `/workflow:session:sync -y "{summary}"` +**Map review tool**: Convert lite-execute's `codeReviewTool` to test-review tool name. -Summary priority: `originalUserInput` → `planObject.summary` → git log auto-infer. +```javascript +function mapReviewTool(codeReviewTool) { + if (!codeReviewTool || codeReviewTool === 'Skip') return 'agent' + if (/gemini/i.test(codeReviewTool)) return 'gemini' + if (/codex/i.test(codeReviewTool)) return 'codex' + return 'agent' +} +``` -### Step 6: Post-Completion Expansion +**Build testReviewContext and handoff**: -Ask user whether to expand into issues (test/enhance/refactor/doc). Selected items call `/issue:new "{summary} - {dimension}"`. 
+```javascript +testReviewContext = { + planObject: planObject, + taskFiles: executionContext?.taskFiles + || getTasks(planObject).map(t => ({ id: t.id, path: `${executionContext?.session?.folder}/.task/${t.id}.json` })), + reviewTool: mapReviewTool(executionContext?.codeReviewTool), + executionResults: previousExecutionResults, + originalUserInput: originalUserInput, + session: executionContext?.session || { + id: 'standalone', + folder: executionContext?.session?.folder || '.', + artifacts: { plan: null, task_dir: null } + } +} + +// Chain to lite-test-review (Mode 1: In-Memory) +Skill("lite-test-review") +// testReviewContext passed as global variable +``` + +**After test-review returns**: Ask user whether to expand into issues (enhance/refactor/doc). Selected items call `/issue:new "{summary} - {dimension}"`. ## Error Handling diff --git a/.claude/skills/workflow-lite-plan/SKILL.md b/.claude/skills/workflow-lite-plan/SKILL.md index 22e3ab17..e6bf3406 100644 --- a/.claude/skills/workflow-lite-plan/SKILL.md +++ b/.claude/skills/workflow-lite-plan/SKILL.md @@ -554,12 +554,25 @@ Skill("lite-execute") ├── explorations-manifest.json # Exploration index ├── planning-context.md # Evidence paths + understanding ├── plan.json # Plan overview (task_ids[]) +├── test-checklist.json # Generated by lite-test-review +├── test-review.md # Generated by lite-test-review └── .task/ ├── TASK-001.json ├── TASK-002.json └── ... ``` +## Chain: lite-plan → lite-execute → lite-test-review + +``` +lite-plan (LP-Phase 1-5) + └─ Skill("lite-execute") ← executionContext (global) + ├─ Step 1-4: Execute + Review + └─ Step 5: Skill("lite-test-review") ← testReviewContext (global) + ├─ TR-Phase 1-4: Test + Fix + └─ TR-Phase 5: Report + Sync specs +``` + ## 12. 
Error Handling | Error | Resolution | diff --git a/.claude/skills/workflow-lite-test-review/SKILL.md b/.claude/skills/workflow-lite-test-review/SKILL.md new file mode 100644 index 00000000..54060a1e --- /dev/null +++ b/.claude/skills/workflow-lite-test-review/SKILL.md @@ -0,0 +1,442 @@ +--- +name: workflow-lite-test-review +description: Post-execution test review and fix - chain from lite-execute or standalone. Reviews implementation against plan, runs tests, auto-fixes failures. +allowed-tools: Skill, Agent, AskUserQuestion, TodoWrite, Read, Write, Edit, Bash, Glob, Grep +--- + +# Workflow-Lite-Test-Review + +Test review and fix engine for lite-execute chain or standalone invocation. + +--- + +## Usage + +``` + Session path or auto-detect last session (required for standalone) +``` + +| Flag | Description | +|------|-------------| +| `--in-memory` | Mode 1: Chain from lite-execute via `testReviewContext` global variable | +| `--skip-fix` | Review only, do not auto-fix failures | + +## Input Modes + +### Mode 1: In-Memory Chain (from lite-execute) + +**Trigger**: `--in-memory` flag or `testReviewContext` global variable available + +**Input Source**: `testReviewContext` global variable set by lite-execute Step 5 + +**Behavior**: Skip session discovery (already resolved), inherit review tool from execution chain, proceed directly to TR-Phase 1. + +> **Note**: lite-execute Step 5 is the chain gate. Mode 1 invocation means execution is complete — proceed with test review. + +### Mode 2: Standalone + +**Trigger**: User calls with session path or `--last` + +**Behavior**: Discover session → load plan + tasks → detect test tool → proceed to TR-Phase 1. 
+ +```javascript +// Session discovery +let sessionPath, plan, taskFiles, reviewTool + +if (testReviewContext) { + // Mode 1: from lite-execute chain + sessionPath = testReviewContext.session.folder + plan = testReviewContext.planObject + taskFiles = testReviewContext.taskFiles.map(tf => JSON.parse(Read(tf.path))) + reviewTool = testReviewContext.reviewTool || 'agent' +} else { + // Mode 2: standalone + const args = $ARGUMENTS + if (args.includes('--last') || !args.trim() || args.trim() === '--skip-fix') { + const sessions = Glob('.workflow/.lite-plan/*/plan.json') + if (sessions.length === 0) { + console.error('No lite-plan sessions found.') + return + } + sessionPath = sessions[sessions.length - 1].replace(/[/\\]plan\.json$/, '') + } else { + sessionPath = args.replace(/--skip-fix|--last/g, '').trim() + } + plan = JSON.parse(Read(`${sessionPath}/plan.json`)) + taskFiles = plan.task_ids.map(id => JSON.parse(Read(`${sessionPath}/.task/${id}.json`))) + reviewTool = 'agent' // default for standalone +} + +const skipFix = $ARGUMENTS?.includes('--skip-fix') || false +``` + +## Phase Summary + +| Phase | Core Action | Output | +|-------|-------------|--------| +| TR-Phase 1 | Detect test framework + gather changes | testConfig | +| TR-Phase 2 | Review implementation against convergence criteria | reviewResults[] | +| TR-Phase 3 | Run tests + generate checklist | test-checklist.json | +| TR-Phase 4 | Auto-fix failures (iterative, max 3 rounds) | Fixed code + updated checklist | +| TR-Phase 5 | Output report + chain to session:sync | test-review.md | + +## TR-Phase 0: Initialize + +```javascript +const sessionId = sessionPath.split('/').pop() + +TodoWrite({ todos: [ + { content: "TR-Phase 1: Detect & Gather", status: "in_progress", activeForm: "Detecting test framework" }, + { content: "TR-Phase 2: Review Convergence", status: "pending" }, + { content: "TR-Phase 3: Run Tests", status: "pending" }, + { content: "TR-Phase 4: Auto-Fix", status: "pending" }, + { content: 
"TR-Phase 5: Report & Sync", status: "pending" } +]}) +``` + +## TR-Phase 1: Detect Test Framework & Gather Changes + +```javascript +// Detect test framework +const hasPackageJson = Glob('package.json').length > 0 +const hasPyproject = Glob('pyproject.toml').length > 0 +const hasCargo = Glob('Cargo.toml').length > 0 +const hasGoMod = Glob('go.mod').length > 0 + +let testConfig = { command: null, framework: null, type: null } + +if (hasPackageJson) { + const pkg = JSON.parse(Read('package.json')) + const scripts = pkg.scripts || {} + if (scripts.test) { testConfig = { command: 'npm test', framework: 'jest/vitest', type: 'node' } } + else if (scripts['test:unit']) { testConfig = { command: 'npm run test:unit', framework: 'jest/vitest', type: 'node' } } +} else if (hasPyproject) { + testConfig = { command: 'python -m pytest -v --tb=short', framework: 'pytest', type: 'python' } +} else if (hasCargo) { + testConfig = { command: 'cargo test', framework: 'cargo-test', type: 'rust' } +} else if (hasGoMod) { + testConfig = { command: 'go test ./...', framework: 'go-test', type: 'go' } +} + +// Gather git changes +const changedFiles = Bash('git diff --name-only HEAD~5..HEAD 2>/dev/null || git diff --name-only HEAD') + .split('\n').filter(Boolean) +const gitDiffStat = Bash('git diff --stat HEAD~5..HEAD 2>/dev/null || git diff --stat HEAD') + +console.log(`Test Framework: ${testConfig.framework || 'unknown'} | Command: ${testConfig.command || 'none'}`) +console.log(`Changed Files: ${changedFiles.length}`) +``` + +// TodoWrite: Phase 1 → completed, Phase 2 → in_progress + +## TR-Phase 2: Review Implementation Against Plan + +For each task, verify convergence criteria using agent or CLI review tool. 
+ +**Agent Review** (reviewTool === 'agent', default): + +```javascript +const reviewResults = [] + +for (const task of taskFiles) { + const criteria = task.convergence?.criteria || [] + const testReqs = task.test || {} + + // Find actual changed files matching task scope + const taskTargetFiles = (task.files || []) + .map(f => f.path) + .filter(p => changedFiles.some(c => c.includes(p) || p.includes(c))) + + // Read implementation to verify criteria + const fileContents = taskTargetFiles.map(p => { + try { return { path: p, content: Read(p) } } + catch { return { path: p, content: null } } + }).filter(f => f.content) + + const review = { + taskId: task.id, + title: task.title, + criteria_met: [], + criteria_unmet: [], + test_gaps: [], + files_reviewed: taskTargetFiles + } + + // Agent evaluates each criterion against file contents + for (const criterion of criteria) { + // Check: does implementation satisfy this criterion? + // Analyze file contents, look for expected patterns/functions/logic + const met = /* agent evaluation based on fileContents */ true_or_false + if (met) review.criteria_met.push(criterion) + else review.criteria_unmet.push(criterion) + } + + // Check test coverage gaps + const hasTestFiles = changedFiles.some(f => + /test[_\-.]|spec[_\-.]|\/__tests__\/|\/tests\//.test(f) + ) + if (testReqs.unit?.length > 0 && !hasTestFiles) { + testReqs.unit.forEach(u => review.test_gaps.push({ type: 'unit', desc: u })) + } + if (testReqs.integration?.length > 0) { + testReqs.integration.forEach(i => review.test_gaps.push({ type: 'integration', desc: i })) + } + + reviewResults.push(review) +} +``` + +**CLI Review** (reviewTool === 'gemini' or 'codex'): + +```javascript +if (reviewTool !== 'agent') { + const reviewId = `${sessionId}-tr-review` + Bash(`ccw cli -p "PURPOSE: Post-execution test review — verify convergence criteria met and identify test gaps +TASK: • Read plan.json and .task/*.json convergence criteria • For each criterion, check implementation in 
changed files • Identify missing unit/integration tests • List unmet criteria with file:line evidence +MODE: analysis +CONTEXT: @${sessionPath}/plan.json @${sessionPath}/.task/*.json @**/* | Memory: lite-execute completed, reviewing convergence +EXPECTED: Per-task verdict table (PASS/PARTIAL/FAIL) + unmet criteria list + test gap list +CONSTRAINTS: Read-only | Focus on convergence verification" --tool ${reviewTool} --mode analysis --id ${reviewId}`, { run_in_background: true }) + // STOP - wait for hook callback, then parse CLI output into reviewResults format +} +``` + +// TodoWrite: Phase 2 → completed, Phase 3 → in_progress + +## TR-Phase 3: Run Tests & Generate Checklist + +```javascript +// Build checklist structure +const testChecklist = { + session: sessionId, + plan_summary: plan.summary, + generated_at: new Date().toISOString(), + test_config: testConfig, + tasks: reviewResults.map(review => { + const task = taskFiles.find(t => t.id === review.taskId) + const testReqs = task.test || {} + return { + task_id: review.taskId, + title: review.title, + status: review.criteria_unmet.length === 0 ? 'PASS' + : review.criteria_met.length > 0 ? 
'PARTIAL' : 'FAIL', + convergence: { met: review.criteria_met, unmet: review.criteria_unmet }, + test_items: [ + ...(testReqs.unit || []).map(u => ({ type: 'unit', desc: u, status: 'pending' })), + ...(testReqs.integration || []).map(i => ({ type: 'integration', desc: i, status: 'pending' })), + ...(testReqs.success_metrics || []).map(m => ({ type: 'metric', desc: m, status: 'pending' })), + ...review.test_gaps.map(g => ({ type: g.type, desc: g.desc, status: 'missing' })) + ] + } + }), + execution: null +} + +// Run tests if framework detected +if (testConfig.command) { + console.log(`Running: ${testConfig.command}`) + const testResult = Bash(testConfig.command, { timeout: 300000 }) + + const passed = /(\d+) passed/.test(testResult) || /PASSED/.test(testResult) || /ok \d+/.test(testResult) + const failMatch = testResult.match(/(\d+) failed/) + const hasFail = failMatch || /FAILED/.test(testResult) || /FAIL/.test(testResult) + + testChecklist.execution = { + command: testConfig.command, + timestamp: new Date().toISOString(), + raw_output: testResult.slice(-3000), // keep tail for error context + overall: hasFail ? 'FAIL' : (passed ? 'PASS' : 'UNKNOWN'), + fail_count: failMatch ? parseInt(failMatch[1]) : (hasFail ? -1 : 0) + } + + console.log(`Result: ${testChecklist.execution.overall}`) +} else { + console.log('No test command detected. Skipping test execution.') +} + +Write(`${sessionPath}/test-checklist.json`, JSON.stringify(testChecklist, null, 2)) +``` + +// TodoWrite: Phase 3 → completed, Phase 4 → in_progress + +## TR-Phase 4: Auto-Fix Failures (Iterative) + +**Skip if**: `skipFix === true` OR `testChecklist.execution?.overall !== 'FAIL'` + +**Max iterations**: 3 + +```javascript +if (skipFix || !testChecklist.execution || testChecklist.execution.overall !== 'FAIL') { + console.log(testChecklist.execution?.overall === 'PASS' + ? 'All tests passed. Skipping fix phase.' 
+ : 'Skipping auto-fix (--skip-fix or no test execution).') + // TodoWrite: Phase 4 → completed (skipped) +} else { + let iteration = 0 + const MAX_ITERATIONS = 3 + + while (iteration < MAX_ITERATIONS && testChecklist.execution.overall === 'FAIL') { + iteration++ + console.log(`\n--- Fix Iteration ${iteration}/${MAX_ITERATIONS} ---`) + + // Use test-fix-agent for fixing + Agent({ + subagent_type: "test-fix-agent", + run_in_background: false, + description: `Fix tests (iter ${iteration})`, + prompt: `## Test Fix Iteration ${iteration}/${MAX_ITERATIONS} + +**Test Command**: ${testConfig.command} +**Framework**: ${testConfig.framework} +**Session**: ${sessionPath} + +### Failing Output (last 3000 chars) +\`\`\` +${testChecklist.execution.raw_output} +\`\`\` + +### Plan Context +**Summary**: ${plan.summary} +**Tasks**: ${taskFiles.map(t => `${t.id}: ${t.title}`).join(' | ')} + +### Instructions +1. Analyze test failure output to identify root cause +2. Fix the SOURCE CODE (not tests) unless tests themselves are wrong +3. Run \`${testConfig.command}\` to verify fix +4. If fix introduces new failures, revert and try alternative approach +5. Return: what was fixed, which files changed, test result after fix` + }) + + // Re-run tests after fix + const retestResult = Bash(testConfig.command, { timeout: 300000 }) + const hasFail = /failed|FAIL/.test(retestResult) + + testChecklist.execution = { + command: testConfig.command, + timestamp: new Date().toISOString(), + raw_output: retestResult.slice(-3000), + overall: hasFail ? 'FAIL' : 'PASS', + fix_iteration: iteration + } + + Write(`${sessionPath}/test-checklist.json`, JSON.stringify(testChecklist, null, 2)) + + if (!hasFail) { + console.log(`Tests passed after iteration ${iteration}.`) + break + } + } + + if (testChecklist.execution.overall === 'FAIL') { + console.log(`Tests still failing after ${MAX_ITERATIONS} iterations. 
Manual investigation needed.`) + } +} +``` + +// TodoWrite: Phase 4 → completed, Phase 5 → in_progress + +## TR-Phase 5: Report & Sync + +> **CHECKPOINT**: This step is MANDATORY. Always generate report and trigger sync. + +```javascript +// Generate markdown report +const report = `# Test Review Report + +**Session**: ${sessionId} +**Summary**: ${plan.summary} +**Generated**: ${new Date().toISOString()} +**Test Framework**: ${testConfig.framework || 'unknown'} + +## Task Verdicts + +| Task | Status | Convergence | Test Items | Gaps | +|------|--------|-------------|------------|------| +${testChecklist.tasks.map(t => + `| ${t.task_id} | ${t.status} | ${t.convergence.met.length}/${t.convergence.met.length + t.convergence.unmet.length} | ${t.test_items.length} | ${t.test_items.filter(i => i.status === 'missing').length} |` +).join('\n')} + +## Unmet Criteria + +${testChecklist.tasks.filter(t => t.convergence.unmet.length > 0).map(t => + `### ${t.task_id}: ${t.title}\n${t.convergence.unmet.map(u => \`- [ ] \${u}\`).join('\n')}` +).join('\n\n') || 'All criteria met.'} + +## Test Gaps + +${testChecklist.tasks.flatMap(t => t.test_items.filter(i => i.status === 'missing')).map(i => + \`- [ ] (\${i.type}) \${i.desc}\` +).join('\n') || 'No gaps detected.'} + +${testChecklist.execution ? `## Test Execution + +**Command**: \\\`${testChecklist.execution.command}\\\` +**Result**: ${testChecklist.execution.overall} +${testChecklist.execution.fix_iteration ? 
`**Fixed in iteration**: ${testChecklist.execution.fix_iteration}` : ''} +` : '## Test Execution\n\nNo test framework detected.'} +` + +Write(`${sessionPath}/test-review.md`, report) +console.log(`Report: ${sessionPath}/test-review.md`) +console.log(`Checklist: ${sessionPath}/test-checklist.json`) + +// Chain to session:sync +Skill({ skill: "workflow:session:sync", args: `-y "Test review: ${testChecklist.execution?.overall || 'no-test'} — ${plan.summary}"` }) +``` + +// TodoWrite: Phase 5 → completed + +**Display summary**: +```javascript +console.log(` +── Test Review Complete ── +${testChecklist.tasks.map(t => { + const icon = t.status === 'PASS' ? '[PASS]' : t.status === 'PARTIAL' ? '[PARTIAL]' : '[FAIL]' + return `${icon} ${t.task_id}: ${t.title} (${t.convergence.met.length}/${t.convergence.met.length + t.convergence.unmet.length})` +}).join('\n')} +Test: ${testChecklist.execution?.overall || 'skipped'}${testChecklist.execution?.fix_iteration ? ` (fixed iter ${testChecklist.execution.fix_iteration})` : ''} +`) +``` + +## Data Structures + +### testReviewContext (Input - Mode 1, set by lite-execute) + +```javascript +{ + planObject: { /* same as executionContext.planObject */ }, + taskFiles: [{ id: string, path: string }], + reviewTool: "agent" | "gemini" | "codex", // inherited from lite-execute codeReviewTool + executionResults: [...], // previousExecutionResults from lite-execute + originalUserInput: string, + session: { + id: string, + folder: string, + artifacts: { plan: string, task_dir: string } + } +} +``` + +## Session Folder Structure (after test-review) + +``` +.workflow/.lite-plan/{session-id}/ +├── exploration-*.json +├── explorations-manifest.json +├── planning-context.md +├── plan.json +├── .task/TASK-*.json +├── test-checklist.json # NEW: structured test results +└── test-review.md # NEW: human-readable report +``` + +## Error Handling + +| Error | Resolution | +|-------|------------| +| No session found | "No lite-plan sessions found. 
Run lite-plan first." | +| Missing plan.json | "Invalid session: missing plan.json at {path}" | +| No test framework | Skip TR-Phase 3 execution, still generate review report | +| Test timeout | Capture partial output, report as FAIL | +| Fix agent fails | Log iteration, continue to next or stop at max | +| Sync fails | Log warning, do not block report generation | diff --git a/ccw/frontend/src/components/codexlens/OverviewTab.tsx b/ccw/frontend/src/components/codexlens/OverviewTab.tsx index 070fc489..65faa539 100644 --- a/ccw/frontend/src/components/codexlens/OverviewTab.tsx +++ b/ccw/frontend/src/components/codexlens/OverviewTab.tsx @@ -16,7 +16,6 @@ import { cn } from '@/lib/utils'; import type { CodexLensVenvStatus, CodexLensConfig } from '@/lib/api'; import { IndexOperations } from './IndexOperations'; import { FileWatcherCard } from './FileWatcherCard'; -import { LspServerCard } from './LspServerCard'; interface OverviewTabProps { installed: boolean; @@ -145,9 +144,8 @@ export function OverviewTab({ installed, status, config, isLoading, onRefresh }: {/* Service Management */} -
+
-
{/* Index Operations */} diff --git a/ccw/frontend/src/components/codexlens/SchemaFormRenderer.tsx b/ccw/frontend/src/components/codexlens/SchemaFormRenderer.tsx index ddc9ee75..3fbb506b 100644 --- a/ccw/frontend/src/components/codexlens/SchemaFormRenderer.tsx +++ b/ccw/frontend/src/components/codexlens/SchemaFormRenderer.tsx @@ -11,7 +11,6 @@ import { ArrowUpDown, Cpu, GitBranch, - Scissors, type LucideIcon, } from 'lucide-react'; import { Label } from '@/components/ui/Label'; @@ -41,7 +40,6 @@ const iconMap: Record = { 'arrow-up-down': ArrowUpDown, cpu: Cpu, 'git-branch': GitBranch, - scissors: Scissors, }; interface SchemaFormRendererProps { @@ -214,12 +212,12 @@ function FieldRenderer({ case 'model-select': { // Determine backend type from related backend env var - const isEmbedding = field.key.includes('EMBEDDING'); + const isEmbedding = field.key.includes('EMBED'); const backendKey = isEmbedding ? 'CODEXLENS_EMBEDDING_BACKEND' : 'CODEXLENS_RERANKER_BACKEND'; const backendValue = allValues[backendKey]; - const backendType = (backendValue === 'api' || backendValue === 'litellm') ? 'api' : 'local'; + const backendType = backendValue === 'api' ? 'api' : 'local'; return (
@@ -241,6 +239,27 @@ function FieldRenderer({ ); } + case 'password': + return ( +
+ + onChange(e.target.value)} + placeholder={field.placeholder} + disabled={disabled} + autoComplete="off" + /> +
+ ); + case 'text': default: return ( diff --git a/ccw/frontend/src/components/codexlens/envVarSchema.ts b/ccw/frontend/src/components/codexlens/envVarSchema.ts index 73814690..5034b0f4 100644 --- a/ccw/frontend/src/components/codexlens/envVarSchema.ts +++ b/ccw/frontend/src/components/codexlens/envVarSchema.ts @@ -1,8 +1,8 @@ // ======================================== -// CodexLens Environment Variable Schema +// CodexLens v2 Environment Variable Schema // ======================================== -// TypeScript port of ENV_VAR_GROUPS from codexlens-manager.js -// Defines structured groups for CodexLens configuration +// Defines structured groups for codexlens-search v2 configuration. +// Env var names match what the Python bridge CLI reads. import type { EnvVarGroupsSchema } from '@/types/codexlens'; @@ -20,20 +20,38 @@ export const envVarGroupsSchema: EnvVarGroupsSchema = { default: 'local', settingsPath: 'embedding.backend', }, - CODEXLENS_EMBEDDING_MODEL: { - key: 'CODEXLENS_EMBEDDING_MODEL', + CODEXLENS_EMBED_API_URL: { + key: 'CODEXLENS_EMBED_API_URL', + labelKey: 'codexlens.envField.apiUrl', + type: 'text', + placeholder: 'https://api.siliconflow.cn/v1', + default: '', + settingsPath: 'embedding.api_url', + showWhen: (env) => env['CODEXLENS_EMBEDDING_BACKEND'] === 'api', + }, + CODEXLENS_EMBED_API_KEY: { + key: 'CODEXLENS_EMBED_API_KEY', + labelKey: 'codexlens.envField.apiKey', + type: 'password', + placeholder: 'sk-...', + default: '', + settingsPath: 'embedding.api_key', + showWhen: (env) => env['CODEXLENS_EMBEDDING_BACKEND'] === 'api', + }, + CODEXLENS_EMBED_API_MODEL: { + key: 'CODEXLENS_EMBED_API_MODEL', labelKey: 'codexlens.envField.model', type: 'model-select', placeholder: 'Select or enter model...', - default: 'fast', - settingsPath: 'embedding.model', - localModels: [ - { - group: 'FastEmbed Profiles', - items: ['fast', 'code', 'base', 'minilm', 'multilingual', 'balanced'], - }, - ], + default: '', + settingsPath: 'embedding.api_model', + 
showWhen: (env) => env['CODEXLENS_EMBEDDING_BACKEND'] === 'api', + localModels: [], apiModels: [ + { + group: 'SiliconFlow', + items: ['BAAI/bge-m3', 'BAAI/bge-large-zh-v1.5', 'BAAI/bge-large-en-v1.5'], + }, { group: 'OpenAI', items: ['text-embedding-3-small', 'text-embedding-3-large', 'text-embedding-ada-002'], @@ -44,66 +62,90 @@ export const envVarGroupsSchema: EnvVarGroupsSchema = { }, { group: 'Voyage', - items: ['voyage-3', 'voyage-3-lite', 'voyage-code-3', 'voyage-multilingual-2'], - }, - { - group: 'SiliconFlow', - items: ['BAAI/bge-m3', 'BAAI/bge-large-zh-v1.5', 'BAAI/bge-large-en-v1.5'], + items: ['voyage-3', 'voyage-3-lite', 'voyage-code-3'], }, { group: 'Jina', - items: ['jina-embeddings-v3', 'jina-embeddings-v2-base-en', 'jina-embeddings-v2-base-zh'], + items: ['jina-embeddings-v3', 'jina-embeddings-v2-base-en'], }, ], }, - CODEXLENS_AUTO_EMBED_MISSING: { - key: 'CODEXLENS_AUTO_EMBED_MISSING', - labelKey: 'codexlens.envField.autoEmbedMissing', - type: 'checkbox', - default: 'true', - settingsPath: 'embedding.auto_embed_missing', + CODEXLENS_EMBED_API_ENDPOINTS: { + key: 'CODEXLENS_EMBED_API_ENDPOINTS', + labelKey: 'codexlens.envField.multiEndpoints', + type: 'text', + placeholder: 'url1|key1|model1,url2|key2|model2', + default: '', + settingsPath: 'embedding.api_endpoints', + showWhen: (env) => env['CODEXLENS_EMBEDDING_BACKEND'] === 'api', + }, + CODEXLENS_EMBED_DIM: { + key: 'CODEXLENS_EMBED_DIM', + labelKey: 'codexlens.envField.embedDim', + type: 'number', + placeholder: '384', + default: '384', + settingsPath: 'embedding.dim', + min: 64, + max: 4096, + showWhen: (env) => env['CODEXLENS_EMBEDDING_BACKEND'] === 'api', + }, + CODEXLENS_EMBED_API_CONCURRENCY: { + key: 'CODEXLENS_EMBED_API_CONCURRENCY', + labelKey: 'codexlens.envField.apiConcurrency', + type: 'number', + placeholder: '4', + default: '4', + settingsPath: 'embedding.api_concurrency', + min: 1, + max: 32, + showWhen: (env) => env['CODEXLENS_EMBEDDING_BACKEND'] === 'api', + }, + 
CODEXLENS_EMBED_API_MAX_TOKENS: { + key: 'CODEXLENS_EMBED_API_MAX_TOKENS', + labelKey: 'codexlens.envField.maxTokensPerBatch', + type: 'number', + placeholder: '8192', + default: '8192', + settingsPath: 'embedding.api_max_tokens_per_batch', + min: 512, + max: 65536, + showWhen: (env) => env['CODEXLENS_EMBEDDING_BACKEND'] === 'api', + }, + CODEXLENS_EMBEDDING_MODEL: { + key: 'CODEXLENS_EMBEDDING_MODEL', + labelKey: 'codexlens.envField.localModel', + type: 'model-select', + placeholder: 'Select local model...', + default: 'BAAI/bge-small-en-v1.5', + settingsPath: 'embedding.model', + showWhen: (env) => env['CODEXLENS_EMBEDDING_BACKEND'] !== 'api', + localModels: [ + { + group: 'FastEmbed Profiles', + items: ['small', 'base', 'large', 'code'], + }, + ], + apiModels: [], }, CODEXLENS_USE_GPU: { key: 'CODEXLENS_USE_GPU', labelKey: 'codexlens.envField.useGpu', type: 'select', - options: ['true', 'false'], - default: 'true', - settingsPath: 'embedding.use_gpu', - showWhen: (env) => env['CODEXLENS_EMBEDDING_BACKEND'] === 'local', + options: ['auto', 'cuda', 'cpu'], + default: 'auto', + settingsPath: 'embedding.device', + showWhen: (env) => env['CODEXLENS_EMBEDDING_BACKEND'] !== 'api', }, - CODEXLENS_EMBEDDING_POOL_ENABLED: { - key: 'CODEXLENS_EMBEDDING_POOL_ENABLED', - labelKey: 'codexlens.envField.highAvailability', - type: 'select', - options: ['true', 'false'], - default: 'false', - settingsPath: 'embedding.pool_enabled', - showWhen: (env) => env['CODEXLENS_EMBEDDING_BACKEND'] === 'api', - }, - CODEXLENS_EMBEDDING_STRATEGY: { - key: 'CODEXLENS_EMBEDDING_STRATEGY', - labelKey: 'codexlens.envField.loadBalanceStrategy', - type: 'select', - options: ['round_robin', 'latency_aware', 'weighted_random'], - default: 'latency_aware', - settingsPath: 'embedding.strategy', - showWhen: (env) => - env['CODEXLENS_EMBEDDING_BACKEND'] === 'api' && - env['CODEXLENS_EMBEDDING_POOL_ENABLED'] === 'true', - }, - CODEXLENS_EMBEDDING_COOLDOWN: { - key: 'CODEXLENS_EMBEDDING_COOLDOWN', - 
labelKey: 'codexlens.envField.rateLimitCooldown', + CODEXLENS_EMBED_BATCH_SIZE: { + key: 'CODEXLENS_EMBED_BATCH_SIZE', + labelKey: 'codexlens.envField.batchSize', type: 'number', - placeholder: '60', - default: '60', - settingsPath: 'embedding.cooldown', - min: 0, - max: 300, - showWhen: (env) => - env['CODEXLENS_EMBEDDING_BACKEND'] === 'api' && - env['CODEXLENS_EMBEDDING_POOL_ENABLED'] === 'true', + placeholder: '64', + default: '64', + settingsPath: 'embedding.batch_size', + min: 1, + max: 512, }, }, }, @@ -112,29 +154,64 @@ export const envVarGroupsSchema: EnvVarGroupsSchema = { labelKey: 'codexlens.envGroup.reranker', icon: 'arrow-up-down', vars: { - CODEXLENS_RERANKER_ENABLED: { - key: 'CODEXLENS_RERANKER_ENABLED', - labelKey: 'codexlens.envField.enabled', - type: 'select', - options: ['true', 'false'], - default: 'true', - settingsPath: 'reranker.enabled', - }, CODEXLENS_RERANKER_BACKEND: { key: 'CODEXLENS_RERANKER_BACKEND', labelKey: 'codexlens.envField.backend', type: 'select', - options: ['onnx', 'api', 'litellm', 'legacy'], - default: 'onnx', + options: ['local', 'api'], + default: 'local', settingsPath: 'reranker.backend', }, - CODEXLENS_RERANKER_MODEL: { - key: 'CODEXLENS_RERANKER_MODEL', + CODEXLENS_RERANKER_API_URL: { + key: 'CODEXLENS_RERANKER_API_URL', + labelKey: 'codexlens.envField.apiUrl', + type: 'text', + placeholder: 'https://api.siliconflow.cn/v1', + default: '', + settingsPath: 'reranker.api_url', + showWhen: (env) => env['CODEXLENS_RERANKER_BACKEND'] === 'api', + }, + CODEXLENS_RERANKER_API_KEY: { + key: 'CODEXLENS_RERANKER_API_KEY', + labelKey: 'codexlens.envField.apiKey', + type: 'password', + placeholder: 'sk-...', + default: '', + settingsPath: 'reranker.api_key', + showWhen: (env) => env['CODEXLENS_RERANKER_BACKEND'] === 'api', + }, + CODEXLENS_RERANKER_API_MODEL: { + key: 'CODEXLENS_RERANKER_API_MODEL', labelKey: 'codexlens.envField.model', type: 'model-select', placeholder: 'Select or enter model...', + default: '', + settingsPath: 
'reranker.api_model', + showWhen: (env) => env['CODEXLENS_RERANKER_BACKEND'] === 'api', + localModels: [], + apiModels: [ + { + group: 'SiliconFlow', + items: ['BAAI/bge-reranker-v2-m3', 'BAAI/bge-reranker-large', 'BAAI/bge-reranker-base'], + }, + { + group: 'Cohere', + items: ['rerank-english-v3.0', 'rerank-multilingual-v3.0'], + }, + { + group: 'Jina', + items: ['jina-reranker-v2-base-multilingual'], + }, + ], + }, + CODEXLENS_RERANKER_MODEL: { + key: 'CODEXLENS_RERANKER_MODEL', + labelKey: 'codexlens.envField.localModel', + type: 'model-select', + placeholder: 'Select local model...', default: 'Xenova/ms-marco-MiniLM-L-6-v2', settingsPath: 'reranker.model', + showWhen: (env) => env['CODEXLENS_RERANKER_BACKEND'] !== 'api', localModels: [ { group: 'FastEmbed/ONNX', @@ -145,283 +222,128 @@ export const envVarGroupsSchema: EnvVarGroupsSchema = { ], }, ], - apiModels: [ - { - group: 'Cohere', - items: ['rerank-english-v3.0', 'rerank-multilingual-v3.0', 'rerank-english-v2.0'], - }, - { - group: 'Voyage', - items: ['rerank-2', 'rerank-2-lite', 'rerank-1'], - }, - { - group: 'SiliconFlow', - items: ['BAAI/bge-reranker-v2-m3', 'BAAI/bge-reranker-large', 'BAAI/bge-reranker-base'], - }, - { - group: 'Jina', - items: ['jina-reranker-v2-base-multilingual', 'jina-reranker-v1-base-en'], - }, - ], + apiModels: [], }, CODEXLENS_RERANKER_TOP_K: { key: 'CODEXLENS_RERANKER_TOP_K', labelKey: 'codexlens.envField.topKResults', type: 'number', - placeholder: '50', - default: '50', + placeholder: '20', + default: '20', settingsPath: 'reranker.top_k', min: 5, max: 200, }, - CODEXLENS_RERANKER_POOL_ENABLED: { - key: 'CODEXLENS_RERANKER_POOL_ENABLED', - labelKey: 'codexlens.envField.highAvailability', - type: 'select', - options: ['true', 'false'], - default: 'false', - settingsPath: 'reranker.pool_enabled', - showWhen: (env) => env['CODEXLENS_RERANKER_BACKEND'] === 'api' || env['CODEXLENS_RERANKER_BACKEND'] === 'litellm', + CODEXLENS_RERANKER_BATCH_SIZE: { + key: 
'CODEXLENS_RERANKER_BATCH_SIZE', + labelKey: 'codexlens.envField.batchSize', + type: 'number', + placeholder: '32', + default: '32', + settingsPath: 'reranker.batch_size', + min: 1, + max: 128, }, - CODEXLENS_RERANKER_STRATEGY: { - key: 'CODEXLENS_RERANKER_STRATEGY', - labelKey: 'codexlens.envField.loadBalanceStrategy', - type: 'select', - options: ['round_robin', 'latency_aware', 'weighted_random'], - default: 'latency_aware', - settingsPath: 'reranker.strategy', - showWhen: (env) => - (env['CODEXLENS_RERANKER_BACKEND'] === 'api' || env['CODEXLENS_RERANKER_BACKEND'] === 'litellm') && - env['CODEXLENS_RERANKER_POOL_ENABLED'] === 'true', + }, + }, + search: { + id: 'search', + labelKey: 'codexlens.envGroup.search', + icon: 'git-branch', + vars: { + CODEXLENS_BINARY_TOP_K: { + key: 'CODEXLENS_BINARY_TOP_K', + labelKey: 'codexlens.envField.binaryTopK', + type: 'number', + placeholder: '200', + default: '200', + settingsPath: 'search.binary_top_k', + min: 10, + max: 1000, }, - CODEXLENS_RERANKER_COOLDOWN: { - key: 'CODEXLENS_RERANKER_COOLDOWN', - labelKey: 'codexlens.envField.rateLimitCooldown', + CODEXLENS_ANN_TOP_K: { + key: 'CODEXLENS_ANN_TOP_K', + labelKey: 'codexlens.envField.annTopK', + type: 'number', + placeholder: '50', + default: '50', + settingsPath: 'search.ann_top_k', + min: 5, + max: 500, + }, + CODEXLENS_FTS_TOP_K: { + key: 'CODEXLENS_FTS_TOP_K', + labelKey: 'codexlens.envField.ftsTopK', + type: 'number', + placeholder: '50', + default: '50', + settingsPath: 'search.fts_top_k', + min: 5, + max: 500, + }, + CODEXLENS_FUSION_K: { + key: 'CODEXLENS_FUSION_K', + labelKey: 'codexlens.envField.fusionK', type: 'number', placeholder: '60', default: '60', - settingsPath: 'reranker.cooldown', - min: 0, - max: 300, - showWhen: (env) => - (env['CODEXLENS_RERANKER_BACKEND'] === 'api' || env['CODEXLENS_RERANKER_BACKEND'] === 'litellm') && - env['CODEXLENS_RERANKER_POOL_ENABLED'] === 'true', - }, - }, - }, - concurrency: { - id: 'concurrency', - labelKey: 
'codexlens.envGroup.concurrency', - icon: 'cpu', - vars: { - CODEXLENS_API_MAX_WORKERS: { - key: 'CODEXLENS_API_MAX_WORKERS', - labelKey: 'codexlens.envField.maxWorkers', - type: 'number', - placeholder: '4', - default: '4', - settingsPath: 'api.max_workers', + settingsPath: 'search.fusion_k', min: 1, - max: 32, - }, - CODEXLENS_API_BATCH_SIZE: { - key: 'CODEXLENS_API_BATCH_SIZE', - labelKey: 'codexlens.envField.batchSize', - type: 'number', - placeholder: '8', - default: '8', - settingsPath: 'api.batch_size', - min: 1, - max: 64, - showWhen: (env) => env['CODEXLENS_API_BATCH_SIZE_DYNAMIC'] !== 'true', - }, - CODEXLENS_API_BATCH_SIZE_DYNAMIC: { - key: 'CODEXLENS_API_BATCH_SIZE_DYNAMIC', - labelKey: 'codexlens.envField.dynamicBatchSize', - type: 'checkbox', - default: 'false', - settingsPath: 'api.batch_size_dynamic', - }, - CODEXLENS_API_BATCH_SIZE_UTILIZATION: { - key: 'CODEXLENS_API_BATCH_SIZE_UTILIZATION', - labelKey: 'codexlens.envField.batchSizeUtilization', - type: 'number', - placeholder: '0.8', - default: '0.8', - settingsPath: 'api.batch_size_utilization_factor', - min: 0.1, - max: 0.95, - step: 0.05, - showWhen: (env) => env['CODEXLENS_API_BATCH_SIZE_DYNAMIC'] === 'true', - }, - CODEXLENS_API_BATCH_SIZE_MAX: { - key: 'CODEXLENS_API_BATCH_SIZE_MAX', - labelKey: 'codexlens.envField.batchSizeMax', - type: 'number', - placeholder: '2048', - default: '2048', - settingsPath: 'api.batch_size_max', - min: 1, - max: 4096, - showWhen: (env) => env['CODEXLENS_API_BATCH_SIZE_DYNAMIC'] === 'true', - }, - CODEXLENS_CHARS_PER_TOKEN: { - key: 'CODEXLENS_CHARS_PER_TOKEN', - labelKey: 'codexlens.envField.charsPerToken', - type: 'number', - placeholder: '4', - default: '4', - settingsPath: 'api.chars_per_token_estimate', - min: 1, - max: 10, - showWhen: (env) => env['CODEXLENS_API_BATCH_SIZE_DYNAMIC'] === 'true', - }, - }, - }, - cascade: { - id: 'cascade', - labelKey: 'codexlens.envGroup.cascade', - icon: 'git-branch', - vars: { - CODEXLENS_CASCADE_STRATEGY: { - key: 
'CODEXLENS_CASCADE_STRATEGY', - labelKey: 'codexlens.envField.searchStrategy', - type: 'select', - options: ['binary', 'hybrid', 'binary_rerank', 'dense_rerank', 'staged'], - default: 'dense_rerank', - settingsPath: 'cascade.strategy', - }, - CODEXLENS_CASCADE_COARSE_K: { - key: 'CODEXLENS_CASCADE_COARSE_K', - labelKey: 'codexlens.envField.coarseK', - type: 'number', - placeholder: '100', - default: '100', - settingsPath: 'cascade.coarse_k', - min: 10, - max: 500, - }, - CODEXLENS_CASCADE_FINE_K: { - key: 'CODEXLENS_CASCADE_FINE_K', - labelKey: 'codexlens.envField.fineK', - type: 'number', - placeholder: '10', - default: '10', - settingsPath: 'cascade.fine_k', - min: 1, - max: 100, - }, - CODEXLENS_STAGED_STAGE2_MODE: { - key: 'CODEXLENS_STAGED_STAGE2_MODE', - labelKey: 'codexlens.envField.stagedStage2Mode', - type: 'select', - options: ['precomputed', 'realtime', 'static_global_graph'], - default: 'precomputed', - settingsPath: 'staged.stage2_mode', - showWhen: (env) => env['CODEXLENS_CASCADE_STRATEGY'] === 'staged', - }, - CODEXLENS_STAGED_CLUSTERING_STRATEGY: { - key: 'CODEXLENS_STAGED_CLUSTERING_STRATEGY', - labelKey: 'codexlens.envField.stagedClusteringStrategy', - type: 'select', - options: ['auto', 'hdbscan', 'dbscan', 'frequency', 'noop', 'score', 'dir_rr', 'path'], - default: 'auto', - settingsPath: 'staged.clustering_strategy', - showWhen: (env) => env['CODEXLENS_CASCADE_STRATEGY'] === 'staged', - }, - CODEXLENS_STAGED_CLUSTERING_MIN_SIZE: { - key: 'CODEXLENS_STAGED_CLUSTERING_MIN_SIZE', - labelKey: 'codexlens.envField.stagedClusteringMinSize', - type: 'number', - placeholder: '3', - default: '3', - settingsPath: 'staged.clustering_min_size', - min: 1, - max: 50, - showWhen: (env) => env['CODEXLENS_CASCADE_STRATEGY'] === 'staged', - }, - CODEXLENS_ENABLE_STAGED_RERANK: { - key: 'CODEXLENS_ENABLE_STAGED_RERANK', - labelKey: 'codexlens.envField.enableStagedRerank', - type: 'checkbox', - default: 'true', - settingsPath: 'staged.enable_rerank', - showWhen: 
(env) => env['CODEXLENS_CASCADE_STRATEGY'] === 'staged', + max: 200, }, }, }, indexing: { id: 'indexing', labelKey: 'codexlens.envGroup.indexing', - icon: 'git-branch', + icon: 'cpu', vars: { - CODEXLENS_USE_ASTGREP: { - key: 'CODEXLENS_USE_ASTGREP', - labelKey: 'codexlens.envField.useAstGrep', + CODEXLENS_CODE_AWARE_CHUNKING: { + key: 'CODEXLENS_CODE_AWARE_CHUNKING', + labelKey: 'codexlens.envField.codeAwareChunking', type: 'checkbox', - default: 'false', - settingsPath: 'parsing.use_astgrep', - }, - CODEXLENS_STATIC_GRAPH_ENABLED: { - key: 'CODEXLENS_STATIC_GRAPH_ENABLED', - labelKey: 'codexlens.envField.staticGraphEnabled', - type: 'checkbox', - default: 'false', - settingsPath: 'indexing.static_graph_enabled', - }, - CODEXLENS_STATIC_GRAPH_RELATIONSHIP_TYPES: { - key: 'CODEXLENS_STATIC_GRAPH_RELATIONSHIP_TYPES', - labelKey: 'codexlens.envField.staticGraphRelationshipTypes', - type: 'text', - placeholder: 'imports,inherits,calls', - default: 'imports,inherits', - settingsPath: 'indexing.static_graph_relationship_types', - showWhen: (env) => env['CODEXLENS_STATIC_GRAPH_ENABLED'] === 'true', - }, - }, - }, - chunking: { - id: 'chunking', - labelKey: 'codexlens.envGroup.chunking', - icon: 'scissors', - vars: { - CHUNK_STRIP_COMMENTS: { - key: 'CHUNK_STRIP_COMMENTS', - labelKey: 'codexlens.envField.stripComments', - type: 'select', - options: ['true', 'false'], default: 'true', - settingsPath: 'chunking.strip_comments', + settingsPath: 'indexing.code_aware_chunking', }, - CHUNK_STRIP_DOCSTRINGS: { - key: 'CHUNK_STRIP_DOCSTRINGS', - labelKey: 'codexlens.envField.stripDocstrings', - type: 'select', - options: ['true', 'false'], - default: 'true', - settingsPath: 'chunking.strip_docstrings', - }, - RERANKER_TEST_FILE_PENALTY: { - key: 'RERANKER_TEST_FILE_PENALTY', - labelKey: 'codexlens.envField.testFilePenalty', + CODEXLENS_INDEX_WORKERS: { + key: 'CODEXLENS_INDEX_WORKERS', + labelKey: 'codexlens.envField.indexWorkers', type: 'number', - placeholder: '0.0', - default: 
'0.0', - settingsPath: 'reranker.test_file_penalty', - min: 0, - max: 1, - step: 0.1, + placeholder: '2', + default: '2', + settingsPath: 'indexing.workers', + min: 1, + max: 16, }, - RERANKER_DOCSTRING_WEIGHT: { - key: 'RERANKER_DOCSTRING_WEIGHT', - labelKey: 'codexlens.envField.docstringWeight', + CODEXLENS_MAX_FILE_SIZE: { + key: 'CODEXLENS_MAX_FILE_SIZE', + labelKey: 'codexlens.envField.maxFileSize', type: 'number', - placeholder: '1.0', - default: '1.0', - settingsPath: 'reranker.docstring_weight', - min: 0, - max: 1, - step: 0.1, + placeholder: '1000000', + default: '1000000', + settingsPath: 'indexing.max_file_size_bytes', + min: 10000, + max: 10000000, + }, + CODEXLENS_HNSW_EF: { + key: 'CODEXLENS_HNSW_EF', + labelKey: 'codexlens.envField.hnswEf', + type: 'number', + placeholder: '150', + default: '150', + settingsPath: 'indexing.hnsw_ef', + min: 10, + max: 500, + }, + CODEXLENS_HNSW_M: { + key: 'CODEXLENS_HNSW_M', + labelKey: 'codexlens.envField.hnswM', + type: 'number', + placeholder: '32', + default: '32', + settingsPath: 'indexing.hnsw_M', + min: 4, + max: 128, }, }, }, diff --git a/ccw/frontend/src/locales/en/codexlens.json b/ccw/frontend/src/locales/en/codexlens.json index 3ecd74d3..4177bbb5 100644 --- a/ccw/frontend/src/locales/en/codexlens.json +++ b/ccw/frontend/src/locales/en/codexlens.json @@ -290,41 +290,31 @@ "envGroup": { "embedding": "Embedding", "reranker": "Reranker", - "concurrency": "Concurrency", - "cascade": "Cascade Search", - "indexing": "Indexing", - "chunking": "Chunking" + "search": "Search Pipeline", + "indexing": "Indexing" }, "envField": { "backend": "Backend", "model": "Model", - "autoEmbedMissing": "Auto Build Missing Vectors", - "useGpu": "Use GPU", - "highAvailability": "High Availability", - "loadBalanceStrategy": "Load Balance Strategy", - "rateLimitCooldown": "Rate Limit Cooldown", - "enabled": "Enabled", + "localModel": "Local Model", + "apiUrl": "API URL", + "apiKey": "API Key", + "multiEndpoints": "Multi-Endpoint", + 
"embedDim": "Embed Dimension", + "apiConcurrency": "Concurrency", + "maxTokensPerBatch": "Max Tokens/Batch", + "useGpu": "Device", "topKResults": "Top K Results", - "maxWorkers": "Max Workers", "batchSize": "Batch Size", - "dynamicBatchSize": "Dynamic Batch Size", - "batchSizeUtilization": "Utilization Factor", - "batchSizeMax": "Max Batch Size", - "charsPerToken": "Chars Per Token", - "searchStrategy": "Search Strategy", - "coarseK": "Coarse K", - "fineK": "Fine K", - "stagedStage2Mode": "Stage-2 Mode", - "stagedClusteringStrategy": "Clustering Strategy", - "stagedClusteringMinSize": "Cluster Min Size", - "enableStagedRerank": "Enable Rerank", - "useAstGrep": "Use ast-grep", - "staticGraphEnabled": "Static Graph", - "staticGraphRelationshipTypes": "Relationship Types", - "stripComments": "Strip Comments", - "stripDocstrings": "Strip Docstrings", - "testFilePenalty": "Test File Penalty", - "docstringWeight": "Docstring Weight" + "binaryTopK": "Binary Top K", + "annTopK": "ANN Top K", + "ftsTopK": "FTS Top K", + "fusionK": "Fusion K", + "codeAwareChunking": "Code-Aware Chunking", + "indexWorkers": "Index Workers", + "maxFileSize": "Max File Size (bytes)", + "hnswEf": "HNSW ef", + "hnswM": "HNSW M" }, "install": { "title": "Install CodexLens", diff --git a/ccw/frontend/src/locales/zh/codexlens.json b/ccw/frontend/src/locales/zh/codexlens.json index b9757d5c..f1a65541 100644 --- a/ccw/frontend/src/locales/zh/codexlens.json +++ b/ccw/frontend/src/locales/zh/codexlens.json @@ -290,41 +290,31 @@ "envGroup": { "embedding": "嵌入模型", "reranker": "重排序", - "concurrency": "并发", - "cascade": "级联搜索", - "indexing": "索引与解析", - "chunking": "分块" + "search": "搜索流水线", + "indexing": "索引" }, "envField": { "backend": "后端", "model": "模型", - "autoEmbedMissing": "缺失向量时自动构建", - "useGpu": "使用 GPU", - "highAvailability": "高可用", - "loadBalanceStrategy": "负载均衡策略", - "rateLimitCooldown": "限流冷却时间", - "enabled": "启用", + "localModel": "本地模型", + "apiUrl": "API 地址", + "apiKey": "API 密钥", + 
"multiEndpoints": "多端点", + "embedDim": "向量维度", + "apiConcurrency": "并发数", + "maxTokensPerBatch": "每批最大Token数", + "useGpu": "设备", "topKResults": "Top K 结果数", - "maxWorkers": "最大工作线程", "batchSize": "批次大小", - "dynamicBatchSize": "动态批次大小", - "batchSizeUtilization": "利用率因子", - "batchSizeMax": "最大批次大小", - "charsPerToken": "每 Token 字符数", - "searchStrategy": "搜索策略", - "coarseK": "粗筛 K 值", - "fineK": "精筛 K 值", - "stagedStage2Mode": "Stage-2 模式", - "stagedClusteringStrategy": "聚类策略", - "stagedClusteringMinSize": "最小聚类大小", - "enableStagedRerank": "启用重排序", - "useAstGrep": "使用 ast-grep", - "staticGraphEnabled": "启用静态图", - "staticGraphRelationshipTypes": "关系类型", - "stripComments": "去除注释", - "stripDocstrings": "去除文档字符串", - "testFilePenalty": "测试文件惩罚", - "docstringWeight": "文档字符串权重" + "binaryTopK": "二值粗筛 K", + "annTopK": "ANN 精筛 K", + "ftsTopK": "全文搜索 K", + "fusionK": "融合 K", + "codeAwareChunking": "代码感知分块", + "indexWorkers": "索引线程数", + "maxFileSize": "最大文件大小(字节)", + "hnswEf": "HNSW ef", + "hnswM": "HNSW M" }, "install": { "title": "安装 CodexLens", diff --git a/ccw/frontend/src/types/codexlens.ts b/ccw/frontend/src/types/codexlens.ts index f5029c0f..470e4894 100644 --- a/ccw/frontend/src/types/codexlens.ts +++ b/ccw/frontend/src/types/codexlens.ts @@ -20,7 +20,7 @@ export interface EnvVarFieldSchema { /** i18n label key */ labelKey: string; /** Field type determines which control to render */ - type: 'select' | 'model-select' | 'number' | 'checkbox' | 'text'; + type: 'select' | 'model-select' | 'number' | 'checkbox' | 'text' | 'password'; /** Options for select type */ options?: string[]; /** Default value */ diff --git a/ccw/src/core/routes/codexlens/config-handlers.ts b/ccw/src/core/routes/codexlens/config-handlers.ts index f0621f7c..38bb27dd 100644 --- a/ccw/src/core/routes/codexlens/config-handlers.ts +++ b/ccw/src/core/routes/codexlens/config-handlers.ts @@ -941,7 +941,7 @@ export async function handleCodexLensConfigRoutes(ctx: RouteContext): Promise = {}; // Embedding 
settings @@ -950,19 +950,34 @@ export async function handleCodexLensConfigRoutes(ctx: RouteContext): Promise { + const envVars: Record = {}; + try { + const envPath = join(getCodexLensDataDir(), '.env'); + const content = readFileSync(envPath, 'utf-8'); + for (const line of content.split('\n')) { + const trimmed = line.trim(); + if (!trimmed || trimmed.startsWith('#')) continue; + const eqIdx = trimmed.indexOf('='); + if (eqIdx <= 0) continue; + const key = trimmed.substring(0, eqIdx).trim(); + let value = trimmed.substring(eqIdx + 1).trim(); + // Strip surrounding quotes + if ((value.startsWith('"') && value.endsWith('"')) || + (value.startsWith("'") && value.endsWith("'"))) { + value = value.slice(1, -1); + } + envVars[key] = value; + } + } catch { + // File doesn't exist — no env overrides + } + return envVars; +} + /** * Execute a generic codexlens-search v2 bridge subcommand (init, status, sync, watch, etc.). * Returns parsed JSON output from the bridge CLI. @@ -2299,11 +2327,13 @@ async function executeV2BridgeCommand( // --db-path is a global arg and must come BEFORE the subcommand const globalArgs = options?.dbPath ? ['--db-path', options.dbPath] : []; const fullArgs = [...globalArgs, subcommand, ...args]; + // Merge process.env with .env file settings (file values override process.env) + const codexlensEnv = loadCodexLensEnvFile(); execFile('codexlens-search', fullArgs, { encoding: 'utf-8', timeout: options?.timeout ?? 
EXEC_TIMEOUTS.PROCESS_SPAWN, windowsHide: true, - env: { ...process.env, PYTHONIOENCODING: 'utf-8' }, + env: { ...process.env, ...codexlensEnv, PYTHONIOENCODING: 'utf-8' }, }, (error, stdout, stderr) => { if (error) { resolve({ diff --git a/codex-lens-v2/pyproject.toml b/codex-lens-v2/pyproject.toml index 648e56a6..d3224eea 100644 --- a/codex-lens-v2/pyproject.toml +++ b/codex-lens-v2/pyproject.toml @@ -44,6 +44,9 @@ faiss-cpu = [ faiss-gpu = [ "faiss-gpu>=1.7.4", ] +embed-api = [ + "httpx>=0.25", +] reranker-api = [ "httpx>=0.25", ] diff --git a/codex-lens-v2/src/codexlens_search/bridge.py b/codex-lens-v2/src/codexlens_search/bridge.py index 0fe94834..ccd98058 100644 --- a/codex-lens-v2/src/codexlens_search/bridge.py +++ b/codex-lens-v2/src/codexlens_search/bridge.py @@ -57,6 +57,73 @@ def _create_config(args: argparse.Namespace) -> "Config": kwargs: dict = {} if hasattr(args, "embed_model") and args.embed_model: kwargs["embed_model"] = args.embed_model + # API embedding overrides + if hasattr(args, "embed_api_url") and args.embed_api_url: + kwargs["embed_api_url"] = args.embed_api_url + if hasattr(args, "embed_api_key") and args.embed_api_key: + kwargs["embed_api_key"] = args.embed_api_key + if hasattr(args, "embed_api_model") and args.embed_api_model: + kwargs["embed_api_model"] = args.embed_api_model + # Also check env vars as fallback + if "embed_api_url" not in kwargs and os.environ.get("CODEXLENS_EMBED_API_URL"): + kwargs["embed_api_url"] = os.environ["CODEXLENS_EMBED_API_URL"] + if "embed_api_key" not in kwargs and os.environ.get("CODEXLENS_EMBED_API_KEY"): + kwargs["embed_api_key"] = os.environ["CODEXLENS_EMBED_API_KEY"] + if "embed_api_model" not in kwargs and os.environ.get("CODEXLENS_EMBED_API_MODEL"): + kwargs["embed_api_model"] = os.environ["CODEXLENS_EMBED_API_MODEL"] + # Multi-endpoint: CODEXLENS_EMBED_API_ENDPOINTS=url1|key1|model1,url2|key2|model2 + endpoints_env = os.environ.get("CODEXLENS_EMBED_API_ENDPOINTS", "") + if endpoints_env: + endpoints 
= [] + for entry in endpoints_env.split(","): + parts = entry.strip().split("|") + if len(parts) >= 2: + ep = {"url": parts[0], "key": parts[1]} + if len(parts) >= 3: + ep["model"] = parts[2] + endpoints.append(ep) + if endpoints: + kwargs["embed_api_endpoints"] = endpoints + # Embed dimension and concurrency from env + if os.environ.get("CODEXLENS_EMBED_DIM"): + kwargs["embed_dim"] = int(os.environ["CODEXLENS_EMBED_DIM"]) + if os.environ.get("CODEXLENS_EMBED_BATCH_SIZE"): + kwargs["embed_batch_size"] = int(os.environ["CODEXLENS_EMBED_BATCH_SIZE"]) + if os.environ.get("CODEXLENS_EMBED_API_CONCURRENCY"): + kwargs["embed_api_concurrency"] = int(os.environ["CODEXLENS_EMBED_API_CONCURRENCY"]) + if os.environ.get("CODEXLENS_EMBED_API_MAX_TOKENS"): + kwargs["embed_api_max_tokens_per_batch"] = int(os.environ["CODEXLENS_EMBED_API_MAX_TOKENS"]) + # Reranker API env vars + if os.environ.get("CODEXLENS_RERANKER_API_URL"): + kwargs["reranker_api_url"] = os.environ["CODEXLENS_RERANKER_API_URL"] + if os.environ.get("CODEXLENS_RERANKER_API_KEY"): + kwargs["reranker_api_key"] = os.environ["CODEXLENS_RERANKER_API_KEY"] + if os.environ.get("CODEXLENS_RERANKER_API_MODEL"): + kwargs["reranker_api_model"] = os.environ["CODEXLENS_RERANKER_API_MODEL"] + # Search pipeline params from env + if os.environ.get("CODEXLENS_RERANKER_TOP_K"): + kwargs["reranker_top_k"] = int(os.environ["CODEXLENS_RERANKER_TOP_K"]) + if os.environ.get("CODEXLENS_RERANKER_BATCH_SIZE"): + kwargs["reranker_batch_size"] = int(os.environ["CODEXLENS_RERANKER_BATCH_SIZE"]) + if os.environ.get("CODEXLENS_BINARY_TOP_K"): + kwargs["binary_top_k"] = int(os.environ["CODEXLENS_BINARY_TOP_K"]) + if os.environ.get("CODEXLENS_ANN_TOP_K"): + kwargs["ann_top_k"] = int(os.environ["CODEXLENS_ANN_TOP_K"]) + if os.environ.get("CODEXLENS_FTS_TOP_K"): + kwargs["fts_top_k"] = int(os.environ["CODEXLENS_FTS_TOP_K"]) + if os.environ.get("CODEXLENS_FUSION_K"): + kwargs["fusion_k"] = int(os.environ["CODEXLENS_FUSION_K"]) + # Indexing params 
from env + if os.environ.get("CODEXLENS_CODE_AWARE_CHUNKING"): + kwargs["code_aware_chunking"] = os.environ["CODEXLENS_CODE_AWARE_CHUNKING"].lower() == "true" + if os.environ.get("CODEXLENS_INDEX_WORKERS"): + kwargs["index_workers"] = int(os.environ["CODEXLENS_INDEX_WORKERS"]) + if os.environ.get("CODEXLENS_MAX_FILE_SIZE"): + kwargs["max_file_size_bytes"] = int(os.environ["CODEXLENS_MAX_FILE_SIZE"]) + if os.environ.get("CODEXLENS_HNSW_EF"): + kwargs["hnsw_ef"] = int(os.environ["CODEXLENS_HNSW_EF"]) + if os.environ.get("CODEXLENS_HNSW_M"): + kwargs["hnsw_M"] = int(os.environ["CODEXLENS_HNSW_M"]) db_path = Path(args.db_path).resolve() kwargs["metadata_db_path"] = str(db_path / "metadata.db") return Config(**kwargs) @@ -72,22 +139,43 @@ def _create_pipeline( """ from codexlens_search.config import Config from codexlens_search.core.factory import create_ann_index, create_binary_index - from codexlens_search.embed.local import FastEmbedEmbedder from codexlens_search.indexing.metadata import MetadataStore from codexlens_search.indexing.pipeline import IndexingPipeline - from codexlens_search.rerank.local import FastEmbedReranker from codexlens_search.search.fts import FTSEngine from codexlens_search.search.pipeline import SearchPipeline config = _create_config(args) db_path = _resolve_db_path(args) - embedder = FastEmbedEmbedder(config) + # Select embedder: API if configured, otherwise local fastembed + if config.embed_api_url: + from codexlens_search.embed.api import APIEmbedder + embedder = APIEmbedder(config) + log.info("Using API embedder: %s", config.embed_api_url) + # Auto-detect embed_dim from API if still at default + if config.embed_dim == 384: + probe_vec = embedder.embed_single("dimension probe") + detected_dim = probe_vec.shape[0] + if detected_dim != config.embed_dim: + log.info("Auto-detected embed_dim=%d from API (was %d)", detected_dim, config.embed_dim) + config.embed_dim = detected_dim + else: + from codexlens_search.embed.local import FastEmbedEmbedder 
+ embedder = FastEmbedEmbedder(config) + binary_store = create_binary_index(db_path, config.embed_dim, config) ann_index = create_ann_index(db_path, config.embed_dim, config) fts = FTSEngine(db_path / "fts.db") metadata = MetadataStore(db_path / "metadata.db") - reranker = FastEmbedReranker(config) + + # Select reranker: API if configured, otherwise local fastembed + if config.reranker_api_url: + from codexlens_search.rerank.api import APIReranker + reranker = APIReranker(config) + log.info("Using API reranker: %s", config.reranker_api_url) + else: + from codexlens_search.rerank.local import FastEmbedReranker + reranker = FastEmbedReranker(config) indexing = IndexingPipeline( embedder=embedder, @@ -181,6 +269,19 @@ def cmd_remove_file(args: argparse.Namespace) -> None: }) +_DEFAULT_EXCLUDES = frozenset({ + "node_modules", ".git", "__pycache__", "dist", "build", + ".venv", "venv", ".tox", ".mypy_cache", ".pytest_cache", + ".next", ".nuxt", "coverage", ".eggs", "*.egg-info", +}) + + +def _should_exclude(path: Path, exclude_dirs: frozenset[str]) -> bool: + """Check if any path component matches an exclude pattern.""" + parts = path.parts + return any(part in exclude_dirs for part in parts) + + def cmd_sync(args: argparse.Namespace) -> None: """Sync index with files under --root matching --glob pattern.""" indexing, _, _ = _create_pipeline(args) @@ -189,12 +290,15 @@ def cmd_sync(args: argparse.Namespace) -> None: if not root.is_dir(): _error_exit(f"Root directory not found: {root}") + exclude_dirs = frozenset(args.exclude) if args.exclude else _DEFAULT_EXCLUDES pattern = args.glob or "**/*" file_paths = [ p for p in root.glob(pattern) - if p.is_file() + if p.is_file() and not _should_exclude(p.relative_to(root), exclude_dirs) ] + log.debug("Sync: %d files after exclusion (root=%s, pattern=%s)", len(file_paths), root, pattern) + stats = indexing.sync(file_paths, root=root) _json_output({ "status": "synced", @@ -331,6 +435,23 @@ def _build_parser() -> 
argparse.ArgumentParser: help="Enable debug logging to stderr", ) + # API embedding overrides (also read from CODEXLENS_EMBED_API_* env vars) + parser.add_argument( + "--embed-api-url", + default="", + help="Remote embedding API URL (OpenAI-compatible, e.g. https://api.openai.com/v1)", + ) + parser.add_argument( + "--embed-api-key", + default="", + help="API key for remote embedding", + ) + parser.add_argument( + "--embed-api-model", + default="", + help="Model name for remote embedding (e.g. text-embedding-3-small)", + ) + sub = parser.add_subparsers(dest="command") # init @@ -354,6 +475,11 @@ def _build_parser() -> argparse.ArgumentParser: p_sync = sub.add_parser("sync", help="Sync index with directory") p_sync.add_argument("--root", "-r", required=True, help="Root directory to sync") p_sync.add_argument("--glob", "-g", default="**/*", help="Glob pattern (default: **/*)") + p_sync.add_argument( + "--exclude", "-e", action="append", default=None, + help="Directory names to exclude (repeatable). " + "Defaults: node_modules, .git, __pycache__, dist, build, .venv, venv, .tox, .mypy_cache", + ) # watch p_watch = sub.add_parser("watch", help="Watch directory for changes (JSONL output)") diff --git a/codex-lens-v2/src/codexlens_search/config.py b/codex-lens-v2/src/codexlens_search/config.py index ea7cd015..0968411a 100644 --- a/codex-lens-v2/src/codexlens_search/config.py +++ b/codex-lens-v2/src/codexlens_search/config.py @@ -12,6 +12,15 @@ class Config: embed_dim: int = 384 embed_batch_size: int = 64 + # API embedding (optional — overrides local fastembed when set) + embed_api_url: str = "" # e.g. "https://api.openai.com/v1" + embed_api_key: str = "" + embed_api_model: str = "" # e.g. 
"text-embedding-3-small" + # Multi-endpoint: list of {"url": "...", "key": "...", "model": "..."} dicts + embed_api_endpoints: list[dict[str, str]] = None # type: ignore[assignment] + embed_api_concurrency: int = 4 + embed_api_max_tokens_per_batch: int = 8192 + # Model download / cache model_cache_dir: str = "" # empty = fastembed default cache hf_mirror: str = "" # HuggingFace mirror URL, e.g. "https://hf-mirror.com" @@ -20,6 +29,21 @@ class Config: device: str = "auto" # 'auto', 'cuda', 'cpu' embed_providers: list[str] | None = None # explicit ONNX providers override + # File filtering + max_file_size_bytes: int = 1_000_000 # 1MB + exclude_extensions: frozenset[str] = None # type: ignore[assignment] # set in __post_init__ + binary_detect_sample_bytes: int = 2048 + binary_null_threshold: float = 0.10 # >10% null bytes = binary + generated_code_markers: tuple[str, ...] = ("@generated", "DO NOT EDIT", "auto-generated", "AUTO GENERATED") + + # Code-aware chunking + code_aware_chunking: bool = True + code_extensions: frozenset[str] = frozenset({ + ".py", ".js", ".ts", ".jsx", ".tsx", ".go", ".java", ".cpp", ".c", + ".h", ".hpp", ".cs", ".rs", ".rb", ".php", ".scala", ".kt", ".swift", + ".lua", ".sh", ".bash", ".zsh", ".ps1", ".vue", ".svelte", + }) + # Backend selection: 'auto', 'faiss', 'hnswlib' ann_backend: str = "auto" binary_backend: str = "auto" @@ -64,6 +88,29 @@ class Config: "graph": 0.15, }) + _DEFAULT_EXCLUDE_EXTENSIONS: frozenset[str] = frozenset({ + # binaries / images + ".png", ".jpg", ".jpeg", ".gif", ".webp", ".ico", ".bmp", ".svg", + ".zip", ".gz", ".tar", ".rar", ".7z", ".bz2", + ".bin", ".exe", ".dll", ".so", ".dylib", ".a", ".o", ".obj", + ".pdf", ".doc", ".docx", ".xls", ".xlsx", ".ppt", ".pptx", + # build / generated + ".min.js", ".min.css", ".map", ".lock", + ".pyc", ".pyo", ".class", ".wasm", + # data + ".sqlite", ".db", ".npy", ".npz", ".pkl", ".pickle", + ".parquet", ".arrow", ".feather", + # media + ".mp3", ".mp4", ".wav", ".avi", ".mov", 
".flv", + ".ttf", ".otf", ".woff", ".woff2", ".eot", + }) + + def __post_init__(self) -> None: + if self.exclude_extensions is None: + object.__setattr__(self, "exclude_extensions", self._DEFAULT_EXCLUDE_EXTENSIONS) + if self.embed_api_endpoints is None: + object.__setattr__(self, "embed_api_endpoints", []) + def resolve_embed_providers(self) -> list[str]: """Return ONNX execution providers based on device config. diff --git a/codex-lens-v2/src/codexlens_search/embed/__init__.py b/codex-lens-v2/src/codexlens_search/embed/__init__.py index 43df6b7f..f6c9608a 100644 --- a/codex-lens-v2/src/codexlens_search/embed/__init__.py +++ b/codex-lens-v2/src/codexlens_search/embed/__init__.py @@ -1,4 +1,5 @@ from .base import BaseEmbedder from .local import FastEmbedEmbedder, EMBED_PROFILES +from .api import APIEmbedder -__all__ = ["BaseEmbedder", "FastEmbedEmbedder", "EMBED_PROFILES"] +__all__ = ["BaseEmbedder", "FastEmbedEmbedder", "APIEmbedder", "EMBED_PROFILES"] diff --git a/codex-lens-v2/src/codexlens_search/embed/api.py b/codex-lens-v2/src/codexlens_search/embed/api.py new file mode 100644 index 00000000..55d4201a --- /dev/null +++ b/codex-lens-v2/src/codexlens_search/embed/api.py @@ -0,0 +1,232 @@ +from __future__ import annotations + +import itertools +import logging +import threading +import time +from concurrent.futures import ThreadPoolExecutor, as_completed + +import httpx +import numpy as np + +from ..config import Config +from .base import BaseEmbedder + +logger = logging.getLogger(__name__) + + +class _Endpoint: + """A single API endpoint with its own client and rate-limit tracking.""" + + __slots__ = ("url", "key", "model", "client", "failures", "lock") + + def __init__(self, url: str, key: str, model: str) -> None: + self.url = url.rstrip("/") + if not self.url.endswith("/embeddings"): + self.url += "/embeddings" + self.key = key + self.model = model + self.client = httpx.Client( + headers={ + "Authorization": f"Bearer {key}", + "Content-Type": 
"application/json", + }, + timeout=60.0, + ) + self.failures = 0 + self.lock = threading.Lock() + + +class APIEmbedder(BaseEmbedder): + """Embedder backed by remote HTTP API (OpenAI /v1/embeddings format). + + Features: + - Token packing: packs small chunks into batches up to max_tokens_per_batch + - Multi-endpoint: round-robins across multiple (url, key) pairs + - Concurrent dispatch: parallel API calls via ThreadPoolExecutor + - Per-endpoint failure tracking and retry with backoff + """ + + def __init__(self, config: Config) -> None: + self._config = config + self._endpoints = self._build_endpoints(config) + self._cycler = itertools.cycle(range(len(self._endpoints))) + self._cycler_lock = threading.Lock() + self._executor = ThreadPoolExecutor( + max_workers=min(config.embed_api_concurrency, len(self._endpoints) * 2), + ) + + @staticmethod + def _build_endpoints(config: Config) -> list[_Endpoint]: + """Build endpoint list from config. Supports both single and multi configs.""" + endpoints: list[_Endpoint] = [] + + # Multi-endpoint config takes priority + if config.embed_api_endpoints: + for ep in config.embed_api_endpoints: + endpoints.append(_Endpoint( + url=ep.get("url", config.embed_api_url), + key=ep.get("key", config.embed_api_key), + model=ep.get("model", config.embed_api_model), + )) + + # Fallback: single endpoint from top-level config + if not endpoints and config.embed_api_url: + endpoints.append(_Endpoint( + url=config.embed_api_url, + key=config.embed_api_key, + model=config.embed_api_model, + )) + + if not endpoints: + raise ValueError("No API embedding endpoints configured") + + return endpoints + + def _next_endpoint(self) -> _Endpoint: + with self._cycler_lock: + idx = next(self._cycler) + return self._endpoints[idx] + + # -- Token packing ------------------------------------------------ + + @staticmethod + def _estimate_tokens(text: str) -> int: + """Rough token estimate: ~4 chars per token for code.""" + return max(1, len(text) // 4) + + def 
def _call_api(
    self,
    texts: list[str],
    endpoint: _Endpoint,
    max_retries: int = 3,
) -> list[np.ndarray]:
    """Embed ``texts`` through a single endpoint, retrying transient failures.

    The request body is ``{"input": texts}`` plus ``"model"`` when the
    endpoint has one. Exceptions raised by ``endpoint.client.post`` and
    HTTP 429/503 responses are retried, sleeping ``0.5 * 2**attempt``
    seconds between attempts. Every other status is handed to
    ``response.raise_for_status()`` and is not retried here.

    Args:
        texts: Strings to embed in one request.
        endpoint: Target endpoint; its ``client``, ``url``, ``model``,
            ``lock`` and ``failures`` attributes are used.
        max_retries: Maximum number of attempts before giving up.

    Returns:
        One float32 vector per input text, ordered by the ``index`` field
        of the response items.

    Raises:
        RuntimeError: If every attempt failed, or if the response does not
            carry exactly one embedding per input text.
    """
    payload: dict = {"input": texts}
    if endpoint.model:
        payload["model"] = endpoint.model

    last_exc: Exception | None = None
    last_error: str | None = None
    for attempt in range(max_retries):
        try:
            response = endpoint.client.post(endpoint.url, json=payload)
        except Exception as exc:
            # Transport-level failure (connection, timeout, ...): retry.
            last_exc = exc
            last_error = str(exc)
            logger.warning(
                "API embed %s failed (attempt %d/%d): %s",
                endpoint.url, attempt + 1, max_retries, exc,
            )
        else:
            if response.status_code not in (429, 503):
                response.raise_for_status()
                items = sorted(
                    response.json().get("data", []),
                    key=lambda item: item["index"],
                )
                # Callers zip the vectors against their inputs, so a short or
                # long answer must fail loudly here instead of surfacing later
                # as an IndexError/KeyError far from the cause.
                if len(items) != len(texts):
                    raise RuntimeError(
                        f"API embed at {endpoint.url} returned {len(items)} "
                        f"embeddings for {len(texts)} inputs"
                    )

                # Reset failure counter on success
                with endpoint.lock:
                    endpoint.failures = 0

                return [
                    np.array(item["embedding"], dtype=np.float32)
                    for item in items
                ]

            # Rate limited / temporarily unavailable: remember why, so the
            # final error does not report "Last error: None".
            last_exc = None
            last_error = f"HTTP {response.status_code}"
            logger.warning(
                "API embed %s returned HTTP %s (attempt %d/%d), retrying...",
                endpoint.url, response.status_code, attempt + 1, max_retries,
            )

        # Back off only when another attempt follows; sleeping before the
        # final raise would just delay the error.
        if attempt + 1 < max_retries:
            time.sleep((2 ** attempt) * 0.5)

    # Track failures
    with endpoint.lock:
        endpoint.failures += 1

    raise RuntimeError(
        f"API embed failed at {endpoint.url} after {max_retries} attempts. "
        f"Last error: {last_error}"
    ) from last_exc
def is_file_excluded(file_path: Path, config: Config) -> str | None:
    """Decide whether ``file_path`` should be skipped by the indexer.

    Checks run in this order and the first hit wins:

    1. The lower-cased file name ends with an entry of
       ``config.exclude_extensions``. This is a suffix match, so compound
       extensions such as ``.min.js`` work; entries are compared
       case-insensitively and empty entries are ignored.
    2. The file cannot be stat'ed, is larger than
       ``config.max_file_size_bytes``, or is empty.
    3. The share of NUL bytes in the first
       ``config.binary_detect_sample_bytes`` bytes exceeds
       ``config.binary_null_threshold``.
    4. One of ``config.generated_code_markers`` occurs in the first 1024
       characters of the UTF-8 decoded text.

    Only the head of the file is read, in a single open.

    Args:
        file_path: File to inspect.
        config: Object providing the ``exclude_extensions``,
            ``max_file_size_bytes``, ``binary_detect_sample_bytes``,
            ``binary_null_threshold`` and ``generated_code_markers`` settings.

    Returns:
        A human-readable exclusion reason, or ``None`` if the file should be
        indexed.
    """
    head_chars = 1024
    # Every decoded character (including a U+FFFD replacement) consumes at
    # most 4 bytes, so this many bytes always cover the first ``head_chars``.
    head_bytes = head_chars * 4

    # Extension check (suffix match handles compound extensions like .min.js)
    name_lower = file_path.name.lower()
    for ext in config.exclude_extensions:
        if ext and name_lower.endswith(ext.lower()):
            return f"excluded extension: {ext}"

    # File size check
    try:
        size = file_path.stat().st_size
    except OSError:
        return "cannot stat file"
    if size > config.max_file_size_bytes:
        return f"exceeds max size ({size} > {config.max_file_size_bytes})"
    if size == 0:
        return "empty file"

    # One read serves both the binary sample and the marker scan.
    try:
        with open(file_path, "rb") as f:
            sample = f.read(config.binary_detect_sample_bytes)
            extra = f.read(max(head_bytes - len(sample), 0))
    except OSError:
        return "cannot read file"

    # Binary detection: ratio of NUL bytes in the sample
    if sample:
        null_ratio = sample.count(b"\x00") / len(sample)
        if null_ratio > config.binary_null_threshold:
            return f"binary file (null ratio: {null_ratio:.2%})"

    # Generated code markers: decode the head and normalise line endings the
    # way text-mode reading would, then keep the first 1024 characters.
    head = (sample + extra)[:head_bytes].decode("utf-8", errors="replace")
    head = head.replace("\r\n", "\n").replace("\r", "\n")[:head_chars]
    for marker in config.generated_code_markers:
        if marker in head:
            return f"generated code marker: {marker}"

    return None
def _chunk_code(
    self,
    text: str,
    path: str,
    max_chars: int,
    overlap: int,
) -> list[tuple[str, str, int, int]]:
    """Chunk source code along definition boundaries.

    Lines after the first whose indentation is at most 4 characters and whose
    stripped content matches ``self._CODE_BOUNDARY_RE`` start a new segment
    (line 0 always starts one). Adjacent segments are merged greedily while
    their combined length stays within ``max_chars``. A merged segment that
    is still longer than ``max_chars`` is delegated to ``self._chunk_text``
    and the line numbers it reports are shifted by the segment's start line.
    When no boundary besides line 0 exists, the whole text is delegated to
    ``self._chunk_text``.

    Returns:
        ``(chunk_text, path, start_line, end_line)`` tuples; chunks built
        directly here use 1-based inclusive line numbers.
    """
    source_lines = text.splitlines(keepends=True)
    if not source_lines:
        return []

    # Collect 0-based indices of lines that open a new segment.
    is_boundary = self._CODE_BOUNDARY_RE.match
    starts: list[int] = [0]
    for lineno, raw in enumerate(source_lines[1:], start=1):
        body = raw.lstrip()
        if len(raw) - len(body) <= 4 and is_boundary(body):
            starts.append(lineno)

    if len(starts) < 2:
        # Nothing to split on: plain text chunking handles the file.
        return self._chunk_text(text, path, max_chars, overlap)

    # Each segment runs from its start up to the next start (or end of file).
    stops = starts[1:] + [len(source_lines)]
    line_sizes = [len(line) for line in source_lines]

    # Greedy merge of neighbouring segments under the character budget.
    groups: list[tuple[int, int]] = []
    group_start, group_stop = starts[0], stops[0]
    group_size = sum(line_sizes[group_start:group_stop])
    for start, stop in zip(starts[1:], stops[1:]):
        size = sum(line_sizes[start:stop])
        if group_size + size > max_chars:
            groups.append((group_start, group_stop))
            group_start, group_size = start, size
        else:
            group_size += size
        group_stop = stop
    groups.append((group_start, group_stop))

    result: list[tuple[str, str, int, int]] = []
    for first, last in groups:
        piece = "".join(source_lines[first:last])
        if len(piece) <= max_chars:
            result.append((piece, path, first + 1, last))
            continue
        # Oversized even on its own: sub-chunk and re-base the line numbers.
        result.extend(
            (sub_text, sub_path, sub_start + first, sub_end + first)
            for sub_text, sub_path, sub_start, sub_end in self._chunk_text(
                piece, path, max_chars, overlap
            )
        )

    return result
def _mock_200(count=1, dim=384):
    """Build a stand-in for a successful (HTTP 200) embeddings response.

    The mock's ``json()`` returns ``{"data": [...]}`` with ``count`` items;
    item ``j`` has ``index`` ``j`` and an ``embedding`` of ``dim`` copies of
    ``0.1 * (j + 1)``. ``raise_for_status`` is a no-op mock.
    """
    payload = {"data": []}
    for position in range(count):
        payload["data"].append(
            {"index": position, "embedding": [0.1 * (position + 1)] * dim}
        )

    response = MagicMock()
    response.status_code = 200
    response.json.return_value = payload
    response.raise_for_status = MagicMock()
    return response
class TestAPIEmbedderTokenPacking(unittest.TestCase):
    """Exercise ``APIEmbedder._pack_batches`` under small token budgets."""

    @staticmethod
    def _embedder_with_budget(max_tokens):
        # The item cap is set high so that only the token budget can force a
        # batch split; httpx.Client is patched while the embedder is built.
        config = _make_api_config(
            embed_batch_size=100,
            embed_api_max_tokens_per_batch=max_tokens,
        )
        with patch("httpx.Client"):
            return APIEmbedder(config)

    def test_packs_small_texts_together(self):
        # Budget of 100 tokens (~400 chars per the original estimate).
        embedder = self._embedder_with_budget(100)

        # Five 80-char texts (~20 tokens each): none may be dropped.
        batches = embedder._pack_batches(["x" * 80] * 5)

        self.assertTrue(len(batches) >= 1)
        self.assertEqual(sum(map(len, batches)), 5)

    def test_large_text_gets_own_batch(self):
        # Budget of 50 tokens (~200 chars per the original estimate).
        embedder = self._embedder_with_budget(50)

        # The 800-char text alone exceeds the budget, so it cannot share a
        # batch with both of its neighbours.
        mixed = ["small" * 10, "x" * 800, "tiny"]
        batches = embedder._pack_batches(mixed)

        self.assertTrue(len(batches) >= 2)
class TestAPIEmbedderUrlNormalization(unittest.TestCase):
    """Check how the configured API URL is turned into the endpoint URL."""

    def test_appends_embeddings_path(self):
        base_url = "https://api.example.com/v1"
        with patch("httpx.Client") as client_factory:
            fake_client = MagicMock()
            fake_client.post.return_value = _mock_200(1, 384)
            client_factory.return_value = fake_client

            embedder = APIEmbedder(_make_api_config(embed_api_url=base_url))
            first_endpoint = embedder._endpoints[0]

            # A bare base URL must end up pointing at the embeddings route.
            self.assertTrue(first_endpoint.url.endswith("/embeddings"))

    def test_does_not_double_append(self):
        full_url = "https://api.example.com/v1/embeddings"
        with patch("httpx.Client"):
            embedder = APIEmbedder(_make_api_config(embed_api_url=full_url))
            first_endpoint = embedder._endpoints[0]

            # A URL that already names the route must not get it twice.
            self.assertFalse(
                first_endpoint.url.endswith("/embeddings/embeddings")
            )