diff --git a/.claude/commands/workflow/docs/analyze.md b/.claude/commands/workflow/docs/analyze.md deleted file mode 100644 index 6a43f623..00000000 --- a/.claude/commands/workflow/docs/analyze.md +++ /dev/null @@ -1,1467 +0,0 @@ ---- -name: analyze -description: Multi-phase iterative project analysis with user-guided focus, produces architecture/design/method reports through incremental updates -argument-hint: "[scope|path] [--type=architecture|design|methods|comprehensive]" -allowed-tools: Task(*), AskUserQuestion(*), Read(*), Bash(*), Glob(*), Grep(*), TodoWrite(*), Write(*) ---- - -# Workflow Docs Analyze Command (/workflow:docs:analyze) - -## Overview - -**Iterative project analysis command** that guides users through multi-phase analysis to produce tailored project reports. Uses multi-agent collaboration with incremental report updates based on user feedback. - -**Core Differentiator**: Unlike static analysis tools, this command engages in multi-round dialogue to understand user's specific focus areas and iteratively refines reports until user satisfaction. - -## Report Types - -| Type | Focus | Output | -|------|-------|--------| -| `architecture` | System structure, module relationships, dependencies | ARCHITECTURE-REPORT.md | -| `design` | Design patterns, class diagrams, component interactions | DESIGN-REPORT.md | -| `methods` | Key algorithms, critical paths, core functions explained | METHODS-REPORT.md | -| `comprehensive` | All above combined | COMPREHENSIVE-REPORT.md | - -## Execution Philosophy - -**User-Centric Iteration Model**: -1. **Discover** user's actual analysis needs (not assume) -2. **Explore** codebase from multiple angles -3. **Generate** initial report with clear structure -4. **Iterate** based on user feedback until satisfied -5. 
**Finalize** polished report - -## Execution Process - -``` -Input Parsing: - ├─ Parse: scope/path (optional, defaults to project root) - └─ Parse: --type flag (optional, determines initial focus) - -Phase 1: Requirements Discovery (AskUserQuestion) - ├─ Ask: Report type preference (if not specified) - ├─ Ask: Focus areas within chosen type - ├─ Ask: Depth level (overview/detailed/deep-dive) - └─ Output: analysis_config object - -Phase 2: Project Exploration (cli-explore-agent) - ├─ Execute: Parallel exploration by focus areas - ├─ Generate: exploration-{angle}.json files - └─ Output: explorations-manifest.json - -Phase 3: Deep Analysis (Gemini CLI) - ├─ Execute: Multi-dimensional analysis - ├─ Generate: analysis-{dimension}.json files - └─ Output: analysis-manifest.json - -Phase 4: Initial Report Generation - ├─ Synthesize: Merge all analysis results - ├─ Generate: {TYPE}-REPORT.md (draft) - └─ Output: Present report to user - -Phase 5: Iterative Refinement (Loop) - ├─ Ask: User feedback on current report - ├─ Decision: - │ ├─ "satisfied" → Finalize and exit - │ ├─ "expand section X" → Deep-dive specific area - │ ├─ "add details on Y" → Augment with new analysis - │ └─ "change focus" → Re-run Phase 3 with new config - └─ Update: Incremental report modification - -Finalize: - └─ Output: Final {TYPE}-REPORT.md -``` - -## 5-Phase Execution - -### Phase 1: Requirements Discovery - -**Purpose**: Understand user's actual analysis needs through interactive questioning - -**Step 1.1: Initialize TodoWrite** - -```javascript -TodoWrite([ - {content: "Phase 1: Requirements Discovery", status: "in_progress", activeForm: "Discovering analysis requirements"}, - {content: "Phase 2: Project Exploration", status: "pending", activeForm: "Exploring project structure"}, - {content: "Phase 3: Deep Analysis", status: "pending", activeForm: "Performing deep analysis"}, - {content: "Phase 4: Initial Report Generation", status: "pending", activeForm: "Generating initial report"}, - {content: "Phase 5: Iterative Refinement", status: "pending", activeForm: "Refining report based on feedback"} -]) -``` - -**Step 1.2: Report Type Selection (if not specified)** - -```javascript -if (!type_specified) { - AskUserQuestion({ - questions: [{ - question: "What type of project analysis report would you like?", - header: "Report Type", - multiSelect: false, - options: [ - {label: "Architecture (Recommended)", description: "System structure, module relationships, layer analysis, dependency graph"}, - {label: "Design", description: "Design patterns, class relationships, component interactions, abstraction analysis"}, - {label: "Methods", description: "Key algorithms, critical code paths, core function explanations with examples"}, - {label: "Comprehensive", description: "All above combined into a complete project analysis"} - ] - }] - }) -} -``` - -**Step 1.3: Focus Area Discovery** - -```javascript -// Based on report type, ask focused questions -const focusQuestions = { - architecture: { - question: "Which architectural aspects are you most interested in?", - header: "Focus Areas", - multiSelect: true, - options: [ - {label: "Layer Structure", description: "How the system is divided into layers (presentation, business, data)"}, - {label: "Module Dependencies", description: "How modules depend on and communicate with each other"}, - {label: "Entry Points", description: "How requests flow through the system from entry to response"}, - {label: "Data Flow", description: "How data moves and transforms across the system"} - ] - }, 
- design: { - question: "Which design aspects do you want to understand?", - header: "Design Focus", - multiSelect: true, - options: [ - {label: "Design Patterns", description: "Identify and explain patterns used in the codebase"}, - {label: "Class Relationships", description: "Inheritance, composition, and associations between classes"}, - {label: "Interface Contracts", description: "How components communicate through interfaces/protocols"}, - {label: "State Management", description: "How application state is managed and mutated"} - ] - }, - methods: { - question: "What kind of method analysis do you need?", - header: "Method Focus", - multiSelect: true, - options: [ - {label: "Core Algorithms", description: "Key algorithms that define the system's behavior"}, - {label: "Critical Paths", description: "Most important execution flows in the application"}, - {label: "Public APIs", description: "External-facing methods and their contracts"}, - {label: "Complex Logic", description: "Methods with high cyclomatic complexity explained"} - ] - } -}; - -AskUserQuestion({questions: [focusQuestions[selected_type]]}) -``` - -**Step 1.4: Depth Level Selection** - -```javascript -AskUserQuestion({ - questions: [{ - question: "How detailed should the analysis be?", - header: "Depth", - multiSelect: false, - options: [ - {label: "Overview (Recommended)", description: "High-level understanding, suitable for onboarding or quick review"}, - {label: "Detailed", description: "In-depth analysis with code examples and explanations"}, - {label: "Deep-Dive", description: "Exhaustive analysis with implementation details and edge cases"} - ] - }] -}) -``` - -**Step 1.5: Store Analysis Config** - -```javascript -const analysis_config = { - type: selected_type, - focus_areas: selected_focus_areas, - depth: selected_depth, - scope: scope_path || ".", - timestamp: new Date().toISOString() -}; - -// Create output directory -const outputDir = `.workflow/.scratchpad/analyze-${Date.now()}`; -bash(`mkdir -p ${outputDir}`); -Write(`${outputDir}/analysis-config.json`, JSON.stringify(analysis_config, null, 2)); -``` - -**TodoWrite**: Mark Phase 1 completed, Phase 2 in_progress - ---- - -### Phase 2: Project Exploration - -**Purpose**: Multi-angle exploration based on user's focus areas - -**Step 2.1: Launch Parallel Explore Agents** - -```javascript -const explorationAngles = mapFocusToAngles(analysis_config.focus_areas); -// Examples: -// - "Layer Structure" → ["architecture", "dependencies"] -// - "Design Patterns" → ["patterns", "abstractions"] -// - "Core Algorithms" → ["algorithms", "complexity"] - -const explorationTasks = explorationAngles.map((angle, index) => - Task({ - subagent_type: "cli-explore-agent", - run_in_background: false, - description: `Explore: ${angle}`, - prompt: ` -## Exploration Objective -Execute **${angle}** exploration for project analysis report. 
- -## Context -- **Angle**: ${angle} -- **Report Type**: ${analysis_config.type} -- **Depth**: ${analysis_config.depth} -- **Scope**: ${analysis_config.scope} -- **Output**: ${outputDir}/exploration-${angle}.json - -## Exploration Protocol - -**Step 1: Structural Discovery** -- Execute: ccw tool exec get_modules_by_depth '{}' -- Find files relevant to ${angle} using rg/glob -- Map directory structure and module boundaries - -**Step 2: Pattern Recognition** -- Identify ${angle}-related patterns and conventions -- Note file naming, organization patterns -- Extract key abstractions - -**Step 3: Relationship Mapping** -- Map dependencies and relationships -- Identify integration points -- Note coupling and cohesion patterns - -## Output Format -{ - "angle": "${angle}", - "findings": { - "structure": [...], - "patterns": [...], - "relationships": [...], - "key_files": [{path, relevance, rationale}] - }, - "insights": [...], - "depth_notes": "${analysis_config.depth}-specific observations" -} - -Write output to: ${outputDir}/exploration-${angle}.json -` - }) -); -``` - -**Step 2.2: Generate Exploration Manifest** - -```javascript -const manifest = { - config: analysis_config, - explorations: explorationAngles.map(angle => ({ - angle, - file: `exploration-${angle}.json` - })), - timestamp: new Date().toISOString() -}; -Write(`${outputDir}/explorations-manifest.json`, JSON.stringify(manifest, null, 2)); -``` - -**TodoWrite**: Mark Phase 2 completed, Phase 3 in_progress - ---- - -### Phase 3: Deep Analysis - -**Purpose**: Use Gemini CLI for in-depth analysis based on exploration results - -**Step 3.1: Synthesize Exploration Results** - -```javascript -// Read all exploration files -const explorations = explorationAngles.map(angle => - JSON.parse(Read(`${outputDir}/exploration-${angle}.json`)) -); - -// Extract key files to analyze -const keyFiles = explorations.flatMap(e => e.findings.key_files) - .sort((a, b) => b.relevance - a.relevance) - .slice(0, 20); // Top 20 most relevant files -``` - -**Step 3.2: Execute Deep Analysis via Gemini CLI** - -```javascript -const analysisPrompt = buildAnalysisPrompt(analysis_config, keyFiles); - -Bash({ - command: `ccw cli -p " -PURPOSE: Generate ${analysis_config.type} analysis report for project -TASK: - • Analyze project structure and patterns from ${analysis_config.type} perspective - • Focus on: ${analysis_config.focus_areas.join(', ')} - • Depth level: ${analysis_config.depth} - • Key files to analyze: ${keyFiles.map(f => f.path).join(', ')} -MODE: analysis -CONTEXT: @**/* | Exploration results from ${outputDir}/ -EXPECTED: - - Structured ${analysis_config.type} analysis - - Clear explanations with code references - - Mermaid diagrams where applicable - - Actionable insights -RULES: $(cat ~/.claude/workflows/cli-templates/protocols/analysis-protocol.md) | - Focus on ${analysis_config.focus_areas.join(', ')} | - Depth: ${analysis_config.depth} -" --tool gemini --mode analysis`, - run_in_background: true, - timeout: 600000 -}); -``` - -**Step 3.3: Store Analysis Results** - -```javascript -// After CLI completes, store structured results -Write(`${outputDir}/deep-analysis.json`, JSON.stringify({ - type: analysis_config.type, - focus_areas: analysis_config.focus_areas, - analysis_result: cli_output, - diagrams: extracted_diagrams, - code_references: extracted_references -}, null, 2)); -``` - -**TodoWrite**: Mark Phase 3 completed, Phase 3.5 in_progress - ---- - -### Phase 3.5: Diagram Generation - -**Purpose**: Generate Mermaid diagrams based on analysis 
results, tailored to report type - -**Step 3.5.1: Determine Required Diagrams** - -```javascript -// Map report types to required diagram types -const diagramRequirements = { - architecture: [ - {type: 'architecture', format: 'graph TD', name: 'System Architecture'}, - {type: 'layers', format: 'graph TD', name: 'Layer Structure'}, - {type: 'dependencies', format: 'graph LR', name: 'Module Dependencies'}, - {type: 'dataflow', format: 'flowchart LR', name: 'Data Flow'} - ], - design: [ - {type: 'class', format: 'classDiagram', name: 'Class Diagram'}, - {type: 'components', format: 'graph TD', name: 'Component Diagram'}, - {type: 'patterns', format: 'graph TD', name: 'Design Patterns'}, - {type: 'state', format: 'stateDiagram-v2', name: 'State Management'} - ], - methods: [ - {type: 'algorithm', format: 'flowchart TD', name: 'Algorithm Flowchart'}, - {type: 'sequence', format: 'sequenceDiagram', name: 'Call Sequence'}, - {type: 'critical_path', format: 'flowchart LR', name: 'Critical Path'}, - {type: 'api', format: 'graph LR', name: 'API Structure'} - ], - comprehensive: [ - {type: 'architecture', format: 'graph TD', name: 'System Architecture'}, - {type: 'class', format: 'classDiagram', name: 'Class Diagram'}, - {type: 'sequence', format: 'sequenceDiagram', name: 'Key Sequences'}, - {type: 'dataflow', format: 'flowchart LR', name: 'Data Flow'} - ] -}; - -const requiredDiagrams = diagramRequirements[config.type]; -bash(`mkdir -p ${outputDir}/diagrams`); -``` - -**Step 3.5.2: Generate Architecture Diagrams (graph TD)** - -```javascript -function generateArchitectureDiagram(analysis) { - let mermaid = 'graph TD\n'; - - // Subgraphs for layers/modules - const layers = analysis.structure?.layers || extractLayers(analysis); - for (const layer of layers) { - mermaid += ` subgraph ${sanitizeId(layer.name)}["${escapeLabel(layer.name)}"]\n`; - for (const component of layer.components || []) { - mermaid += ` ${sanitizeId(component.id)}["${escapeLabel(component.name)}"]\n`; - } - mermaid += ' end\n'; - } - - // Connections - const connections = analysis.relationships || []; - for (const conn of connections) { - const label = conn.label ? 
`|"${escapeLabel(conn.label)}"|` : ''; - mermaid += ` ${sanitizeId(conn.from)} -->${label} ${sanitizeId(conn.to)}\n`; - } - - // Styling - mermaid += '\n %% Styling\n'; - mermaid += ' classDef core fill:#e1f5fe,stroke:#01579b\n'; - mermaid += ' classDef external fill:#fff3e0,stroke:#e65100\n'; - - return mermaid; -} - -if (requiredDiagrams.some(d => d.type === 'architecture')) { - const archDiagram = generateArchitectureDiagram(deepAnalysis); - Write(`${outputDir}/diagrams/architecture.mmd`, archDiagram); -} -``` - -**Step 3.5.3: Generate Layer/Dependency Diagrams** - -```javascript -function generateLayerDiagram(analysis) { - let mermaid = 'graph TD\n'; - - const layers = ['Presentation', 'Application', 'Domain', 'Infrastructure']; - const detectedLayers = analysis.layers || inferLayers(analysis); - - for (let i = 0; i < detectedLayers.length; i++) { - const layer = detectedLayers[i]; - mermaid += ` subgraph L${i}["${escapeLabel(layer.name)}"]\n`; - mermaid += ` direction LR\n`; - for (const mod of layer.modules || []) { - mermaid += ` ${sanitizeId(mod)}["${escapeLabel(mod)}"]\n`; - } - mermaid += ' end\n'; - - if (i < detectedLayers.length - 1) { - mermaid += ` L${i} --> L${i + 1}\n`; - } - } - - return mermaid; -} - -function generateDependencyDiagram(analysis) { - let mermaid = 'graph LR\n'; - - const modules = analysis.modules || []; - const deps = analysis.dependencies || []; - - // Nodes - for (const mod of modules) { - const shape = mod.type === 'core' ? '([' : '['; - const shapeEnd = mod.type === 'core' ? '])' : ']'; - mermaid += ` ${sanitizeId(mod.name)}${shape}"${escapeLabel(mod.name)}"${shapeEnd}\n`; - } - - // Edges with labels - for (const dep of deps) { - const style = dep.type === 'weak' ? '-.->' : '-->'; - const label = dep.count ? `|"${dep.count} refs"|` : ''; - mermaid += ` ${sanitizeId(dep.from)} ${style}${label} ${sanitizeId(dep.to)}\n`; - } - - return mermaid; -} - -if (requiredDiagrams.some(d => d.type === 'layers')) { - Write(`${outputDir}/diagrams/layers.mmd`, generateLayerDiagram(deepAnalysis)); -} -if (requiredDiagrams.some(d => d.type === 'dependencies')) { - Write(`${outputDir}/diagrams/dependencies.mmd`, generateDependencyDiagram(deepAnalysis)); -} -``` - -**Step 3.5.4: Generate Class Diagrams (classDiagram)** - -```javascript -function generateClassDiagram(analysis) { - let mermaid = 'classDiagram\n'; - - const entities = analysis.entities || analysis.classes || []; - - // Classes with properties and methods - for (const entity of entities.slice(0, 15)) { // Limit for readability - mermaid += ` class ${sanitizeId(entity.name)} {\n`; - - // Properties - for (const prop of (entity.properties || []).slice(0, 8)) { - const vis = {public: '+', private: '-', protected: '#'}[prop.visibility] || '+'; - mermaid += ` ${vis}${sanitizeType(prop.type)} ${prop.name}\n`; - } - - // Methods - for (const method of (entity.methods || []).slice(0, 6)) { - const vis = {public: '+', private: '-', protected: '#'}[method.visibility] || '+'; - const params = (method.params || []).map(p => `${p.name}`).join(', '); - mermaid += ` ${vis}${method.name}(${params}) ${sanitizeType(method.returnType || 'void')}\n`; - } - - mermaid += ' }\n'; - } - - // Relationships - const relationships = analysis.relationships || []; - const arrows = { - inheritance: '--|>', - implementation: '..|>', - composition: '*--', - aggregation: 'o--', - association: '-->', - dependency: '..>' - }; - - for (const rel of relationships) { - const arrow = arrows[rel.type] || '-->'; - const label = rel.label ? 
` : ${escapeLabel(rel.label)}` : ''; - mermaid += ` ${sanitizeId(rel.from)} ${arrow} ${sanitizeId(rel.to)}${label}\n`; - } - - return mermaid; -} - -if (requiredDiagrams.some(d => d.type === 'class')) { - Write(`${outputDir}/diagrams/class-diagram.mmd`, generateClassDiagram(deepAnalysis)); -} -``` - -**Step 3.5.5: Generate Flowcharts (flowchart TD)** - -```javascript -function generateAlgorithmFlowchart(algorithm) { - let mermaid = 'flowchart TD\n'; - - // Start - mermaid += ` START(["Start: ${escapeLabel(algorithm.name)}"])\n`; - - // Input processing - if (algorithm.inputs?.length > 0) { - mermaid += ` INPUT[/"Input: ${algorithm.inputs.map(i => i.name).join(', ')}"/]\n`; - mermaid += ' START --> INPUT\n'; - } - - // Process steps - let prevNode = algorithm.inputs?.length > 0 ? 'INPUT' : 'START'; - for (let i = 0; i < (algorithm.steps || []).length; i++) { - const step = algorithm.steps[i]; - const nodeId = `STEP${i}`; - - if (step.type === 'decision') { - mermaid += ` ${nodeId}{"${escapeLabel(step.description)}"}\n`; - mermaid += ` ${prevNode} --> ${nodeId}\n`; - if (step.yes_branch) { - mermaid += ` ${nodeId} -->|"Yes"| ${sanitizeId(step.yes_branch)}\n`; - } - if (step.no_branch) { - mermaid += ` ${nodeId} -->|"No"| ${sanitizeId(step.no_branch)}\n`; - } - } else { - mermaid += ` ${nodeId}["${escapeLabel(step.description)}"]\n`; - mermaid += ` ${prevNode} --> ${nodeId}\n`; - } - prevNode = nodeId; - } - - // Output and end - mermaid += ` OUTPUT[/"Output: ${escapeLabel(algorithm.output || 'Result')}"/]\n`; - mermaid += ` ${prevNode} --> OUTPUT\n`; - mermaid += ' END_([End])\n'; - mermaid += ' OUTPUT --> END_\n'; - - return mermaid; -} - -function generateDataFlowDiagram(analysis) { - let mermaid = 'flowchart LR\n'; - - const flows = analysis.dataFlows || []; - - // Data stores - mermaid += ' subgraph DataStores["Data Stores"]\n'; - for (const store of analysis.dataStores || []) { - mermaid += ` ${sanitizeId(store.id)}[("${escapeLabel(store.name)}")]\n`; - } - mermaid += ' end\n'; - - // Processes - mermaid += ' subgraph Processes["Processes"]\n'; - for (const proc of analysis.processes || []) { - mermaid += ` ${sanitizeId(proc.id)}["${escapeLabel(proc.name)}"]\n`; - } - mermaid += ' end\n'; - - // Flows - for (const flow of flows) { - mermaid += ` ${sanitizeId(flow.from)} -->|"${escapeLabel(flow.data)}"| ${sanitizeId(flow.to)}\n`; - } - - return mermaid; -} - -// Generate algorithm flowcharts -const algorithms = deepAnalysis.algorithms || []; -for (const algo of algorithms.slice(0, 3)) { // Top 3 algorithms - const flowchart = generateAlgorithmFlowchart(algo); - Write(`${outputDir}/diagrams/algorithm-${sanitizeId(algo.name)}.mmd`, flowchart); -} - -if (requiredDiagrams.some(d => d.type === 'dataflow')) { - Write(`${outputDir}/diagrams/dataflow.mmd`, generateDataFlowDiagram(deepAnalysis)); -} -``` - -**Step 3.5.6: Generate Sequence Diagrams (sequenceDiagram)** - -```javascript -function generateSequenceDiagram(scenario) { - let mermaid = 'sequenceDiagram\n'; - - // Title - mermaid += ` title ${escapeLabel(scenario.name)}\n`; - - // Participants - for (const actor of scenario.actors || []) { - const type = actor.type === 'external' ? 
'actor' : 'participant'; - mermaid += ` ${type} ${sanitizeId(actor.id)} as ${escapeLabel(actor.name)}\n`; - } - - mermaid += '\n'; - - // Messages - for (const msg of scenario.messages || []) { - let arrow; - switch (msg.type) { - case 'async': arrow = '->>'; break; - case 'response': arrow = '-->>'; break; - case 'create': arrow = '->>+'; break; - case 'destroy': arrow = '->>-'; break; - default: arrow = '->>'; - } - - mermaid += ` ${sanitizeId(msg.from)}${arrow}${sanitizeId(msg.to)}: ${escapeLabel(msg.description)}\n`; - - // Activations - if (msg.activate) { - mermaid += ` activate ${sanitizeId(msg.to)}\n`; - } - if (msg.deactivate) { - mermaid += ` deactivate ${sanitizeId(msg.from)}\n`; - } - - // Notes - if (msg.note) { - mermaid += ` Note over ${sanitizeId(msg.to)}: ${escapeLabel(msg.note)}\n`; - } - } - - // Loops and conditions - for (const block of scenario.blocks || []) { - if (block.type === 'loop') { - mermaid += ` loop ${escapeLabel(block.condition)}\n`; - for (const m of block.messages) { - mermaid += ` ${sanitizeId(m.from)}->>${sanitizeId(m.to)}: ${escapeLabel(m.description)}\n`; - } - mermaid += ' end\n'; - } - } - - return mermaid; -} - -// Generate sequence diagrams for key scenarios -const scenarios = deepAnalysis.sequences || deepAnalysis.scenarios || []; -for (const scenario of scenarios.slice(0, 3)) { - const seqDiagram = generateSequenceDiagram(scenario); - Write(`${outputDir}/diagrams/sequence-${sanitizeId(scenario.name)}.mmd`, seqDiagram); -} -``` - -**Step 3.5.7: Generate State Diagrams (stateDiagram-v2)** - -```javascript -function generateStateDiagram(stateAnalysis) { - let mermaid = 'stateDiagram-v2\n'; - - const states = stateAnalysis.states || []; - const transitions = stateAnalysis.transitions || []; - - // States - for (const state of states) { - if (state.type === 'initial') { - mermaid += ` [*] --> ${sanitizeId(state.name)}\n`; - } else if (state.type === 'final') { - mermaid += ` ${sanitizeId(state.name)} --> [*]\n`; - } else if (state.substates) { - mermaid += ` state ${sanitizeId(state.name)} {\n`; - for (const sub of state.substates) { - mermaid += ` ${sanitizeId(sub.name)}\n`; - } - mermaid += ' }\n'; - } else { - mermaid += ` ${sanitizeId(state.name)} : ${escapeLabel(state.description || state.name)}\n`; - } - } - - // Transitions - for (const trans of transitions) { - const label = trans.event ? 
` : ${escapeLabel(trans.event)}` : ''; - mermaid += ` ${sanitizeId(trans.from)} --> ${sanitizeId(trans.to)}${label}\n`; - } - - return mermaid; -} - -if (requiredDiagrams.some(d => d.type === 'state')) { - const stateAnalysis = deepAnalysis.stateManagement || {}; - if (stateAnalysis.states?.length > 0) { - Write(`${outputDir}/diagrams/state.mmd`, generateStateDiagram(stateAnalysis)); - } -} -``` - -**Step 3.5.8: Helper Functions & Validation** - -```javascript -// Sanitize ID for Mermaid (only alphanumeric and underscore) -function sanitizeId(text) { - return text.replace(/[^a-zA-Z0-9_]/g, '_').replace(/^[0-9]/, '_$&'); -} - -// Escape special characters in labels -function escapeLabel(text) { - if (!text) return ''; - return text - .replace(/"/g, "'") - .replace(/[(){}[\]<>]/g, char => `#${char.charCodeAt(0)};`) - .substring(0, 50); // Limit length -} - -// Sanitize type names -function sanitizeType(type) { - if (!type) return 'any'; - return type.replace(/[<>]/g, '').replace(/\|/g, ' or '); -} - -// Validate all generated diagrams -function validateMermaidDiagrams(diagramDir) { - const files = Glob(`${diagramDir}/*.mmd`); - const results = []; - - for (const file of files) { - const content = Read(file); - const issues = []; - - // Check for common syntax errors - if (content.includes('undefined')) issues.push('Contains undefined'); - if (content.match(/-->.*-->/)) issues.push('Double arrow syntax'); - if (!content.match(/^(graph|flowchart|classDiagram|sequenceDiagram|stateDiagram)/m)) { - issues.push('Missing diagram type declaration'); - } - - // Check for unescaped special chars in labels - const labelMatches = content.match(/\["[^"]*[(){}[\]<>][^"]*"\]/g); - if (labelMatches?.some(m => !m.includes('#'))) { - issues.push('Unescaped special characters in labels'); - } - - results.push({ - file: file.split('/').pop(), - valid: issues.length === 0, - issues - }); - } - - return results; -} - -const validationResults = validateMermaidDiagrams(`${outputDir}/diagrams`); -Write(`${outputDir}/diagrams/validation.json`, JSON.stringify(validationResults, null, 2)); - -// Log validation summary -const validCount = validationResults.filter(r => r.valid).length; -console.log(`Diagram Validation: ${validCount}/${validationResults.length} passed`); -``` - -**Step 3.5.9: Generate Diagram Manifest** - -```javascript -const diagramManifest = { - report_type: config.type, - generated_at: new Date().toISOString(), - diagrams: requiredDiagrams.map(req => { - const filename = `${req.type}.mmd`; - const exists = file_exists(`${outputDir}/diagrams/${filename}`); - return { - type: req.type, - format: req.format, - name: req.name, - filename: exists ? filename : null, - status: exists ? 
'generated' : 'skipped' - }; - }), - validation: validationResults -}; - -Write(`${outputDir}/diagrams/manifest.json`, JSON.stringify(diagramManifest, null, 2)); -``` - -**TodoWrite**: Mark Phase 3.5 completed, Phase 4 in_progress - ---- - -### Phase 4: Initial Report Generation - -**Purpose**: Synthesize all analysis into structured report - -**Step 4.1: Load All Analysis Data and Diagrams** - -```javascript -const config = JSON.parse(Read(`${outputDir}/analysis-config.json`)); -const explorations = JSON.parse(Read(`${outputDir}/explorations-manifest.json`)); -const deepAnalysis = JSON.parse(Read(`${outputDir}/deep-analysis.json`)); -const diagramManifest = JSON.parse(Read(`${outputDir}/diagrams/manifest.json`)); - -// Load all generated diagrams -const diagrams = {}; -for (const diag of diagramManifest.diagrams.filter(d => d.status === 'generated')) { - diagrams[diag.type] = Read(`${outputDir}/diagrams/${diag.filename}`); -} -``` - -**Step 4.2: Generate Report Structure** - -```javascript -const reportTemplate = getReportTemplate(config.type); -// Templates define section structure for each report type - -const reportSections = { - architecture: [ - "# Architecture Report", - "## Executive Summary", - "## System Overview", - "## Layer Analysis", - "## Module Dependencies", - "## Data Flow", - "## Entry Points & Critical Paths", - "## Dependency Graph (Mermaid)", - "## Recommendations" - ], - design: [ - "# Design Report", - "## Executive Summary", - "## Design Patterns Used", - "## Class Relationships", - "## Interface Contracts", - "## State Management", - "## Component Diagrams (Mermaid)", - "## Design Recommendations" - ], - methods: [ - "# Key Methods Report", - "## Executive Summary", - "## Core Algorithms", - "## Critical Code Paths", - "## Public API Reference", - "## Complex Logic Breakdown", - "## Sequence Diagrams (Mermaid)", - "## Optimization Suggestions" - ], - comprehensive: [ - "# Comprehensive Project Analysis", - "## Executive Summary", - "## Architecture Overview", - "## Design Patterns & Principles", - "## Key Methods & Algorithms", - "## System Diagrams", - "## Recommendations & Next Steps" - ] -}; -``` - -**Step 4.3: Generate Initial Report** - -```javascript -const reportPath = `${outputDir}/${config.type.toUpperCase()}-REPORT.md`; -const report = generateReport(reportSections[config.type], explorations, deepAnalysis); -Write(reportPath, report); -``` - -**Step 4.4: Present to User** - -```markdown -## Initial Report Generated - -**Report Type**: ${config.type} -**Focus Areas**: ${config.focus_areas.join(', ')} -**Depth**: ${config.depth} - -**Output**: ${reportPath} - -### Report Preview -[First 50 lines of report...] - ---- -**Ready for your feedback.** The report can be refined based on your input. -``` - -**TodoWrite**: Mark Phase 4 completed, Phase 5 in_progress - ---- - -### Phase 5: Iterative Refinement (Discovery-Driven) - -**Purpose**: Generate targeted questions based on agent analysis findings, then refine report based on user responses - -**Core Philosophy**: Questions are NOT predetermined. They emerge from what the agent discovered during analysis. 
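-
-The six steps below form one refinement loop. A minimal sketch of that control flow is shown here, assuming the helpers defined in Steps 5.1–5.6 (`extractDiscoveries`, `buildDynamicQuestions`, `processResponses`, `applyUpdates`, `summarizeIteration`) and the 7-iteration safety limit from Step 5.5; the response keys shown are schematic, not a fixed contract:
-
-```javascript
-// Illustrative outline only - the concrete logic lives in Steps 5.1-5.6 below
-let iteration_count = 0;
-let finalized = false;
-
-while (!finalized && iteration_count < 7) {                      // safety limit (Step 5.5)
-  const discoveries = extractDiscoveries(deepAnalysis);          // Step 5.1
-  const questions = buildDynamicQuestions(discoveries, config);  // Step 5.2
-  const responses = AskUserQuestion({questions});                 // at most 4 questions per round
-
-  if (responses.action === 'Finalize report') { finalized = true; break; }
-  if (responses.action === 'Change scope')    { /* return to Phase 1, Step 1.3 */ break; }
-
-  const updates = processResponses(responses, discoveries, deepAnalysis); // Step 5.3
-  await applyUpdates(updates, outputDir, config);                          // Step 5.4
-
-  iteration_count++;
-  console.log(summarizeIteration(iteration_count, updates, discoveries)); // Step 5.6
-}
-```
-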
- -**Step 5.1: Extract Discoveries from Analysis** - -```javascript -// Parse deep analysis results for actionable discoveries -function extractDiscoveries(deepAnalysis) { - return { - // Ambiguities: Areas where analysis found multiple interpretations - ambiguities: deepAnalysis.findings.filter(f => f.confidence < 0.7), - - // Complexity hotspots: Areas that may need deeper explanation - complexityHotspots: deepAnalysis.findings.filter(f => f.complexity === 'high'), - - // Pattern variations: Where code deviates from expected patterns - patternDeviations: deepAnalysis.patterns.filter(p => p.consistency < 0.8), - - // Unclear dependencies: Relationships that couldn't be fully traced - unclearDependencies: deepAnalysis.dependencies.filter(d => d.type === 'implicit'), - - // Potential issues: Areas flagged for attention - potentialIssues: deepAnalysis.recommendations.filter(r => r.priority === 'investigate'), - - // Depth opportunities: Sections that could benefit from expansion - depthOpportunities: deepAnalysis.sections.filter(s => s.has_more_detail) - }; -} - -const discoveries = extractDiscoveries(deepAnalysis); -``` - -**Step 5.2: Generate Dynamic Questions Based on Discoveries** - -```javascript -function buildDynamicQuestions(discoveries, config) { - const questions = []; - - // Question 1: Address ambiguities found - if (discoveries.ambiguities.length > 0) { - const topAmbiguity = discoveries.ambiguities[0]; - questions.push({ - question: `Analysis found ambiguity in "${topAmbiguity.area}": ${topAmbiguity.description}. Which interpretation is correct?`, - header: "Clarify", - multiSelect: false, - options: topAmbiguity.interpretations.map((interp, i) => ({ - label: interp.summary, - description: interp.detail - })) - }); - } - - // Question 2: Complexity deep-dive selection - if (discoveries.complexityHotspots.length > 0) { - questions.push({ - question: `These areas have high complexity. Which would you like explained in more detail?`, - header: "Deep-Dive", - multiSelect: true, - options: discoveries.complexityHotspots.slice(0, 4).map(h => ({ - label: h.name, - description: `${h.file}:${h.line} - ${h.brief}` - })) - }); - } - - // Question 3: Pattern deviation confirmation - if (discoveries.patternDeviations.length > 0) { - const deviation = discoveries.patternDeviations[0]; - questions.push({ - question: `Found pattern inconsistency: "${deviation.pattern}" is used differently in ${deviation.locations.length} places. Should I analyze this deviation?`, - header: "Patterns", - multiSelect: false, - options: [ - {label: "Yes, analyze differences", description: "Add detailed comparison to report"}, - {label: "Skip, it's intentional", description: "Note as intentional variation"}, - {label: "Flag as tech debt", description: "Add to recommendations section"} - ] - }); - } - - // Question 4: Dependency clarification - if (discoveries.unclearDependencies.length > 0) { - questions.push({ - question: `Found ${discoveries.unclearDependencies.length} implicit dependencies. 
Should I trace them fully?`, - header: "Dependencies", - multiSelect: false, - options: [ - {label: "Yes, trace all (Recommended)", description: "Complete dependency graph, may take longer"}, - {label: "Only critical paths", description: "Focus on main execution flows"}, - {label: "Skip for now", description: "Keep current analysis level"} - ] - }); - } - - // Always include satisfaction check as final question - questions.push({ - question: "How would you like to proceed with the current report?", - header: "Action", - multiSelect: false, - options: [ - {label: "Continue refining", description: "Apply above selections and show next discoveries"}, - {label: "Finalize report", description: "Report meets my needs, generate final version"}, - {label: "Change scope", description: "Adjust focus areas or analysis depth"} - ] - }); - - return questions.slice(0, 4); // Max 4 questions per iteration -} - -const dynamicQuestions = buildDynamicQuestions(discoveries, config); -AskUserQuestion({questions: dynamicQuestions}); -``` - -**Step 5.3: Process User Responses and Update Analysis** - -```javascript -function processResponses(responses, discoveries, deepAnalysis) { - const updates = []; - - // Handle ambiguity resolution - if (responses.clarify) { - const resolution = { - area: discoveries.ambiguities[0].area, - resolved_to: responses.clarify, - confidence: 1.0 // User confirmed - }; - updates.push({type: 'resolve_ambiguity', data: resolution}); - } - - // Handle deep-dive requests - if (responses.deepDive && responses.deepDive.length > 0) { - const deepDiveTargets = responses.deepDive; - updates.push({type: 'expand_sections', data: deepDiveTargets}); - } - - // Handle pattern deviation response - if (responses.patterns) { - updates.push({type: 'pattern_handling', data: responses.patterns}); - } - - // Handle dependency tracing - if (responses.dependencies) { - updates.push({type: 'dependency_depth', data: responses.dependencies}); - } - - return updates; -} - -const updates = processResponses(user_responses, discoveries, deepAnalysis); -``` - -**Step 5.4: Incremental Analysis & Report Update** - -```javascript -// Execute targeted re-analysis based on user responses -async function applyUpdates(updates, outputDir, config) { - for (const update of updates) { - switch (update.type) { - case 'expand_sections': - // Re-run Gemini CLI for specific sections only - for (const target of update.data) { - Bash({ - command: `ccw cli -p " -PURPOSE: Deep-dive analysis of ${target.name} -TASK: Provide detailed explanation of ${target.file}:${target.line} - • Trace execution flow step by step - • Explain complex logic with examples - • Show relationships to other components -MODE: analysis -CONTEXT: @${target.file} | Previous analysis from ${outputDir}/ -EXPECTED: Detailed section content for report insertion -RULES: $(cat ~/.claude/workflows/cli-templates/protocols/analysis-protocol.md) -" --tool gemini --mode analysis`, - run_in_background: true - }); - } - break; - - case 'resolve_ambiguity': - // Update analysis with confirmed interpretation - deepAnalysis.findings = deepAnalysis.findings.map(f => - f.area === update.data.area - ? 
{...f, interpretation: update.data.resolved_to, confidence: 1.0} - : f - ); - break; - - case 'dependency_depth': - if (update.data === 'Yes, trace all (Recommended)') { - // Launch dependency tracing agent - Task({ - subagent_type: "cli-explore-agent", - description: "Trace full dependencies", - prompt: `Trace all implicit dependencies found in ${outputDir}/deep-analysis.json` - }); - } - break; - } - } -} - -await applyUpdates(updates, outputDir, config); - -// Regenerate report sections affected by updates -const currentReport = Read(reportPath); -const updatedSections = regenerateAffectedSections(currentReport, updates); -Write(reportPath, updatedSections); -Write(`${outputDir}/iterations/v${iteration_count}.md`, currentReport); // Archive previous version -``` - -**Step 5.5: Check Termination or Continue Loop** - -```javascript -iteration_count++; - -if (user_responses.action === 'Finalize report') { - // Exit refinement loop - finalized = true; -} else if (user_responses.action === 'Change scope') { - // Return to Phase 1 for scope adjustment - goto Phase1_Step1_3; // Focus area selection -} else { - // Extract new discoveries from updated analysis - const newDiscoveries = extractDiscoveries(deepAnalysis); - - if (newDiscoveries.total_count === 0) { - // No more discoveries to clarify - console.log("All ambiguities resolved. Report is comprehensive."); - AskUserQuestion({ - questions: [{ - question: "No further clarifications needed. Finalize the report?", - header: "Complete", - multiSelect: false, - options: [ - {label: "Yes, finalize", description: "Generate final polished report"}, - {label: "Add custom section", description: "I want to add something specific"} - ] - }] - }); - } else { - // Continue with next round of discovery-driven questions - goto Step5.2; - } -} - -// Safety limit -if (iteration_count > 7) { - console.log("Maximum refinement iterations reached. Finalizing report."); - finalized = true; -} -``` - -**Step 5.6: Discovery Summary Between Iterations** - -```javascript -// Show user what was learned in this iteration -function summarizeIteration(iteration_count, updates, discoveries) { - return ` -### Iteration ${iteration_count} Summary - -**Clarifications Applied**: ${updates.filter(u => u.type === 'resolve_ambiguity').length} -**Sections Expanded**: ${updates.filter(u => u.type === 'expand_sections').length} -**Patterns Addressed**: ${updates.filter(u => u.type === 'pattern_handling').length} - -**Remaining Discoveries**: -- Ambiguities: ${discoveries.ambiguities.length} -- Complexity hotspots: ${discoveries.complexityHotspots.length} -- Pattern deviations: ${discoveries.patternDeviations.length} - -**Report Sections Updated**: [list affected sections] -`; -} - -console.log(summarizeIteration(iteration_count, updates, discoveries)); -``` - ---- - -### Finalization - -**Step 6.1: Polish Final Report** - -```javascript -// Add metadata and finalize formatting -const finalReport = ` - - - - -${report_content} - ---- -*Report generated through ${iteration_count} refinement iterations.* -`; - -Write(reportPath, finalReport); -``` - -**Step 6.2: Summary Output** - -```markdown -## Analysis Complete - -**Final Report**: ${reportPath} -**Report Type**: ${config.type} -**Focus Areas**: ${config.focus_areas.join(', ')} -**Refinement Iterations**: ${iteration_count} - -### Report Contents -- ${section_count} sections -- ${diagram_count} Mermaid diagrams -- ${code_reference_count} code references - -### Recommended Next Steps -1. Share report with team members -2. 
Use findings to guide development decisions -3. Run `/workflow:plan` to create implementation tasks based on recommendations -``` - -**TodoWrite**: Mark Phase 5 completed - ---- - -## TodoWrite Pattern - -**Dynamic Task Tracking** with phase-level visibility: - -```javascript -// Initial state -[ - {content: "Phase 1: Requirements Discovery", status: "pending", activeForm: "Discovering analysis requirements"}, - {content: "Phase 2: Project Exploration", status: "pending", activeForm: "Exploring project structure"}, - {content: "Phase 3: Deep Analysis", status: "pending", activeForm: "Performing deep analysis"}, - {content: "Phase 3.5: Diagram Generation", status: "pending", activeForm: "Generating Mermaid diagrams"}, - {content: "Phase 4: Initial Report Generation", status: "pending", activeForm: "Generating initial report"}, - {content: "Phase 5: Iterative Refinement", status: "pending", activeForm: "Refining report based on feedback"} -] - -// During Phase 3.5, show diagram generation progress -[ - {content: "Phase 1: Requirements Discovery", status: "completed", activeForm: "Discovering analysis requirements"}, - {content: "Phase 2: Project Exploration", status: "completed", activeForm: "Exploring project structure"}, - {content: "Phase 3: Deep Analysis", status: "completed", activeForm: "Performing deep analysis"}, - {content: "Phase 3.5: Diagram Generation", status: "in_progress", activeForm: "Generating Mermaid diagrams"}, - {content: " → Architecture diagram", status: "completed", activeForm: "Generating architecture diagram"}, - {content: " → Class diagram", status: "in_progress", activeForm: "Generating class diagram"}, - {content: " → Sequence diagrams", status: "pending", activeForm: "Generating sequence diagrams"}, - {content: "Phase 4: Initial Report Generation", status: "pending", activeForm: "Generating initial report"}, - {content: "Phase 5: Iterative Refinement", status: "pending", activeForm: "Refining report based on feedback"} -] - -// During Phase 5 iterations, expand to show current refinement -[ - {content: "Phase 1: Requirements Discovery", status: "completed", activeForm: "Discovering analysis requirements"}, - {content: "Phase 2: Project Exploration", status: "completed", activeForm: "Exploring project structure"}, - {content: "Phase 3: Deep Analysis", status: "completed", activeForm: "Performing deep analysis"}, - {content: "Phase 3.5: Diagram Generation", status: "completed", activeForm: "Generating Mermaid diagrams"}, - {content: "Phase 4: Initial Report Generation", status: "completed", activeForm: "Generating initial report"}, - {content: "Phase 5: Iterative Refinement", status: "in_progress", activeForm: "Refining report based on feedback"}, - {content: " → Iteration 2: Expanding Architecture section", status: "in_progress", activeForm: "Expanding section"} -] -``` - -## Usage Examples - -```bash -# Interactive analysis (prompts for all preferences) -/workflow:docs:analyze - -# Architecture report for specific directory -/workflow:docs:analyze src/core --type=architecture - -# Design patterns analysis -/workflow:docs:analyze --type=design - -# Comprehensive analysis of entire project -/workflow:docs:analyze --type=comprehensive - -# Key methods analysis for API module -/workflow:docs:analyze src/api --type=methods -``` - -## Output Structure - -``` -.workflow/.scratchpad/analyze-{timestamp}/ -├── analysis-config.json # User's analysis preferences -├── exploration-{angle}.json # Per-angle exploration results -├── explorations-manifest.json # Exploration 
summary -├── deep-analysis.json # Gemini CLI analysis output -├── diagrams/ # Generated Mermaid diagrams -│ ├── manifest.json # Diagram generation manifest -│ ├── validation.json # Syntax validation results -│ ├── architecture.mmd # System architecture (graph TD) -│ ├── layers.mmd # Layer structure (graph TD) -│ ├── dependencies.mmd # Module dependencies (graph LR) -│ ├── class-diagram.mmd # Class relationships (classDiagram) -│ ├── dataflow.mmd # Data flow (flowchart LR) -│ ├── algorithm-*.mmd # Algorithm flowcharts (flowchart TD) -│ ├── sequence-*.mmd # Call sequences (sequenceDiagram) -│ └── state.mmd # State transitions (stateDiagram-v2) -├── {TYPE}-REPORT.md # Final report (main output) -└── iterations/ # Refinement history - ├── v1-initial.md - ├── v2-expanded.md - └── ... -``` - -## Report Quality Standards - -**All reports must include**: -1. **Executive Summary**: 3-5 key takeaways -2. **Visual Diagrams**: Mermaid syntax for architecture/flow visualization -3. **Code References**: `file:line` format for easy navigation -4. **Actionable Insights**: Specific recommendations, not generic advice -5. **Consistent Depth**: Match user's selected depth level throughout - -**Quality Gates**: -- [ ] All focus areas addressed -- [ ] Diagrams render correctly (Mermaid syntax valid) -- [ ] Code references are accurate (files exist) -- [ ] No placeholder content -- [ ] Recommendations are project-specific - -## Error Handling - -| Error | Recovery | -|-------|----------| -| CLI timeout | Reduce scope, retry with fewer files | -| Exploration failure | Fall back to direct file reading | -| User abandons iteration | Save current progress, allow resume | -| Invalid scope path | Prompt user to correct path | - -## Related Commands - -**Prerequisite Commands**: -- None required (can be run independently) - -**Follow-up Commands**: -- `/workflow:plan` - Create implementation tasks from recommendations -- `/memory:docs` - Generate project documentation -- `/workflow:review` - Review specific areas identified in analysis - -**Alternative Commands**: -- `/memory:code-map-memory` - For feature-specific code mapping -- `/memory:load` - For quick context loading without reports - ---- - -## Mermaid Diagram Specifications - -### Diagram Types by Report - -| Report Type | Diagrams Generated | -|-------------|-------------------| -| `architecture` | architecture.mmd, layers.mmd, dependencies.mmd, dataflow.mmd | -| `design` | class-diagram.mmd, components.mmd, patterns.mmd, state.mmd | -| `methods` | algorithm-*.mmd, sequence-*.mmd, critical-path.mmd, api.mmd | -| `comprehensive` | architecture.mmd, class-diagram.mmd, sequence-*.mmd, dataflow.mmd | - -### Syntax Rules (Ensuring Valid Diagrams) - -```javascript -// 1. Node IDs: Only alphanumeric and underscore -sanitizeId("User-Service") → "User_Service" -sanitizeId("3rdParty") → "_3rdParty" - -// 2. Labels: Escape special characters with HTML entities -escapeLabel("Process(data)") → "Process#40;data#41;" -escapeLabel("Check {valid?}") → "Check #123;valid?#125;" - -// 3. Edge labels: Always use double quotes -// ✓ Correct -A -->|"calls(param)"| B - -// ✗ Wrong -A -->|calls(param)| B - -// 4. 
Subgraph titles: Use quoted labels -// ✓ Correct -subgraph ServiceLayer["Service Layer (Core)"] - -// ✗ Wrong -subgraph ServiceLayer[Service Layer (Core)] -``` - -### Diagram Format Examples - -**Architecture (graph TD)**: -```mermaid -graph TD - subgraph Presentation["Presentation Layer"] - API["API Gateway"] - CLI["CLI Interface"] - end - subgraph Business["Business Layer"] - SVC["Core Services"] - end - API --> SVC - CLI --> SVC -``` - -**Class Diagram (classDiagram)**: -```mermaid -classDiagram - class UserService { - +string name - +getUser(id) User - -validate() boolean - } - class Repository { - <> - +find(id) Entity - } - UserService --> Repository : uses -``` - -**Sequence Diagram (sequenceDiagram)**: -```mermaid -sequenceDiagram - participant C as Client - participant S as Server - participant D as Database - C->>S: request(data) - activate S - S->>D: query - D-->>S: result - S-->>C: response - deactivate S -``` - -**Flowchart (flowchart TD)**: -```mermaid -flowchart TD - START(["Start"]) - INPUT[/"Read input"/] - CHECK{"Valid?"} - PROCESS["Process data"] - OUTPUT[/"Write output"/] - END_(["End"]) - - START --> INPUT --> CHECK - CHECK -->|"Yes"| PROCESS --> OUTPUT --> END_ - CHECK -->|"No"| END_ -``` - -### Validation Checks - -The diagram generator validates: -- [ ] Diagram type declaration present -- [ ] No `undefined` values in output -- [ ] Special characters properly escaped -- [ ] No double arrow syntax errors -- [ ] Subgraph brackets balanced -- [ ] Node IDs are valid identifiers diff --git a/.claude/commands/workflow/docs/copyright.md b/.claude/commands/workflow/docs/copyright.md deleted file mode 100644 index d020090f..00000000 --- a/.claude/commands/workflow/docs/copyright.md +++ /dev/null @@ -1,1265 +0,0 @@ ---- -name: copyright -description: Multi-phase iterative software copyright documentation generator for CPCC compliance, produces design specification with Mermaid diagrams through code analysis -argument-hint: "[scope|path] [--name=软件名称] [--version=版本号]" -allowed-tools: Task(*), AskUserQuestion(*), Read(*), Bash(*), Glob(*), Grep(*), TodoWrite(*), Write(*) ---- - -# Workflow Copyright Command (/workflow:docs:copyright) - -## Overview - -**软件著作权设计说明书生成器** - 基于源代码深度分析,生成符合中国版权保护中心(CPCC)规范的无界面软件设计说明书。通过多阶段迭代和用户交互,确保文档内容准确、完整、合规。 - -**核心原则**: -- **基于代码分析**:所有内容严格来源于代码分析,杜绝臆测 -- **Mermaid 规范**:所有图表使用正确的 Mermaid 语法,特殊字符正确转义 -- **CPCC 合规**:严格遵循鉴别材料必备组成部分要求 -- **迭代优化**:通过多轮交互确保文档质量 - -## 文档结构(鉴别材料必备组成部分) - -| 章节 | 内容 | 图表类型 | -|------|------|----------| -| 1. 软件概述 | 背景、目标、运行环境、技术架构 | - | -| 2. 系统架构图 | 整体架构、模块关系、数据流 | `graph TD` | -| 3. 功能模块设计 | 功能分解、模块职责、交互关系 | `flowchart TD` | -| 4. 核心算法与流程 | 关键业务逻辑、算法实现、处理流程 | `flowchart TD` | -| 5. 数据结构设计 | 数据实体、关系、约束 | `classDiagram` | -| 6. 接口设计 | API 定义、参数、返回值 | `sequenceDiagram` | -| 7. 
异常处理设计 | 错误处理、异常流程、恢复机制 | `flowchart TD` | - -## Execution Process - -``` -Input Parsing: - ├─ Parse: scope/path (required, source code location) - ├─ Parse: --name (software name, will prompt if not provided) - └─ Parse: --version (version number, will prompt if not provided) - -Phase 1: Project Discovery & Metadata Collection - ├─ Ask: Software name and version (if not specified) - ├─ Ask: Software category and primary function - ├─ Explore: Project structure and tech stack - └─ Output: project-metadata.json - -Phase 2: Deep Code Analysis (Parallel Agents) - ├─ Agent 1: Architecture analysis (modules, layers, dependencies) - ├─ Agent 2: Function analysis (features, workflows, algorithms) - ├─ Agent 3: Data structure analysis (entities, relationships) - ├─ Agent 4: Interface analysis (APIs, protocols) - └─ Output: analysis-{dimension}.json files - -Phase 3: Mermaid Diagram Generation - ├─ Generate: System architecture diagram (graph TD) - ├─ Generate: Function module diagrams (flowchart TD) - ├─ Generate: Core algorithm flowcharts (flowchart TD) - ├─ Generate: Class diagrams (classDiagram) - ├─ Generate: Sequence diagrams (sequenceDiagram) - └─ Output: diagrams/*.mmd + validation - -Phase 4: Document Assembly - ├─ Synthesize: Merge all analysis results - ├─ Generate: 软件设计说明书.md (draft) - ├─ Validate: CPCC compliance check - └─ Output: Present document to user - -Phase 5: Iterative Refinement (Discovery-Driven) - ├─ Extract: Gaps and ambiguities from analysis - ├─ Ask: Targeted questions based on findings - ├─ Update: Incremental document modification - └─ Loop: Until user satisfied or no more discoveries - -Finalize: - └─ Output: Final 软件设计说明书.md -``` - -## 5-Phase Execution - -### Phase 1: Project Discovery & Metadata Collection - -**Purpose**: Collect essential metadata and understand project structure - -**Step 1.1: Initialize TodoWrite** - -```javascript -TodoWrite([ - {content: "Phase 1: Project Discovery", status: "in_progress", activeForm: "Discovering project structure"}, - {content: "Phase 2: Deep Code Analysis", status: "pending", activeForm: "Analyzing source code"}, - {content: "Phase 3: Diagram Generation", status: "pending", activeForm: "Generating Mermaid diagrams"}, - {content: "Phase 4: Document Assembly", status: "pending", activeForm: "Assembling design document"}, - {content: "Phase 5: Iterative Refinement", status: "pending", activeForm: "Refining based on feedback"} -]) -``` - -**Step 1.2: Collect Software Metadata (if not specified)** - -```javascript -if (!name_specified || !version_specified) { - AskUserQuestion({ - questions: [ - { - question: "请提供软件的正式名称(将显示在页眉和文档标题中)", - header: "软件名称", - multiSelect: false, - options: [ - {label: "使用项目名称", description: `自动检测: ${detected_project_name}`}, - {label: "自定义名称", description: "输入自定义的软件名称"} - ] - }, - { - question: "请提供软件版本号(格式如 V1.0.0)", - header: "版本号", - multiSelect: false, - options: [ - {label: "V1.0.0", description: "初始版本"}, - {label: "使用 package.json 版本", description: `自动检测: ${detected_version}`}, - {label: "自定义版本", description: "输入自定义版本号"} - ] - } - ] - }); -} -``` - -**Step 1.3: Software Category Selection** - -```javascript -AskUserQuestion({ - questions: [{ - question: "请选择软件的主要类型(影响文档描述风格)", - header: "软件类型", - multiSelect: false, - options: [ - {label: "命令行工具 (CLI)", description: "无图形界面的命令行应用程序"}, - {label: "后端服务/API", description: "Web API、微服务、后台服务"}, - {label: "SDK/库", description: "供其他程序调用的开发工具包"}, - {label: "数据处理系统", description: "ETL、数据分析、批处理系统"}, - {label: "自动化脚本", description: "运维自动化、构建工具、脚本集合"} - ] 
- }] -}); -``` - -**Step 1.4: Explore Project Structure** - -```javascript -Task({ - subagent_type: "cli-explore-agent", - run_in_background: false, - description: "Explore project structure", - prompt: ` -## Exploration Objective -分析项目结构,提取软件著作权文档所需的基础信息。 - -## Target Path -${scope_path} - -## Required Analysis -1. **技术栈识别**:编程语言、框架、主要依赖 -2. **目录结构**:源代码组织方式、模块划分 -3. **入口点识别**:主程序入口、命令行入口 -4. **配置文件**:环境配置、构建配置 -5. **文档线索**:README、注释中的功能描述 - -## Output -Write to: ${outputDir}/project-discovery.json -{ - "tech_stack": {...}, - "directory_structure": {...}, - "entry_points": [...], - "main_modules": [...], - "detected_features": [...] -} -` -}); -``` - -**Step 1.5: Store Project Metadata** - -```javascript -const projectMetadata = { - software_name: selected_name, - version: selected_version, - category: selected_category, - scope_path: scope_path, - tech_stack: discovery.tech_stack, - entry_points: discovery.entry_points, - main_modules: discovery.main_modules, - timestamp: new Date().toISOString() -}; - -Write(`${outputDir}/project-metadata.json`, JSON.stringify(projectMetadata, null, 2)); -``` - -**TodoWrite**: Mark Phase 1 completed, Phase 2 in_progress - ---- - -### Phase 2: Deep Code Analysis (Parallel Agents) - -**Purpose**: Multi-dimensional code analysis for comprehensive understanding - -**Step 2.1: Launch Parallel Analysis Agents** - -```javascript -const analysisAgents = [ - { - dimension: "architecture", - focus: "系统架构分析", - prompt: ` -## 分析目标 -分析项目的系统架构,为"系统架构图"章节提供数据。 - -## 分析内容 -1. **分层结构**:识别代码的分层(如 Controller/Service/Repository) -2. **模块边界**:各模块的职责范围和边界 -3. **依赖关系**:模块间的依赖方向和类型 -4. **核心组件**:系统的核心组件及其作用 -5. **数据流向**:数据在各层之间的流动路径 - -## 输出格式 -{ - "layers": [{name, components, responsibility}], - "modules": [{name, path, responsibility, dependencies}], - "data_flow": [{from, to, data_type, description}], - "core_components": [{name, type, responsibility}], - "mermaid_hints": {nodes: [], edges: []} -} -` - }, - { - dimension: "functions", - focus: "功能模块分析", - prompt: ` -## 分析目标 -分析项目的功能模块,为"功能模块设计"章节提供数据。 - -## 分析内容 -1. **功能清单**:从代码中提取的所有功能点 -2. **功能分组**:按业务逻辑分组的功能模块 -3. **功能层级**:功能的父子关系和层级结构 -4. **模块交互**:功能模块之间的调用关系 -5. **关键流程**:主要业务流程的步骤分解 - -## 输出格式 -{ - "feature_list": [{id, name, description, module, entry_file}], - "feature_groups": [{group_name, features: []}], - "feature_hierarchy": {root: {children: [...]}}, - "interactions": [{caller, callee, trigger, description}], - "key_workflows": [{name, steps: [], files_involved}] -} -` - }, - { - dimension: "algorithms", - focus: "核心算法分析", - prompt: ` -## 分析目标 -分析项目的核心算法和业务逻辑,为"核心算法与流程"章节提供数据。 - -## 分析内容 -1. **核心算法**:识别关键算法(排序、搜索、加密、业务规则等) -2. **复杂逻辑**:圈复杂度高的函数/方法 -3. **处理流程**:核心业务的处理步骤 -4. **输入输出**:算法的输入参数和输出结果 -5. **边界条件**:特殊情况的处理逻辑 - -## 输出格式 -{ - "algorithms": [{ - name, file, line, - description, - complexity, - inputs: [{name, type, description}], - outputs: [{name, type, description}], - steps: [{step_num, description, code_ref}] - }], - "complex_functions": [{name, file, cyclomatic_complexity, description}], - "flowchart_hints": [{algorithm_name, nodes: [], edges: []}] -} -` - }, - { - dimension: "data_structures", - focus: "数据结构分析", - prompt: ` -## 分析目标 -分析项目的数据结构,为"数据结构设计"章节提供数据。 - -## 分析内容 -1. **数据实体**:类、接口、类型定义 -2. **属性字段**:实体的属性及其类型 -3. **实体关系**:继承、组合、关联关系 -4. **约束规则**:数据验证、业务约束 -5. 
**枚举常量**:枚举类型和常量定义 - -## 输出格式 -{ - "entities": [{ - name, file, type, // class/interface/type - properties: [{name, type, visibility, description}], - methods: [{name, params, return_type, visibility}] - }], - "relationships": [{ - from, to, - type, // inheritance/composition/association/dependency - cardinality, description - }], - "enums": [{name, values: [{name, value, description}]}], - "class_diagram_hints": {classes: [], relationships: []} -} -` - }, - { - dimension: "interfaces", - focus: "接口设计分析", - prompt: ` -## 分析目标 -分析项目的接口设计,为"接口设计"章节提供数据。 - -## 分析内容 -1. **API 端点**:HTTP API、RPC 接口、CLI 命令 -2. **参数定义**:输入参数的名称、类型、约束 -3. **返回值**:输出结果的格式和类型 -4. **调用协议**:同步/异步、请求/响应模式 -5. **接口分组**:按功能或资源分组 - -## 输出格式 -{ - "apis": [{ - name, path, method, // GET/POST/CLI/RPC - description, - parameters: [{name, type, required, description}], - response: {type, schema, description}, - category - }], - "protocols": [{name, type, description}], - "sequence_hints": [{scenario, actors: [], messages: []}] -} -` - }, - { - dimension: "exceptions", - focus: "异常处理分析", - prompt: ` -## 分析目标 -分析项目的异常处理机制,为"异常处理设计"章节提供数据。 - -## 分析内容 -1. **异常类型**:自定义异常类、错误码定义 -2. **捕获策略**:try-catch 的使用模式 -3. **错误传播**:异常的传播和转换链 -4. **恢复机制**:重试、降级、回滚策略 -5. **日志记录**:错误日志的记录方式 - -## 输出格式 -{ - "exception_types": [{name, parent, code, message, file}], - "error_codes": [{code, message, severity, category}], - "handling_patterns": [{pattern, locations: [], description}], - "recovery_strategies": [{strategy, trigger, action, files}], - "logging_approach": {framework, levels, format} -} -` - } -]; - -// Launch all agents in parallel -const analysisTasks = analysisAgents.map(agent => - Task({ - subagent_type: "cli-explore-agent", - run_in_background: false, - description: agent.focus, - prompt: ` -${agent.prompt} - -## Context -- **Scope Path**: ${scope_path} -- **Tech Stack**: ${projectMetadata.tech_stack} -- **Main Modules**: ${projectMetadata.main_modules.join(', ')} - -## Output File -Write to: ${outputDir}/analysis-${agent.dimension}.json -` - }) -); -``` - -**Step 2.2: Validate Analysis Results** - -```javascript -// Verify all analysis files created -const requiredAnalyses = ['architecture', 'functions', 'algorithms', 'data_structures', 'interfaces', 'exceptions']; -for (const dimension of requiredAnalyses) { - const filePath = `${outputDir}/analysis-${dimension}.json`; - if (!file_exists(filePath)) { - throw new Error(`Missing analysis: ${dimension}`); - } -} -``` - -**TodoWrite**: Mark Phase 2 completed, Phase 3 in_progress - ---- - -### Phase 3: Mermaid Diagram Generation - -**Purpose**: Generate all required diagrams with correct Mermaid syntax - -**Step 3.1: System Architecture Diagram** - -```javascript -const archAnalysis = JSON.parse(Read(`${outputDir}/analysis-architecture.json`)); - -// Generate Mermaid graph TD -function generateArchitectureDiagram(analysis) { - let mermaid = 'graph TD\n'; - - // Add subgraphs for layers - for (const layer of analysis.layers) { - mermaid += ` subgraph ${sanitizeId(layer.name)}["${escapeLabel(layer.name)}"]\n`; - for (const comp of layer.components) { - mermaid += ` ${sanitizeId(comp)}["${escapeLabel(comp)}"]\n`; - } - mermaid += ' end\n'; - } - - // Add edges for data flow - for (const flow of analysis.data_flow) { - mermaid += ` ${sanitizeId(flow.from)} -->|"${escapeLabel(flow.description)}"| ${sanitizeId(flow.to)}\n`; - } - - return mermaid; -} - -// Helper: Escape special characters in labels -function escapeLabel(text) { - return text.replace(/["(){}[\]<>]/g, char => 
`#${char.charCodeAt(0)};`); -} - -const archDiagram = generateArchitectureDiagram(archAnalysis); -Write(`${outputDir}/diagrams/architecture.mmd`, archDiagram); -``` - -**Step 3.2: Function Module Diagrams** - -```javascript -const funcAnalysis = JSON.parse(Read(`${outputDir}/analysis-functions.json`)); - -function generateFunctionDiagram(analysis) { - let mermaid = 'flowchart TD\n'; - - // Root node - mermaid += ` ROOT["${escapeLabel(projectMetadata.software_name)}"]\n`; - - // Feature groups as subgraphs - for (const group of analysis.feature_groups) { - mermaid += ` subgraph ${sanitizeId(group.group_name)}["${escapeLabel(group.group_name)}"]\n`; - for (const feature of group.features) { - mermaid += ` ${sanitizeId(feature.id)}["${escapeLabel(feature.name)}"]\n`; - } - mermaid += ' end\n'; - mermaid += ` ROOT --> ${sanitizeId(group.group_name)}\n`; - } - - return mermaid; -} - -const funcDiagram = generateFunctionDiagram(funcAnalysis); -Write(`${outputDir}/diagrams/functions.mmd`, funcDiagram); -``` - -**Step 3.3: Algorithm Flowcharts** - -```javascript -const algoAnalysis = JSON.parse(Read(`${outputDir}/analysis-algorithms.json`)); - -function generateAlgorithmFlowchart(algorithm) { - let mermaid = 'flowchart TD\n'; - - // Start node - mermaid += ` START(["开始"])\n`; - - // Input nodes - for (const input of algorithm.inputs) { - mermaid += ` INPUT_${sanitizeId(input.name)}[/"${escapeLabel(input.name)}: ${escapeLabel(input.type)}"/]\n`; - } - - // Process steps - let prevNode = 'START'; - for (const step of algorithm.steps) { - const nodeId = `STEP_${step.step_num}`; - mermaid += ` ${nodeId}["${escapeLabel(step.description)}"]\n`; - mermaid += ` ${prevNode} --> ${nodeId}\n`; - prevNode = nodeId; - } - - // Output and end - mermaid += ` OUTPUT[/"输出结果"/]\n`; - mermaid += ` ${prevNode} --> OUTPUT\n`; - mermaid += ` END_(["结束"])\n`; - mermaid += ` OUTPUT --> END_\n`; - - return mermaid; -} - -// Generate flowchart for each core algorithm -for (const algo of algoAnalysis.algorithms.slice(0, 5)) { // Top 5 algorithms - const flowchart = generateAlgorithmFlowchart(algo); - Write(`${outputDir}/diagrams/algorithm-${sanitizeId(algo.name)}.mmd`, flowchart); -} -``` - -**Step 3.4: Class Diagrams** - -```javascript -const dataAnalysis = JSON.parse(Read(`${outputDir}/analysis-data_structures.json`)); - -function generateClassDiagram(analysis) { - let mermaid = 'classDiagram\n'; - - // Classes - for (const entity of analysis.entities) { - mermaid += ` class ${sanitizeId(entity.name)} {\n`; - - // Properties - for (const prop of entity.properties) { - const visibility = {public: '+', private: '-', protected: '#'}[prop.visibility] || '+'; - mermaid += ` ${visibility}${prop.type} ${prop.name}\n`; - } - - // Methods - for (const method of entity.methods) { - const visibility = {public: '+', private: '-', protected: '#'}[method.visibility] || '+'; - mermaid += ` ${visibility}${method.name}(${method.params}) ${method.return_type}\n`; - } - - mermaid += ' }\n'; - } - - // Relationships - for (const rel of analysis.relationships) { - const arrows = { - inheritance: '--|>', - composition: '*--', - aggregation: 'o--', - association: '-->', - dependency: '..>' - }; - const arrow = arrows[rel.type] || '-->'; - mermaid += ` ${sanitizeId(rel.from)} ${arrow} ${sanitizeId(rel.to)} : ${escapeLabel(rel.description || rel.type)}\n`; - } - - return mermaid; -} - -const classDiagram = generateClassDiagram(dataAnalysis); -Write(`${outputDir}/diagrams/class-diagram.mmd`, classDiagram); -``` - -**Step 3.5: Sequence Diagrams** - 
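The generator below consumes the `sequence_hints` entries produced by the interface analysis, writing one `sequence-{scenario}.mmd` file per entry. A minimal sketch of the scenario shape it expects, with field names inferred from how the generator reads them (illustrative only):

```javascript
// Hypothetical sequence_hints entry from analysis-interfaces.json
const exampleScenario = {
  scenario: "user_login",                  // used in the output file name
  actors: [
    { id: "Client",      name: "Client" },
    { id: "AuthService", name: "Auth Service" }
  ],
  messages: [
    { from: "Client",      to: "AuthService", type: "sync",  description: "submit credentials" },
    { from: "AuthService", to: "Client",      type: "async", description: "return access token" }
  ]
};
```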
-```javascript -const interfaceAnalysis = JSON.parse(Read(`${outputDir}/analysis-interfaces.json`)); - -function generateSequenceDiagram(scenario) { - let mermaid = 'sequenceDiagram\n'; - - // Participants - for (const actor of scenario.actors) { - mermaid += ` participant ${sanitizeId(actor.id)} as ${escapeLabel(actor.name)}\n`; - } - - // Messages - for (const msg of scenario.messages) { - const arrow = msg.type === 'async' ? '-)' : '->>'; - mermaid += ` ${sanitizeId(msg.from)}${arrow}${sanitizeId(msg.to)}: ${escapeLabel(msg.description)}\n`; - } - - return mermaid; -} - -// Generate sequence diagram for each key scenario -for (const scenario of interfaceAnalysis.sequence_hints || []) { - const seqDiagram = generateSequenceDiagram(scenario); - Write(`${outputDir}/diagrams/sequence-${sanitizeId(scenario.scenario)}.mmd`, seqDiagram); -} -``` - -**Step 3.6: Validate Mermaid Syntax** - -```javascript -// Validate all generated diagrams -const diagramFiles = Glob(`${outputDir}/diagrams/*.mmd`); -const validationResults = []; - -for (const file of diagramFiles) { - const content = Read(file); - const issues = validateMermaidSyntax(content); - validationResults.push({ - file: file, - valid: issues.length === 0, - issues: issues - }); -} - -Write(`${outputDir}/diagrams/validation-report.json`, JSON.stringify(validationResults, null, 2)); -``` - -**TodoWrite**: Mark Phase 3 completed, Phase 4 in_progress - ---- - -### Phase 4: Document Assembly - -**Purpose**: Assemble all analysis and diagrams into final document - -**Step 4.1: Load All Analysis Data** - -```javascript -const metadata = JSON.parse(Read(`${outputDir}/project-metadata.json`)); -const analyses = { - architecture: JSON.parse(Read(`${outputDir}/analysis-architecture.json`)), - functions: JSON.parse(Read(`${outputDir}/analysis-functions.json`)), - algorithms: JSON.parse(Read(`${outputDir}/analysis-algorithms.json`)), - data_structures: JSON.parse(Read(`${outputDir}/analysis-data_structures.json`)), - interfaces: JSON.parse(Read(`${outputDir}/analysis-interfaces.json`)), - exceptions: JSON.parse(Read(`${outputDir}/analysis-exceptions.json`)) -}; -const diagrams = loadAllDiagrams(`${outputDir}/diagrams/`); -``` - -**Step 4.2: Generate Document Header** - -```javascript -const header = ` - - - -# ${metadata.software_name} 软件设计说明书 - -**版本号**:${metadata.version} -**生成日期**:${new Date().toLocaleDateString('zh-CN')} -**文档性质**:软件著作权鉴别材料 - ---- - -`; -``` - -**Step 4.3: Generate Each Section** - -```javascript -function generateSection1_Overview(metadata, analyses) { - return ` -## 1. 软件概述 - -### 1.1 软件背景与用途 - -${metadata.software_name}是一款${metadata.category}软件,主要用于${inferPurpose(analyses)}。 - -本软件基于${metadata.tech_stack.language}语言开发,采用${metadata.tech_stack.framework || '自研架构'}实现核心功能。 - -### 1.2 开发目标与特点 - -**开发目标**: -${generateObjectives(analyses.functions)} - -**技术特点**: -${generateFeatures(analyses.architecture)} - -### 1.3 运行环境与技术架构 - -**运行环境**: -- 操作系统:${metadata.tech_stack.os || '跨平台'} -- 运行时:${metadata.tech_stack.runtime} -- 依赖环境:${metadata.tech_stack.dependencies?.join('、') || '无特殊依赖'} - -**技术架构**: -- 架构模式:${analyses.architecture.pattern || '模块化架构'} -- 核心框架:${metadata.tech_stack.framework || '无'} -- 主要技术栈:${metadata.tech_stack.main_technologies?.join('、') || metadata.tech_stack.language} - -`; -} - -function generateSection2_Architecture(analyses, diagrams) { - return ` -## 2. 
系统架构图 - -本章节展示${metadata.software_name}的整体系统架构,包括各主要模块之间的关系与核心数据流。 - -\`\`\`mermaid -${diagrams.architecture} -\`\`\` - -**图2-1 系统架构图** - -### 架构说明 - -${generateArchitectureDescription(analyses.architecture)} - -**核心模块说明**: - -${analyses.architecture.modules.map((m, i) => - `${i + 1}. **${m.name}**:${m.responsibility}` -).join('\n')} - -`; -} - -function generateSection3_FunctionModules(analyses, diagrams) { - return ` -## 3. 功能模块设计 - -本章节详细描述各功能模块的设计与职责划分。 - -\`\`\`mermaid -${diagrams.functions} -\`\`\` - -**图3-1 功能模块结构图** - -### 3.1 功能模块列表 - -${analyses.functions.feature_groups.map((group, gi) => ` -#### 3.1.${gi + 1} ${group.group_name} - -${group.features.map((f, fi) => - `- **${f.name}**:${f.description}` -).join('\n')} -`).join('\n')} - -### 3.2 模块交互关系 - -${generateInteractionDescription(analyses.functions.interactions)} - -`; -} - -function generateSection4_Algorithms(analyses, diagrams) { - return ` -## 4. 核心算法与流程 - -本章节描述软件中的核心算法实现与关键业务处理流程。 - -${analyses.algorithms.algorithms.slice(0, 5).map((algo, i) => ` -### 4.${i + 1} ${algo.name} - -**功能描述**:${algo.description} - -**输入参数**: -${algo.inputs.map(input => `- \`${input.name}\`(${input.type}):${input.description}`).join('\n')} - -**输出结果**: -${algo.outputs.map(output => `- \`${output.name}\`(${output.type}):${output.description}`).join('\n')} - -**处理流程**: - -\`\`\`mermaid -${diagrams[`algorithm-${sanitizeId(algo.name)}`] || generateSimpleFlowchart(algo)} -\`\`\` - -**图4-${i + 1} ${algo.name}流程图** - -**算法步骤说明**: - -${algo.steps.map((step, si) => - `${si + 1}. ${step.description}` -).join('\n')} - -`).join('\n')} -`; -} - -function generateSection5_DataStructures(analyses, diagrams) { - return ` -## 5. 数据结构设计 - -本章节描述软件中使用的主要数据实体及其关系。 - -\`\`\`mermaid -${diagrams['class-diagram']} -\`\`\` - -**图5-1 数据结构类图** - -### 5.1 主要数据实体 - -${analyses.data_structures.entities.map((entity, i) => ` -#### 5.1.${i + 1} ${entity.name} - -**类型**:${entity.type} -**文件位置**:\`${entity.file}\` - -**属性列表**: - -| 属性名 | 类型 | 可见性 | 说明 | -|--------|------|--------|------| -${entity.properties.map(p => - `| ${p.name} | ${p.type} | ${p.visibility} | ${p.description || '-'} |` -).join('\n')} - -`).join('\n')} - -### 5.2 实体关系说明 - -${analyses.data_structures.relationships.map((rel, i) => - `${i + 1}. **${rel.from}** ${rel.type} **${rel.to}**:${rel.description || '关联关系'}` -).join('\n')} - -`; -} - -function generateSection6_Interfaces(analyses, diagrams) { - return ` -## 6. 接口设计 - -本章节描述软件对外提供的接口定义。 - -${diagrams.sequence ? ` -\`\`\`mermaid -${diagrams.sequence} -\`\`\` - -**图6-1 接口调用时序图** -` : ''} - -### 6.1 接口列表 - -${analyses.interfaces.apis.map((api, i) => ` -#### 6.1.${i + 1} ${api.name} - -- **路径/命令**:\`${api.path}\` -- **方法类型**:${api.method} -- **功能描述**:${api.description} - -**请求参数**: - -| 参数名 | 类型 | 必填 | 说明 | -|--------|------|------|------| -${api.parameters.map(p => - `| ${p.name} | ${p.type} | ${p.required ? '是' : '否'} | ${p.description} |` -).join('\n')} - -**返回结果**: -- 类型:${api.response.type} -- 说明:${api.response.description} - -`).join('\n')} -`; -} - -function generateSection7_Exceptions(analyses) { - return ` -## 7. 
异常处理设计 - -本章节描述软件的异常处理机制与错误恢复策略。 - -### 7.1 异常类型定义 - -${analyses.exceptions.exception_types.map((ex, i) => ` -#### 7.1.${i + 1} ${ex.name} - -- **错误码**:\`${ex.code || 'N/A'}\` -- **错误信息**:${ex.message} -- **父类**:${ex.parent || '基础异常类'} -- **定义位置**:\`${ex.file}\` -`).join('\n')} - -### 7.2 错误码说明 - -| 错误码 | 错误信息 | 严重程度 | 类别 | -|--------|----------|----------|------| -${analyses.exceptions.error_codes.map(e => - `| ${e.code} | ${e.message} | ${e.severity} | ${e.category} |` -).join('\n')} - -### 7.3 异常处理策略 - -${analyses.exceptions.handling_patterns.map((pattern, i) => ` -**${i + 1}. ${pattern.pattern}** - -${pattern.description} - -应用位置:${pattern.locations.join('、')} -`).join('\n')} - -### 7.4 恢复机制 - -${analyses.exceptions.recovery_strategies.map((strategy, i) => - `${i + 1}. **${strategy.strategy}**:${strategy.action}(触发条件:${strategy.trigger})` -).join('\n')} - -`; -} -``` - -**Step 4.4: Assemble Final Document** - -```javascript -const documentContent = [ - header, - generateSection1_Overview(metadata, analyses), - generateSection2_Architecture(analyses, diagrams), - generateSection3_FunctionModules(analyses, diagrams), - generateSection4_Algorithms(analyses, diagrams), - generateSection5_DataStructures(analyses, diagrams), - generateSection6_Interfaces(analyses, diagrams), - generateSection7_Exceptions(analyses) -].join('\n'); - -const documentPath = `${outputDir}/${metadata.software_name}-软件设计说明书.md`; -Write(documentPath, documentContent); -``` - -**Step 4.5: CPCC Compliance Check** - -```javascript -function validateCPCCCompliance(document, analyses) { - const checks = [ - {name: "软件概述完整性", pass: document.includes("## 1. 软件概述")}, - {name: "系统架构图存在", pass: document.includes("图2-1 系统架构图")}, - {name: "功能模块设计完整", pass: document.includes("## 3. 功能模块设计")}, - {name: "核心算法描述", pass: document.includes("## 4. 核心算法与流程")}, - {name: "数据结构设计", pass: document.includes("## 5. 数据结构设计")}, - {name: "接口设计说明", pass: document.includes("## 6. 接口设计")}, - {name: "异常处理设计", pass: document.includes("## 7. 
异常处理设计")}, - {name: "Mermaid图表语法", pass: !document.includes("mermaid error")}, - {name: "页眉信息", pass: document.includes("页眉")}, - {name: "页码说明", pass: document.includes("页码")} - ]; - - return { - passed: checks.filter(c => c.pass).length, - total: checks.length, - details: checks - }; -} - -const compliance = validateCPCCCompliance(documentContent, analyses); -console.log(`CPCC 合规检查: ${compliance.passed}/${compliance.total} 项通过`); -``` - -**TodoWrite**: Mark Phase 4 completed, Phase 5 in_progress - ---- - -### Phase 5: Iterative Refinement (Discovery-Driven) - -**Purpose**: Refine document based on analysis gaps and user feedback - -**Step 5.1: Extract Document Gaps** - -```javascript -function extractDocumentGaps(analyses, document) { - return { - // Missing descriptions - missingDescriptions: analyses.functions.features - .filter(f => !f.description || f.description.length < 20), - - // Low confidence analyses - lowConfidenceAreas: Object.entries(analyses) - .flatMap(([dim, data]) => - (data.confidence_scores || []).filter(s => s.score < 0.7) - ), - - // Complex areas needing clarification - complexAreas: analyses.algorithms.algorithms - .filter(a => a.complexity === 'high' && a.steps.length < 3), - - // Incomplete relationships - incompleteRelationships: analyses.data_structures.relationships - .filter(r => !r.description), - - // Missing exception handling - missingExceptionHandling: analyses.exceptions.exception_types - .filter(e => !e.message || e.message === 'Unknown') - }; -} - -const gaps = extractDocumentGaps(analyses, documentContent); -``` - -**Step 5.2: Generate Targeted Questions** - -```javascript -function buildRefinementQuestions(gaps, metadata) { - const questions = []; - - // Question about missing feature descriptions - if (gaps.missingDescriptions.length > 0) { - questions.push({ - question: `以下功能缺少详细描述,请选择需要补充说明的功能:`, - header: "功能描述", - multiSelect: true, - options: gaps.missingDescriptions.slice(0, 4).map(f => ({ - label: f.name, - description: `文件: ${f.entry_file} - 当前描述不完整` - })) - }); - } - - // Question about complex algorithms - if (gaps.complexAreas.length > 0) { - questions.push({ - question: `发现 ${gaps.complexAreas.length} 个复杂算法的流程描述不够详细,是否需要深入分析?`, - header: "算法详解", - multiSelect: false, - options: [ - {label: "是,详细分析所有", description: "为所有复杂算法生成详细流程图"}, - {label: "选择性分析", description: "仅分析最核心的算法"}, - {label: "保持现状", description: "当前描述已足够"} - ] - }); - } - - // Question about data relationships - if (gaps.incompleteRelationships.length > 0) { - questions.push({ - question: `发现 ${gaps.incompleteRelationships.length} 个数据实体关系缺少说明,是否补充?`, - header: "数据关系", - multiSelect: false, - options: [ - {label: "自动推断并补充", description: "基于代码分析自动补充关系说明"}, - {label: "跳过", description: "保持当前状态"} - ] - }); - } - - // Final action question - questions.push({ - question: "如何处理当前文档?", - header: "操作", - multiSelect: false, - options: [ - {label: "继续优化", description: "应用上述选择并继续检查"}, - {label: "完成文档", description: "当前文档已满足需求,生成最终版本"}, - {label: "调整范围", description: "修改分析范围或重点"} - ] - }); - - return questions.slice(0, 4); -} - -const refinementQuestions = buildRefinementQuestions(gaps, metadata); -AskUserQuestion({questions: refinementQuestions}); -``` - -**Step 5.3: Apply Refinements** - -```javascript -async function applyRefinements(responses, gaps, outputDir) { - const updates = []; - - if (responses.功能描述) { - // Re-analyze selected features with deeper exploration - for (const feature of responses.功能描述) { - const deepAnalysis = await Task({ - subagent_type: "cli-explore-agent", - 
prompt: `深入分析功能 ${feature.name},提供详细描述...` - }); - updates.push({type: 'feature_description', data: deepAnalysis}); - } - } - - if (responses.算法详解 === "是,详细分析所有") { - // Generate detailed flowcharts for complex algorithms - for (const algo of gaps.complexAreas) { - const detailedFlow = await analyzeAlgorithmInDepth(algo); - updates.push({type: 'algorithm_flowchart', data: detailedFlow}); - } - } - - if (responses.数据关系 === "自动推断并补充") { - // Infer relationship descriptions from code - const inferences = await inferRelationshipDescriptions(gaps.incompleteRelationships); - updates.push({type: 'relationship_descriptions', data: inferences}); - } - - return updates; -} - -const updates = await applyRefinements(user_responses, gaps, outputDir); -``` - -**Step 5.4: Regenerate Affected Sections** - -```javascript -// Update document with refinements -for (const update of updates) { - switch (update.type) { - case 'feature_description': - analyses.functions = mergeFeatureDescriptions(analyses.functions, update.data); - break; - case 'algorithm_flowchart': - diagrams[`algorithm-${update.data.name}`] = update.data.flowchart; - break; - case 'relationship_descriptions': - analyses.data_structures.relationships = update.data; - break; - } -} - -// Regenerate document -const updatedDocument = regenerateDocument(metadata, analyses, diagrams); -Write(documentPath, updatedDocument); -Write(`${outputDir}/iterations/v${iteration_count}.md`, documentContent); // Archive -``` - -**Step 5.5: Loop or Finalize** - -```javascript -if (user_responses.操作 === "完成文档") { - finalized = true; -} else if (user_responses.操作 === "调整范围") { - goto Phase1_Step1_3; -} else { - iteration_count++; - if (iteration_count > 5) { - console.log("已达到最大迭代次数,建议完成文档"); - } - goto Step5.1; -} -``` - ---- - -### Finalization - -**Step 6.1: Generate Final Document** - -```javascript -const finalDocument = ` -${header} - -${documentContent} - ---- - -**文档生成信息** - -- 生成工具:/workflow:docs:copyright -- 分析范围:${metadata.scope_path} -- 迭代次数:${iteration_count} -- 生成时间:${new Date().toISOString()} - ---- - -*本文档基于源代码自动分析生成,所有描述均来源于代码实现。* -`; - -Write(documentPath, finalDocument); -``` - -**Step 6.2: Output Summary** - -```markdown -## 软件著作权设计说明书生成完成 - -**软件名称**:${metadata.software_name} -**版本号**:${metadata.version} -**文档路径**:${documentPath} - -### 文档统计 -- 总章节数:7 -- Mermaid 图表数:${Object.keys(diagrams).length} -- 功能模块数:${analyses.functions.feature_list.length} -- 数据实体数:${analyses.data_structures.entities.length} -- 接口数量:${analyses.interfaces.apis.length} - -### CPCC 合规检查 -${compliance.details.map(c => `- [${c.pass ? '✓' : '✗'}] ${c.name}`).join('\n')} - -### 后续步骤 -1. 检查生成的 Mermaid 图表是否正确渲染 -2. 补充任何需要人工说明的细节 -3. 将文档转换为 PDF 格式(A4 纵向) -4. 确认页眉、页码格式符合要求 -``` - -**TodoWrite**: Mark all phases completed - ---- - -## Mermaid 语法规范 - -**关键规则**(确保图表正确渲染): - -```javascript -// 1. 特殊字符转义 - 使用双引号包裹 -// 正确 -A["处理(数据)"] --> B["验证{结果}"] - -// 错误 -A[处理(数据)] --> B[验证{结果}] - -// 2. 边标签转义 -// 正确 -A -->|"调用(参数)"| B - -// 错误 -A -->|调用(参数)| B - -// 3. 节点 ID 规范 - 仅使用字母数字下划线 -// 正确 -UserService --> AuthModule - -// 错误 -User-Service --> Auth.Module - -// 4. 
子图标题 -// 正确 -subgraph ServiceLayer["服务层(核心)"] - -// 错误 -subgraph ServiceLayer[服务层(核心)] -``` - -## Usage Examples - -```bash -# 交互式生成(引导输入所有信息) -/workflow:docs:copyright src/ - -# 指定软件名称和版本 -/workflow:docs:copyright src/ --name="智能数据分析系统" --version="V1.0.0" - -# 分析特定模块 -/workflow:docs:copyright src/core --name="核心处理引擎" --version="V2.1.0" -``` - -## Output Structure - -``` -.workflow/.scratchpad/copyright-{timestamp}/ -├── project-metadata.json # 软件元数据 -├── project-discovery.json # 项目发现结果 -├── analysis-architecture.json # 架构分析 -├── analysis-functions.json # 功能分析 -├── analysis-algorithms.json # 算法分析 -├── analysis-data_structures.json # 数据结构分析 -├── analysis-interfaces.json # 接口分析 -├── analysis-exceptions.json # 异常处理分析 -├── diagrams/ # Mermaid 图表 -│ ├── architecture.mmd -│ ├── functions.mmd -│ ├── class-diagram.mmd -│ ├── algorithm-*.mmd -│ └── sequence-*.mmd -├── iterations/ # 迭代版本 -│ ├── v1.md -│ └── ... -└── {软件名称}-软件设计说明书.md # 最终文档 -``` - -## Related Commands - -**Follow-up Commands**: -- `/workflow:analyze` - 更深入的项目分析 -- `/memory:docs` - 生成技术文档 - -**Alternative Commands**: -- `/workflow:docs:copyright --lite` - 简化版(仅生成必要章节) diff --git a/ccw/src/core/routes/codexlens-routes.ts b/ccw/src/core/routes/codexlens-routes.ts index d844ee00..bb293f38 100644 --- a/ccw/src/core/routes/codexlens-routes.ts +++ b/ccw/src/core/routes/codexlens-routes.ts @@ -17,6 +17,16 @@ import { isIndexingInProgress } from '../../tools/codex-lens.js'; import type { ProgressInfo, GpuMode } from '../../tools/codex-lens.js'; +import { loadLiteLLMApiConfig } from '../../config/litellm-api-config-manager.js'; + +// File watcher state (persisted across requests) +let watcherProcess: any = null; +let watcherStats = { + running: false, + root_path: '', + events_processed: 0, + start_time: null as Date | null +}; export interface RouteContext { pathname: string; @@ -1052,5 +1062,478 @@ export async function handleCodexLensRoutes(ctx: RouteContext): Promise return true; } + // ============================================================ + // RERANKER CONFIGURATION ENDPOINTS + // ============================================================ + + // API: Get Reranker Configuration + if (pathname === '/api/codexlens/reranker/config' && req.method === 'GET') { + try { + const venvStatus = await checkVenvStatus(); + + // Default reranker config + const rerankerConfig = { + backend: 'onnx', + model_name: 'cross-encoder/ms-marco-MiniLM-L-6-v2', + api_provider: 'siliconflow', + api_key_set: false, + available_backends: ['onnx', 'api', 'litellm', 'legacy'], + api_providers: ['siliconflow', 'cohere', 'jina'], + litellm_endpoints: [] as string[], + config_source: 'default' + }; + + // Load LiteLLM endpoints for dropdown + try { + const litellmConfig = loadLiteLLMApiConfig(initialPath); + if (litellmConfig.endpoints && Array.isArray(litellmConfig.endpoints)) { + rerankerConfig.litellm_endpoints = litellmConfig.endpoints.map( + (ep: any) => ep.alias || ep.name || ep.baseUrl + ).filter(Boolean); + } + } catch (e) { + // LiteLLM config not available, continue with empty endpoints + } + + // If CodexLens is installed, try to get actual config + if (venvStatus.ready) { + try { + const result = await executeCodexLens(['config', '--json']); + if (result.success) { + const config = extractJSON(result.output); + if (config.success && config.result) { + // Map config values + if (config.result.reranker_backend) { + rerankerConfig.backend = config.result.reranker_backend; + rerankerConfig.config_source = 'codexlens'; + } + if 
(config.result.reranker_model) { + rerankerConfig.model_name = config.result.reranker_model; + } + if (config.result.reranker_api_provider) { + rerankerConfig.api_provider = config.result.reranker_api_provider; + } + // Check if API key is set (from env) + if (process.env.RERANKER_API_KEY) { + rerankerConfig.api_key_set = true; + } + } + } + } catch (e) { + console.error('[CodexLens] Failed to get reranker config:', e); + } + } + + res.writeHead(200, { 'Content-Type': 'application/json' }); + res.end(JSON.stringify({ success: true, ...rerankerConfig })); + } catch (err) { + res.writeHead(500, { 'Content-Type': 'application/json' }); + res.end(JSON.stringify({ success: false, error: err.message })); + } + return true; + } + + // API: Set Reranker Configuration + if (pathname === '/api/codexlens/reranker/config' && req.method === 'POST') { + handlePostRequest(req, res, async (body) => { + const { backend, model_name, api_provider, api_key, litellm_endpoint } = body; + + // Validate backend + const validBackends = ['onnx', 'api', 'litellm', 'legacy']; + if (backend && !validBackends.includes(backend)) { + return { success: false, error: `Invalid backend: ${backend}. Valid options: ${validBackends.join(', ')}`, status: 400 }; + } + + // Validate api_provider + const validProviders = ['siliconflow', 'cohere', 'jina']; + if (api_provider && !validProviders.includes(api_provider)) { + return { success: false, error: `Invalid api_provider: ${api_provider}. Valid options: ${validProviders.join(', ')}`, status: 400 }; + } + + try { + const updates: string[] = []; + + // Set backend + if (backend) { + const result = await executeCodexLens(['config', 'set', 'reranker_backend', backend, '--json']); + if (result.success) updates.push('backend'); + } + + // Set model + if (model_name) { + const result = await executeCodexLens(['config', 'set', 'reranker_model', model_name, '--json']); + if (result.success) updates.push('model_name'); + } + + // Set API provider + if (api_provider) { + const result = await executeCodexLens(['config', 'set', 'reranker_api_provider', api_provider, '--json']); + if (result.success) updates.push('api_provider'); + } + + // Set LiteLLM endpoint + if (litellm_endpoint) { + const result = await executeCodexLens(['config', 'set', 'reranker_litellm_endpoint', litellm_endpoint, '--json']); + if (result.success) updates.push('litellm_endpoint'); + } + + // Handle API key - write to .env file or environment + if (api_key) { + // For security, we store in process.env for the current session + // In production, this should be written to a secure .env file + process.env.RERANKER_API_KEY = api_key; + updates.push('api_key'); + } + + return { + success: true, + message: `Updated: ${updates.join(', ')}`, + updated_fields: updates + }; + } catch (err) { + return { success: false, error: err.message, status: 500 }; + } + }); + return true; + } + + // ============================================================ + // FILE WATCHER CONTROL ENDPOINTS + // ============================================================ + + // API: Get File Watcher Status + if (pathname === '/api/codexlens/watch/status') { + res.writeHead(200, { 'Content-Type': 'application/json' }); + res.end(JSON.stringify({ + success: true, + running: watcherStats.running, + root_path: watcherStats.root_path, + events_processed: watcherStats.events_processed, + start_time: watcherStats.start_time?.toISOString() || null, + uptime_seconds: watcherStats.start_time + ? 
Math.floor((Date.now() - watcherStats.start_time.getTime()) / 1000) + : 0 + })); + return true; + } + + // API: Start File Watcher + if (pathname === '/api/codexlens/watch/start' && req.method === 'POST') { + handlePostRequest(req, res, async (body) => { + const { path: watchPath, debounce_ms = 1000 } = body; + const targetPath = watchPath || initialPath; + + if (watcherStats.running) { + return { success: false, error: 'Watcher already running', status: 400 }; + } + + try { + const { spawn } = await import('child_process'); + const { join } = await import('path'); + const { existsSync, statSync } = await import('fs'); + + // Validate path exists and is a directory + if (!existsSync(targetPath)) { + return { success: false, error: `Path does not exist: ${targetPath}`, status: 400 }; + } + const pathStat = statSync(targetPath); + if (!pathStat.isDirectory()) { + return { success: false, error: `Path is not a directory: ${targetPath}`, status: 400 }; + } + + // Get the codexlens CLI path + const venvStatus = await checkVenvStatus(); + if (!venvStatus.ready) { + return { success: false, error: 'CodexLens not installed', status: 400 }; + } + + // Spawn watch process (no shell: true for security) + // Use process.platform to determine if we need .cmd extension on Windows + const isWindows = process.platform === 'win32'; + const codexlensCmd = isWindows ? 'codexlens.exe' : 'codexlens'; + const args = ['watch', targetPath, '--debounce', String(debounce_ms)]; + watcherProcess = spawn(codexlensCmd, args, { + cwd: targetPath, + stdio: ['ignore', 'pipe', 'pipe'], + env: { ...process.env } + }); + + watcherStats = { + running: true, + root_path: targetPath, + events_processed: 0, + start_time: new Date() + }; + + // Handle process output for event counting + if (watcherProcess.stdout) { + watcherProcess.stdout.on('data', (data: Buffer) => { + const output = data.toString(); + // Count processed events from output + const matches = output.match(/Processed \d+ events?/g); + if (matches) { + watcherStats.events_processed += matches.length; + } + }); + } + + // Handle process exit + watcherProcess.on('exit', (code: number) => { + watcherStats.running = false; + watcherProcess = null; + console.log(`[CodexLens] Watcher exited with code ${code}`); + }); + + // Broadcast watcher started + broadcastToClients({ + type: 'CODEXLENS_WATCHER_STATUS', + payload: { running: true, path: targetPath } + }); + + return { + success: true, + message: 'Watcher started', + path: targetPath, + pid: watcherProcess.pid + }; + } catch (err) { + return { success: false, error: err.message, status: 500 }; + } + }); + return true; + } + + // API: Stop File Watcher + if (pathname === '/api/codexlens/watch/stop' && req.method === 'POST') { + handlePostRequest(req, res, async () => { + if (!watcherStats.running || !watcherProcess) { + return { success: false, error: 'Watcher not running', status: 400 }; + } + + try { + // Send SIGTERM to gracefully stop the watcher + watcherProcess.kill('SIGTERM'); + + // Wait a moment for graceful shutdown + await new Promise(resolve => setTimeout(resolve, 500)); + + // Force kill if still running + if (watcherProcess && !watcherProcess.killed) { + watcherProcess.kill('SIGKILL'); + } + + const finalStats = { + events_processed: watcherStats.events_processed, + uptime_seconds: watcherStats.start_time + ? 
Math.floor((Date.now() - watcherStats.start_time.getTime()) / 1000) + : 0 + }; + + watcherStats = { + running: false, + root_path: '', + events_processed: 0, + start_time: null + }; + watcherProcess = null; + + // Broadcast watcher stopped + broadcastToClients({ + type: 'CODEXLENS_WATCHER_STATUS', + payload: { running: false } + }); + + return { + success: true, + message: 'Watcher stopped', + ...finalStats + }; + } catch (err) { + return { success: false, error: err.message, status: 500 }; + } + }); + return true; + } + + + // ============================================================ + // SPLADE ENDPOINTS + // ============================================================ + + // API: SPLADE Status - Check if SPLADE is available and installed + if (pathname === '/api/codexlens/splade/status') { + try { + // Check if CodexLens is installed first + const venvStatus = await checkVenvStatus(); + if (!venvStatus.ready) { + res.writeHead(200, { 'Content-Type': 'application/json' }); + res.end(JSON.stringify({ + available: false, + installed: false, + model: 'naver/splade-cocondenser-ensembledistil', + error: 'CodexLens not installed' + })); + return true; + } + + // Check SPLADE availability using Python check + const result = await executeCodexLens(['python', '-c', + 'from codexlens.semantic.splade_encoder import check_splade_available; ok, err = check_splade_available(); print("OK" if ok else err)' + ]); + + const available = result.output.includes('OK'); + + res.writeHead(200, { 'Content-Type': 'application/json' }); + res.end(JSON.stringify({ + available, + installed: available, + model: 'naver/splade-cocondenser-ensembledistil', + error: available ? null : result.output.trim() + })); + } catch (err) { + res.writeHead(200, { 'Content-Type': 'application/json' }); + res.end(JSON.stringify({ + available: false, + installed: false, + model: 'naver/splade-cocondenser-ensembledistil', + error: err.message + })); + } + return true; + } + + // API: SPLADE Install - Install SPLADE dependencies + if (pathname === '/api/codexlens/splade/install' && req.method === 'POST') { + handlePostRequest(req, res, async (body) => { + try { + const gpu = body?.gpu || false; + const packageName = gpu ? 'codex-lens[splade-gpu]' : 'codex-lens[splade]'; + + // Use pip to install the SPLADE extras + const { spawn } = await import('child_process'); + const { promisify } = await import('util'); + const execFilePromise = promisify(require('child_process').execFile); + + const result = await execFilePromise('pip', ['install', packageName], { + timeout: 600000 // 10 minutes + }); + + return { + success: true, + message: `SPLADE installed successfully (${gpu ? 
'GPU' : 'CPU'} mode)`, + output: result.stdout + }; + } catch (err) { + return { + success: false, + error: err.message, + stderr: err.stderr, + status: 500 + }; + } + }); + return true; + } + + // API: SPLADE Index Status - Check if SPLADE index exists for a project + if (pathname === '/api/codexlens/splade/index-status') { + try { + const projectPath = url.searchParams.get('path'); + if (!projectPath) { + res.writeHead(400, { 'Content-Type': 'application/json' }); + res.end(JSON.stringify({ success: false, error: 'Missing path parameter' })); + return true; + } + + // Check if CodexLens is installed first + const venvStatus = await checkVenvStatus(); + if (!venvStatus.ready) { + res.writeHead(200, { 'Content-Type': 'application/json' }); + res.end(JSON.stringify({ exists: false, error: 'CodexLens not installed' })); + return true; + } + + const { join } = await import('path'); + const indexDb = join(projectPath, '.codexlens', '_index.db'); + + // Use Python to check SPLADE index status + const pythonCode = ` +from codexlens.storage.splade_index import SpladeIndex +from pathlib import Path +try: + idx = SpladeIndex(Path("${indexDb.replace(/\\/g, '\\\\')}")) + if idx.has_index(): + stats = idx.get_stats() + meta = idx.get_metadata() + model = meta.get('model_name', '') if meta else '' + print(f"OK|{stats['unique_chunks']}|{stats['total_postings']}|{model}") + else: + print("NO_INDEX") +except Exception as e: + print(f"ERROR|{str(e)}") +`; + + const result = await executeCodexLens(['python', '-c', pythonCode]); + + if (result.output.startsWith('OK|')) { + const parts = result.output.trim().split('|'); + res.writeHead(200, { 'Content-Type': 'application/json' }); + res.end(JSON.stringify({ + exists: true, + chunks: parseInt(parts[1]), + postings: parseInt(parts[2]), + model: parts[3] + })); + } else if (result.output.startsWith('ERROR|')) { + const errorMsg = result.output.substring(6).trim(); + res.writeHead(200, { 'Content-Type': 'application/json' }); + res.end(JSON.stringify({ exists: false, error: errorMsg })); + } else { + res.writeHead(200, { 'Content-Type': 'application/json' }); + res.end(JSON.stringify({ exists: false })); + } + } catch (err) { + res.writeHead(200, { 'Content-Type': 'application/json' }); + res.end(JSON.stringify({ exists: false, error: err.message })); + } + return true; + } + + // API: SPLADE Index Rebuild - Rebuild SPLADE index for a project + if (pathname === '/api/codexlens/splade/rebuild' && req.method === 'POST') { + handlePostRequest(req, res, async (body) => { + const { path: projectPath } = body; + + if (!projectPath) { + return { success: false, error: 'Missing path parameter', status: 400 }; + } + + try { + const result = await executeCodexLens(['splade-index', projectPath, '--rebuild'], { + cwd: projectPath, + timeout: 1800000 // 30 minutes for large codebases + }); + + if (result.success) { + return { + success: true, + message: 'SPLADE index rebuilt successfully', + output: result.output + }; + } else { + return { + success: false, + error: result.error || 'Failed to rebuild SPLADE index', + output: result.output, + status: 500 + }; + } + } catch (err) { + return { success: false, error: err.message, status: 500 }; + } + }); + return true; + } + return false; } diff --git a/ccw/src/templates/dashboard-js/i18n.js b/ccw/src/templates/dashboard-js/i18n.js index ef720a1a..71a7ff9d 100644 --- a/ccw/src/templates/dashboard-js/i18n.js +++ b/ccw/src/templates/dashboard-js/i18n.js @@ -366,6 +366,25 @@ const i18n = { 'codexlens.depsInstalled': 'Dependencies 
installed successfully', 'codexlens.depsInstallFailed': 'Failed to install dependencies', + // SPLADE Dependencies + 'codexlens.spladeDeps': 'SPLADE Sparse Retrieval', + 'codexlens.spladeInstalled': 'SPLADE Available', + 'codexlens.spladeNotInstalled': 'SPLADE Not Installed', + 'codexlens.spladeInstallHint': 'Install for improved synonym matching in code search', + 'codexlens.installingSpladePackage': 'Installing SPLADE package', + 'codexlens.spladeInstallSuccess': 'SPLADE installed successfully', + 'codexlens.spladeInstallFailed': 'SPLADE installation failed', + 'codexlens.spladeModel': 'Model', + 'codexlens.spladeIndexStatus': 'SPLADE Index', + 'codexlens.spladeIndexExists': 'Index available', + 'codexlens.spladeIndexMissing': 'No SPLADE index', + 'codexlens.spladeRebuild': 'Rebuild SPLADE Index', + 'codexlens.spladeRebuilding': 'Rebuilding SPLADE index...', + 'codexlens.spladeRebuildSuccess': 'SPLADE index rebuilt', + 'codexlens.spladeRebuildFailed': 'SPLADE index rebuild failed', + 'codexlens.spladeChunks': 'Chunks', + 'codexlens.spladePostings': 'Postings', + // GPU Mode Selection 'codexlens.selectGpuMode': 'Select acceleration mode', 'codexlens.cpuModeDesc': 'Standard CPU processing', @@ -2288,6 +2307,25 @@ const i18n = { 'codexlens.depsInstalled': '依赖安装成功', 'codexlens.depsInstallFailed': '依赖安装失败', + // SPLADE 依赖 + 'codexlens.spladeDeps': 'SPLADE 稀疏检索', + 'codexlens.spladeInstalled': 'SPLADE 已安装', + 'codexlens.spladeNotInstalled': 'SPLADE 未安装', + 'codexlens.spladeInstallHint': '安装后可改进代码搜索的同义词匹配', + 'codexlens.installingSpladePackage': '正在安装 SPLADE 包', + 'codexlens.spladeInstallSuccess': 'SPLADE 安装成功', + 'codexlens.spladeInstallFailed': 'SPLADE 安装失败', + 'codexlens.spladeModel': '模型', + 'codexlens.spladeIndexStatus': 'SPLADE 索引', + 'codexlens.spladeIndexExists': '索引可用', + 'codexlens.spladeIndexMissing': '无 SPLADE 索引', + 'codexlens.spladeRebuild': '重建 SPLADE 索引', + 'codexlens.spladeRebuilding': '正在重建 SPLADE 索引...', + 'codexlens.spladeRebuildSuccess': 'SPLADE 索引重建完成', + 'codexlens.spladeRebuildFailed': 'SPLADE 索引重建失败', + 'codexlens.spladeChunks': '分块数', + 'codexlens.spladePostings': '词条数', + // GPU 模式选择 'codexlens.selectGpuMode': '选择加速模式', 'codexlens.cpuModeDesc': '标准 CPU 处理', diff --git a/ccw/src/templates/dashboard-js/views/codexlens-manager.js b/ccw/src/templates/dashboard-js/views/codexlens-manager.js index c62f0198..39734820 100644 --- a/ccw/src/templates/dashboard-js/views/codexlens-manager.js +++ b/ccw/src/templates/dashboard-js/views/codexlens-manager.js @@ -120,6 +120,12 @@ function buildCodexLensConfigContent(config) { ? '' + + '' + + '' + '' + @@ -145,6 +151,17 @@ function buildCodexLensConfigContent(config) { '' : '') + + // SPLADE Section + (isInstalled + ? '
' + + '

' + t('codexlens.spladeDeps') + '

' + + '
' + + '
' + t('common.loading') + '
' + + '
' + + '
' + : '') + + + // Model Management Section (isInstalled ? '
' + @@ -335,6 +352,9 @@ function initCodexLensConfigEvents(currentConfig) { // Load semantic dependencies status loadSemanticDepsStatus(); + // Load SPLADE status + loadSpladeStatus(); + // Load model list loadModelList(); } @@ -714,6 +734,95 @@ async function installSemanticDeps() { await installSemanticDepsWithGpu(); } +// ============================================================ +// SPLADE MANAGEMENT +// ============================================================ + +/** + * Load SPLADE status + */ +async function loadSpladeStatus() { + var container = document.getElementById('spladeStatus'); + if (!container) return; + + try { + var response = await fetch('/api/codexlens/splade/status'); + var status = await response.json(); + + if (status.available) { + container.innerHTML = + '
' + + '
' + + '' + + '
' + + '' + t('codexlens.spladeInstalled') + '' + + '
' + status.model + '
' + + '
' + + '
' + + '
'; + } else { + container.innerHTML = + '
' + + '
' + + '' + + '
' + + '' + t('codexlens.spladeNotInstalled') + '' + + '
' + (status.error || t('codexlens.spladeInstallHint')) + '
' + + '
' + + '
' + + '
' + + '' + + '' + + '
' + + '
'; + } + + if (window.lucide) lucide.createIcons(); + } catch (err) { + container.innerHTML = '
' + err.message + '
'; + } +} + +/** + * Install SPLADE package + */ +async function installSplade(gpu) { + var container = document.getElementById('spladeStatus'); + if (!container) return; + + container.innerHTML = + '
' + + '
' + + '' + t('codexlens.installingSpladePackage') + (gpu ? ' (GPU)' : ' (CPU)') + '...' + + '
'; + if (window.lucide) lucide.createIcons(); + + try { + var response = await fetch('/api/codexlens/splade/install', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ gpu: gpu }) + }); + var result = await response.json(); + + if (result.success) { + showRefreshToast(t('codexlens.spladeInstallSuccess'), 'success'); + loadSpladeStatus(); + } else { + showRefreshToast(t('codexlens.spladeInstallFailed') + ': ' + result.error, 'error'); + loadSpladeStatus(); + } + } catch (err) { + showRefreshToast(t('common.error') + ': ' + err.message, 'error'); + loadSpladeStatus(); + } +} + + // ============================================================ // MODEL MANAGEMENT // ============================================================ @@ -2975,3 +3084,546 @@ async function saveRotationConfig() { showRefreshToast(t('common.error') + ': ' + err.message, 'error'); } } + +// ============================================================ +// RERANKER CONFIGURATION MODAL +// ============================================================ + +/** + * Show Reranker configuration modal + */ +async function showRerankerConfigModal() { + try { + showRefreshToast(t('codexlens.loadingRerankerConfig') || 'Loading reranker configuration...', 'info'); + + // Fetch current reranker config + const response = await fetch('/api/codexlens/reranker/config'); + const config = await response.json(); + + if (!config.success) { + showRefreshToast(t('common.error') + ': ' + (config.error || 'Failed to load config'), 'error'); + return; + } + + const modalHtml = buildRerankerConfigContent(config); + + // Create and show modal + const tempContainer = document.createElement('div'); + tempContainer.innerHTML = modalHtml; + const modal = tempContainer.firstElementChild; + document.body.appendChild(modal); + + // Initialize icons + if (window.lucide) lucide.createIcons(); + + // Initialize event handlers + initRerankerConfigEvents(config); + } catch (err) { + showRefreshToast(t('common.error') + ': ' + err.message, 'error'); + } +} + +/** + * Build Reranker configuration modal content + */ +function buildRerankerConfigContent(config) { + const backend = config.backend || 'onnx'; + const modelName = config.model_name || ''; + const apiProvider = config.api_provider || 'siliconflow'; + const apiKeySet = config.api_key_set || false; + const availableBackends = config.available_backends || ['onnx', 'api', 'litellm', 'legacy']; + const apiProviders = config.api_providers || ['siliconflow', 'cohere', 'jina']; + const litellmEndpoints = config.litellm_endpoints || []; + + // ONNX models + const onnxModels = [ + 'cross-encoder/ms-marco-MiniLM-L-6-v2', + 'cross-encoder/ms-marco-TinyBERT-L-2-v2', + 'BAAI/bge-reranker-base', + 'BAAI/bge-reranker-large' + ]; + + // Build backend options + const backendOptions = availableBackends.map(function(b) { + const labels = { + 'onnx': 'ONNX (Local, Optimum)', + 'api': 'API (SiliconFlow/Cohere/Jina)', + 'litellm': 'LiteLLM (Custom Endpoint)', + 'legacy': 'Legacy (SentenceTransformers)' + }; + return ''; + }).join(''); + + // Build API provider options + const providerOptions = apiProviders.map(function(p) { + return ''; + }).join(''); + + // Build ONNX model options + const onnxModelOptions = onnxModels.map(function(m) { + return ''; + }).join(''); + + // Build LiteLLM endpoint options + const litellmOptions = litellmEndpoints.length > 0 + ? 
litellmEndpoints.map(function(ep) { + return ''; + }).join('') + : ''; + + return ''; +} + +/** + * Toggle reranker configuration sections based on selected backend + */ +function toggleRerankerSections() { + var backend = document.getElementById('rerankerBackend').value; + + document.getElementById('rerankerOnnxSection').style.display = backend === 'onnx' ? 'block' : 'none'; + document.getElementById('rerankerApiSection').style.display = backend === 'api' ? 'block' : 'none'; + document.getElementById('rerankerLitellmSection').style.display = backend === 'litellm' ? 'block' : 'none'; + document.getElementById('rerankerLegacySection').style.display = backend === 'legacy' ? 'block' : 'none'; +} + +/** + * Initialize reranker config modal events + */ +function initRerankerConfigEvents(config) { + // Handle ONNX model custom input toggle + var onnxModelSelect = document.getElementById('rerankerOnnxModel'); + var customModelInput = document.getElementById('rerankerCustomModel'); + + if (onnxModelSelect && customModelInput) { + onnxModelSelect.addEventListener('change', function() { + customModelInput.style.display = this.value === 'custom' ? 'block' : 'none'; + }); + } + + // Store original config for reset + window._rerankerOriginalConfig = config; +} + +/** + * Close the reranker config modal + */ +function closeRerankerModal() { + var modal = document.getElementById('rerankerConfigModal'); + if (modal) modal.remove(); +} + +/** + * Reset reranker config to original values + */ +function resetRerankerConfig() { + var config = window._rerankerOriginalConfig; + if (!config) return; + + document.getElementById('rerankerBackend').value = config.backend || 'onnx'; + toggleRerankerSections(); + + // Reset ONNX section + var onnxModels = [ + 'cross-encoder/ms-marco-MiniLM-L-6-v2', + 'cross-encoder/ms-marco-TinyBERT-L-2-v2', + 'BAAI/bge-reranker-base', + 'BAAI/bge-reranker-large' + ]; + if (onnxModels.includes(config.model_name)) { + document.getElementById('rerankerOnnxModel').value = config.model_name; + document.getElementById('rerankerCustomModel').style.display = 'none'; + } else { + document.getElementById('rerankerOnnxModel').value = 'custom'; + document.getElementById('rerankerCustomModel').value = config.model_name || ''; + document.getElementById('rerankerCustomModel').style.display = 'block'; + } + + // Reset API section + document.getElementById('rerankerApiProvider').value = config.api_provider || 'siliconflow'; + document.getElementById('rerankerApiKey').value = ''; + document.getElementById('rerankerApiModel').value = config.model_name || ''; + + showRefreshToast(t('common.reset') || 'Reset to original values', 'info'); +} + +/** + * Save reranker configuration + */ +async function saveRerankerConfig() { + try { + var backend = document.getElementById('rerankerBackend').value; + var payload = { backend: backend }; + + // Collect model name based on backend + if (backend === 'onnx') { + var onnxModel = document.getElementById('rerankerOnnxModel').value; + if (onnxModel === 'custom') { + payload.model_name = document.getElementById('rerankerCustomModel').value.trim(); + } else { + payload.model_name = onnxModel; + } + } else if (backend === 'api') { + payload.api_provider = document.getElementById('rerankerApiProvider').value; + payload.model_name = document.getElementById('rerankerApiModel').value.trim(); + var apiKey = document.getElementById('rerankerApiKey').value.trim(); + if (apiKey) { + payload.api_key = apiKey; + } + } else if (backend === 'litellm') { + payload.litellm_endpoint 
= document.getElementById('rerankerLitellmEndpoint').value; + } + + var response = await fetch('/api/codexlens/reranker/config', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify(payload) + }); + + var result = await response.json(); + + if (result.success) { + showRefreshToast((t('codexlens.rerankerConfigSaved') || 'Reranker configuration saved') + ': ' + result.message, 'success'); + closeRerankerModal(); + } else { + showRefreshToast(t('common.saveFailed') + ': ' + result.error, 'error'); + } + } catch (err) { + showRefreshToast(t('common.error') + ': ' + err.message, 'error'); + } +} + +// ============================================================ +// FILE WATCHER CONTROL +// ============================================================ + +/** + * Show File Watcher control modal + */ +async function showWatcherControlModal() { + try { + showRefreshToast(t('codexlens.loadingWatcherStatus') || 'Loading watcher status...', 'info'); + + // Fetch current watcher status + const response = await fetch('/api/codexlens/watch/status'); + const status = await response.json(); + + const modalHtml = buildWatcherControlContent(status); + + // Create and show modal + const tempContainer = document.createElement('div'); + tempContainer.innerHTML = modalHtml; + const modal = tempContainer.firstElementChild; + document.body.appendChild(modal); + + // Initialize icons + if (window.lucide) lucide.createIcons(); + + // Start polling if watcher is running + if (status.running) { + startWatcherStatusPolling(); + } + } catch (err) { + showRefreshToast(t('common.error') + ': ' + err.message, 'error'); + } +} + +/** + * Build File Watcher control modal content + */ +function buildWatcherControlContent(status) { + const running = status.running || false; + const rootPath = status.root_path || ''; + const eventsProcessed = status.events_processed || 0; + const uptimeSeconds = status.uptime_seconds || 0; + + // Format uptime + const formatUptime = function(seconds) { + if (seconds < 60) return seconds + 's'; + if (seconds < 3600) return Math.floor(seconds / 60) + 'm ' + (seconds % 60) + 's'; + return Math.floor(seconds / 3600) + 'h ' + Math.floor((seconds % 3600) / 60) + 'm'; + }; + + return ''; +} + +/** + * Toggle file watcher on/off + */ +async function toggleWatcher() { + var toggle = document.getElementById('watcherToggle'); + var shouldRun = toggle.checked; + + try { + if (shouldRun) { + // Start watcher + var watchPath = document.getElementById('watcherPath').value.trim(); + var debounceMs = parseInt(document.getElementById('watcherDebounce').value, 10) || 1000; + + var response = await fetch('/api/codexlens/watch/start', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ path: watchPath || undefined, debounce_ms: debounceMs }) + }); + + var result = await response.json(); + + if (result.success) { + showRefreshToast((t('codexlens.watcherStarted') || 'Watcher started') + ': ' + result.path, 'success'); + document.getElementById('watcherStats').style.display = 'block'; + document.getElementById('watcherStartConfig').style.display = 'none'; + startWatcherStatusPolling(); + } else { + toggle.checked = false; + showRefreshToast(t('common.error') + ': ' + result.error, 'error'); + } + } else { + // Stop watcher + var response = await fetch('/api/codexlens/watch/stop', { + method: 'POST', + headers: { 'Content-Type': 'application/json' } + }); + + var result = await response.json(); + + if (result.success) { + 
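        // result mirrors the /api/codexlens/watch/stop response
        // ({ success, message, events_processed, uptime_seconds }),
        // so the final event count can be shown in the toast below.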
showRefreshToast((t('codexlens.watcherStopped') || 'Watcher stopped') + ': ' + result.events_processed + ' events processed', 'success'); + document.getElementById('watcherStats').style.display = 'none'; + document.getElementById('watcherStartConfig').style.display = 'block'; + stopWatcherStatusPolling(); + } else { + toggle.checked = true; + showRefreshToast(t('common.error') + ': ' + result.error, 'error'); + } + } + } catch (err) { + toggle.checked = !shouldRun; + showRefreshToast(t('common.error') + ': ' + err.message, 'error'); + } +} + +// Watcher status polling +var watcherPollingInterval = null; + +function startWatcherStatusPolling() { + if (watcherPollingInterval) return; + + watcherPollingInterval = setInterval(async function() { + try { + var response = await fetch('/api/codexlens/watch/status'); + var status = await response.json(); + + if (status.running) { + document.getElementById('watcherEventsCount').textContent = status.events_processed || 0; + + // Format uptime + var seconds = status.uptime_seconds || 0; + var formatted = seconds < 60 ? seconds + 's' : + seconds < 3600 ? Math.floor(seconds / 60) + 'm ' + (seconds % 60) + 's' : + Math.floor(seconds / 3600) + 'h ' + Math.floor((seconds % 3600) / 60) + 'm'; + document.getElementById('watcherUptime').textContent = formatted; + } else { + // Watcher stopped externally + stopWatcherStatusPolling(); + document.getElementById('watcherToggle').checked = false; + document.getElementById('watcherStats').style.display = 'none'; + document.getElementById('watcherStartConfig').style.display = 'block'; + } + } catch (err) { + console.error('Failed to poll watcher status:', err); + } + }, 2000); +} + +function stopWatcherStatusPolling() { + if (watcherPollingInterval) { + clearInterval(watcherPollingInterval); + watcherPollingInterval = null; + } +} + +/** + * Close the watcher control modal + */ +function closeWatcherModal() { + stopWatcherStatusPolling(); + var modal = document.getElementById('watcherControlModal'); + if (modal) modal.remove(); +} diff --git a/codex-lens/pyproject.toml b/codex-lens/pyproject.toml index 35e66501..a5761453 100644 --- a/codex-lens/pyproject.toml +++ b/codex-lens/pyproject.toml @@ -80,6 +80,18 @@ reranker = [ "transformers>=4.36", ] +# SPLADE sparse retrieval +splade = [ + "transformers>=4.36", + "optimum[onnxruntime]>=1.16", +] + +# SPLADE with GPU acceleration (CUDA) +splade-gpu = [ + "transformers>=4.36", + "optimum[onnxruntime-gpu]>=1.16", +] + # Encoding detection for non-UTF8 files encoding = [ "chardet>=5.0", diff --git a/codex-lens/src/codexlens/cli/commands.py b/codex-lens/src/codexlens/cli/commands.py index 119e6cc2..699c39a8 100644 --- a/codex-lens/src/codexlens/cli/commands.py +++ b/codex-lens/src/codexlens/cli/commands.py @@ -415,11 +415,20 @@ def search( depth: int = typer.Option(-1, "--depth", "-d", help="Search depth (-1 = unlimited, 0 = current only)."), files_only: bool = typer.Option(False, "--files-only", "-f", help="Return only file paths without content snippets."), mode: str = typer.Option("auto", "--mode", "-m", help="Search mode: auto, exact, fuzzy, hybrid, vector, pure-vector."), - weights: Optional[str] = typer.Option(None, "--weights", help="Custom RRF weights as 'exact,fuzzy,vector' (e.g., '0.5,0.3,0.2')."), + weights: Optional[str] = typer.Option( + None, + "--weights", "-w", + help="RRF weights as key=value pairs (e.g., 'splade=0.4,vector=0.6' or 'exact=0.3,fuzzy=0.1,vector=0.6'). Default: auto-detect based on available backends." 
+ ), + use_fts: bool = typer.Option( + False, + "--use-fts", + help="Use FTS (exact+fuzzy) instead of SPLADE for sparse retrieval" + ), json_mode: bool = typer.Option(False, "--json", help="Output JSON response."), verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."), ) -> None: - """Search indexed file contents using SQLite FTS5 or semantic vectors. + """Search indexed file contents using hybrid retrieval. Uses chain search across directory indexes. Use --depth to limit search recursion (0 = current dir only). @@ -428,17 +437,27 @@ def search( - auto: Auto-detect (hybrid if embeddings exist, exact otherwise) [default] - exact: Exact FTS using unicode61 tokenizer - for code identifiers - fuzzy: Fuzzy FTS using trigram tokenizer - for typo-tolerant search - - hybrid: RRF fusion of exact + fuzzy + vector (recommended) - best recall - - vector: Vector search with exact FTS fallback - semantic + keyword + - hybrid: RRF fusion of sparse + dense search (recommended) - best recall + - vector: Vector search with sparse fallback - semantic + keyword - pure-vector: Pure semantic vector search only - natural language queries + SPLADE Mode: + When SPLADE is available (pip install codex-lens[splade]), it automatically + replaces FTS (exact+fuzzy) as the sparse retrieval backend. SPLADE provides + semantic term expansion for better synonym handling. + + Use --use-fts to force FTS mode instead of SPLADE. + Vector Search Requirements: Vector search modes require pre-generated embeddings. Use 'codexlens embeddings-generate' to create embeddings first. - Hybrid Mode: - Default weights: exact=0.3, fuzzy=0.1, vector=0.6 - Use --weights to customize (e.g., --weights 0.5,0.3,0.2) + Hybrid Mode Weights: + Use --weights to adjust RRF fusion weights: + - SPLADE mode: 'splade=0.4,vector=0.6' (default) + - FTS mode: 'exact=0.3,fuzzy=0.1,vector=0.6' (default) + + Legacy format also supported: '0.3,0.1,0.6' (exact,fuzzy,vector) Examples: # Auto-detect mode (uses hybrid if embeddings available) @@ -450,11 +469,19 @@ def search( # Semantic search (requires embeddings) codexlens search "how to verify user credentials" --mode pure-vector - # Force hybrid mode - codexlens search "authentication" --mode hybrid + # Force hybrid mode with custom weights + codexlens search "authentication" --mode hybrid --weights splade=0.5,vector=0.5 + + # Force FTS instead of SPLADE + codexlens search "authentication" --use-fts """ _configure_logging(verbose, json_mode) search_path = path.expanduser().resolve() + + # Configure search with FTS fallback if requested + config = Config() + if use_fts: + config.use_fts_fallback = True # Validate mode valid_modes = ["auto", "exact", "fuzzy", "hybrid", "vector", "pure-vector"] @@ -470,22 +497,56 @@ def search( hybrid_weights = None if weights: try: - weight_parts = [float(w.strip()) for w in weights.split(",")] - if len(weight_parts) == 3: - weight_sum = sum(weight_parts) + # Check if using key=value format (new) or legacy comma-separated format + if "=" in weights: + # New format: splade=0.4,vector=0.6 or exact=0.3,fuzzy=0.1,vector=0.6 + weight_dict = {} + for pair in weights.split(","): + if "=" in pair: + key, val = pair.split("=", 1) + weight_dict[key.strip()] = float(val.strip()) + else: + raise ValueError("Mixed format not supported - use all key=value pairs") + + # Validate and normalize weights + weight_sum = sum(weight_dict.values()) if abs(weight_sum - 1.0) > 0.01: - console.print(f"[yellow]Warning: Weights sum to {weight_sum:.2f}, should sum to 1.0. 
Normalizing...[/yellow]") - # Normalize weights - weight_parts = [w / weight_sum for w in weight_parts] - hybrid_weights = { - "exact": weight_parts[0], - "fuzzy": weight_parts[1], - "vector": weight_parts[2], - } + if not json_mode: + console.print(f"[yellow]Warning: Weights sum to {weight_sum:.2f}, should sum to 1.0. Normalizing...[/yellow]") + weight_dict = {k: v / weight_sum for k, v in weight_dict.items()} + + hybrid_weights = weight_dict else: - console.print("[yellow]Warning: Invalid weights format (need 3 values). Using defaults.[/yellow]") - except ValueError: - console.print("[yellow]Warning: Invalid weights format. Using defaults.[/yellow]") + # Legacy format: 0.3,0.1,0.6 (exact,fuzzy,vector) + weight_parts = [float(w.strip()) for w in weights.split(",")] + if len(weight_parts) == 3: + weight_sum = sum(weight_parts) + if abs(weight_sum - 1.0) > 0.01: + if not json_mode: + console.print(f"[yellow]Warning: Weights sum to {weight_sum:.2f}, should sum to 1.0. Normalizing...[/yellow]") + weight_parts = [w / weight_sum for w in weight_parts] + hybrid_weights = { + "exact": weight_parts[0], + "fuzzy": weight_parts[1], + "vector": weight_parts[2], + } + elif len(weight_parts) == 2: + # Two values: assume splade,vector + weight_sum = sum(weight_parts) + if abs(weight_sum - 1.0) > 0.01: + if not json_mode: + console.print(f"[yellow]Warning: Weights sum to {weight_sum:.2f}, should sum to 1.0. Normalizing...[/yellow]") + weight_parts = [w / weight_sum for w in weight_parts] + hybrid_weights = { + "splade": weight_parts[0], + "vector": weight_parts[1], + } + else: + if not json_mode: + console.print("[yellow]Warning: Invalid weights format. Using defaults.[/yellow]") + except ValueError as e: + if not json_mode: + console.print(f"[yellow]Warning: Invalid weights format ({e}). Using defaults.[/yellow]") registry: RegistryStore | None = None try: @@ -2381,6 +2442,188 @@ def gpu_reset( console.print(f" Device: [cyan]{gpu_info.gpu_name}[/cyan]") + +# ==================== SPLADE Commands ==================== + +@app.command("splade-index") +def splade_index_command( + path: Path = typer.Argument(..., help="Project path to index"), + rebuild: bool = typer.Option(False, "--rebuild", "-r", help="Force rebuild SPLADE index"), + verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable verbose output."), +) -> None: + """Generate SPLADE sparse index for existing codebase. + + Encodes all semantic chunks with SPLADE model and builds inverted index + for efficient sparse retrieval. + + Examples: + codexlens splade-index ~/projects/my-app + codexlens splade-index . 
--rebuild + """ + _configure_logging(verbose) + + from codexlens.semantic.splade_encoder import get_splade_encoder, check_splade_available + from codexlens.storage.splade_index import SpladeIndex + from codexlens.semantic.vector_store import VectorStore + + # Check SPLADE availability + ok, err = check_splade_available() + if not ok: + console.print(f"[red]SPLADE not available: {err}[/red]") + console.print("[dim]Install with: pip install transformers torch[/dim]") + raise typer.Exit(1) + + # Find index database + target_path = path.expanduser().resolve() + + # Try to find _index.db + if target_path.is_file() and target_path.name == "_index.db": + index_db = target_path + elif target_path.is_dir(): + # Check for local .codexlens/_index.db + local_index = target_path / ".codexlens" / "_index.db" + if local_index.exists(): + index_db = local_index + else: + # Try to find via registry + registry = RegistryStore() + try: + registry.initialize() + mapper = PathMapper() + index_db = mapper.source_to_index_db(target_path) + if not index_db.exists(): + console.print(f"[red]Error:[/red] No index found for {target_path}") + console.print("Run 'codexlens init' first to create an index") + raise typer.Exit(1) + finally: + registry.close() + else: + console.print(f"[red]Error:[/red] Path must be _index.db file or indexed directory") + raise typer.Exit(1) + + splade_db = index_db.parent / "_splade.db" + + if splade_db.exists() and not rebuild: + console.print("[yellow]SPLADE index exists. Use --rebuild to regenerate.[/yellow]") + return + + # Load chunks from vector store + console.print(f"[blue]Loading chunks from {index_db.name}...[/blue]") + vector_store = VectorStore(index_db) + chunks = vector_store.get_all_chunks() + + if not chunks: + console.print("[yellow]No chunks found in vector store[/yellow]") + console.print("[dim]Generate embeddings first with 'codexlens embeddings-generate'[/dim]") + raise typer.Exit(1) + + console.print(f"[blue]Encoding {len(chunks)} chunks with SPLADE...[/blue]") + + # Initialize SPLADE + encoder = get_splade_encoder() + splade_index = SpladeIndex(splade_db) + splade_index.create_tables() + + # Encode in batches with progress bar + with Progress( + SpinnerColumn(), + TextColumn("[progress.description]{task.description}"), + BarColumn(), + TextColumn("[progress.percentage]{task.percentage:>3.0f}%"), + TimeElapsedColumn(), + console=console, + ) as progress: + task = progress.add_task("Encoding...", total=len(chunks)) + for chunk in chunks: + sparse_vec = encoder.encode_text(chunk.content) + splade_index.add_posting(chunk.id, sparse_vec) + progress.advance(task) + + # Set metadata + splade_index.set_metadata( + model_name=encoder.model_name, + vocab_size=encoder.vocab_size + ) + + stats = splade_index.get_stats() + console.print(f"[green]✓[/green] SPLADE index built: {stats['unique_chunks']} chunks, {stats['total_postings']} postings") + console.print(f" Database: [dim]{splade_db}[/dim]") + + +@app.command("splade-status") +def splade_status_command( + path: Path = typer.Argument(..., help="Project path"), + verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable verbose output."), +) -> None: + """Show SPLADE index status and statistics. + + Examples: + codexlens splade-status ~/projects/my-app + codexlens splade-status . 
+ """ + _configure_logging(verbose) + + from codexlens.storage.splade_index import SpladeIndex + from codexlens.semantic.splade_encoder import check_splade_available + + # Find index database + target_path = path.expanduser().resolve() + + if target_path.is_file() and target_path.name == "_index.db": + splade_db = target_path.parent / "_splade.db" + elif target_path.is_dir(): + # Check for local .codexlens/_splade.db + local_splade = target_path / ".codexlens" / "_splade.db" + if local_splade.exists(): + splade_db = local_splade + else: + # Try to find via registry + registry = RegistryStore() + try: + registry.initialize() + mapper = PathMapper() + index_db = mapper.source_to_index_db(target_path) + splade_db = index_db.parent / "_splade.db" + finally: + registry.close() + else: + console.print(f"[red]Error:[/red] Path must be _index.db file or indexed directory") + raise typer.Exit(1) + + if not splade_db.exists(): + console.print("[yellow]No SPLADE index found[/yellow]") + console.print(f"[dim]Run 'codexlens splade-index {path}' to create one[/dim]") + return + + splade_index = SpladeIndex(splade_db) + + if not splade_index.has_index(): + console.print("[yellow]SPLADE tables not initialized[/yellow]") + return + + metadata = splade_index.get_metadata() + stats = splade_index.get_stats() + + # Create status table + table = Table(title="SPLADE Index Status", show_header=False) + table.add_column("Property", style="cyan") + table.add_column("Value") + + table.add_row("Database", str(splade_db)) + if metadata: + table.add_row("Model", metadata['model_name']) + table.add_row("Vocab Size", str(metadata['vocab_size'])) + table.add_row("Chunks", str(stats['unique_chunks'])) + table.add_row("Unique Tokens", str(stats['unique_tokens'])) + table.add_row("Total Postings", str(stats['total_postings'])) + + ok, err = check_splade_available() + status_text = "[green]Yes[/green]" if ok else f"[red]No[/red] - {err}" + table.add_row("SPLADE Available", status_text) + + console.print(table) + + # ==================== Watch Command ==================== @app.command() diff --git a/codex-lens/src/codexlens/cli/embedding_manager.py b/codex-lens/src/codexlens/cli/embedding_manager.py index b364bc38..e552bf57 100644 --- a/codex-lens/src/codexlens/cli/embedding_manager.py +++ b/codex-lens/src/codexlens/cli/embedding_manager.py @@ -33,6 +33,15 @@ def _cleanup_fastembed_resources() -> None: pass +def _cleanup_splade_resources() -> None: + """Release SPLADE encoder ONNX resources.""" + try: + from codexlens.semantic.splade_encoder import clear_splade_cache + clear_splade_cache() + except Exception: + pass + + def _generate_chunks_from_cursor( cursor, chunker, @@ -675,10 +684,96 @@ def generate_embeddings( if progress_callback: progress_callback(f"Finalizing index... 
Building ANN index for {total_chunks_created} chunks") + # --- SPLADE SPARSE ENCODING (after dense embeddings) --- + # Add SPLADE encoding if enabled in config + splade_success = False + splade_error = None + + try: + from codexlens.config import Config + config = Config.load() + + if config.enable_splade: + from codexlens.semantic.splade_encoder import check_splade_available, get_splade_encoder + from codexlens.storage.splade_index import SpladeIndex + + ok, err = check_splade_available() + if ok: + if progress_callback: + progress_callback(f"Generating SPLADE sparse vectors for {total_chunks_created} chunks...") + + # Initialize SPLADE encoder and index + splade_encoder = get_splade_encoder(use_gpu=use_gpu) + # Use main index database for SPLADE (not separate _splade.db) + splade_index = SpladeIndex(index_path) + splade_index.create_tables() + + # Retrieve all chunks from database for SPLADE encoding + with sqlite3.connect(index_path) as conn: + conn.row_factory = sqlite3.Row + cursor = conn.execute("SELECT id, content FROM semantic_chunks ORDER BY id") + + # Batch encode for efficiency + SPLADE_BATCH_SIZE = 32 + batch_postings = [] + chunk_batch = [] + chunk_ids = [] + + for row in cursor: + chunk_id = row["id"] + content = row["content"] + + chunk_ids.append(chunk_id) + chunk_batch.append(content) + + # Process batch when full + if len(chunk_batch) >= SPLADE_BATCH_SIZE: + sparse_vecs = splade_encoder.encode_batch(chunk_batch, batch_size=SPLADE_BATCH_SIZE) + for cid, sparse_vec in zip(chunk_ids, sparse_vecs): + batch_postings.append((cid, sparse_vec)) + + chunk_batch = [] + chunk_ids = [] + + # Process remaining chunks + if chunk_batch: + sparse_vecs = splade_encoder.encode_batch(chunk_batch, batch_size=SPLADE_BATCH_SIZE) + for cid, sparse_vec in zip(chunk_ids, sparse_vecs): + batch_postings.append((cid, sparse_vec)) + + # Batch insert all postings + if batch_postings: + splade_index.add_postings_batch(batch_postings) + + # Set metadata + splade_index.set_metadata( + model_name=splade_encoder.model_name, + vocab_size=splade_encoder.vocab_size + ) + + splade_success = True + if progress_callback: + stats = splade_index.get_stats() + progress_callback( + f"SPLADE index created: {stats['total_postings']} postings, " + f"{stats['unique_tokens']} unique tokens" + ) + else: + logger.debug("SPLADE not available: %s", err) + splade_error = f"SPLADE not available: {err}" + except Exception as e: + splade_error = str(e) + logger.warning("SPLADE encoding failed: %s", e) + + # Report SPLADE status after processing + if progress_callback and not splade_success and splade_error: + progress_callback(f"SPLADE index: FAILED - {splade_error}") + except Exception as e: # Cleanup on error to prevent process hanging try: _cleanup_fastembed_resources() + _cleanup_splade_resources() gc.collect() except Exception: pass @@ -690,6 +785,7 @@ def generate_embeddings( # This is critical - without it, ONNX Runtime threads prevent Python from exiting try: _cleanup_fastembed_resources() + _cleanup_splade_resources() gc.collect() except Exception: pass @@ -874,6 +970,7 @@ def generate_embeddings_recursive( # Each generate_embeddings() call does its own cleanup, but do a final one to be safe try: _cleanup_fastembed_resources() + _cleanup_splade_resources() gc.collect() except Exception: pass diff --git a/codex-lens/src/codexlens/config.py b/codex-lens/src/codexlens/config.py index ac523c3c..ba63d19e 100644 --- a/codex-lens/src/codexlens/config.py +++ b/codex-lens/src/codexlens/config.py @@ -103,6 +103,15 @@ class 
Config: # For litellm: model name from config (e.g., "qwen3-embedding") embedding_use_gpu: bool = True # For fastembed: whether to use GPU acceleration + # SPLADE sparse retrieval configuration + enable_splade: bool = True # Enable SPLADE as default sparse backend + splade_model: str = "naver/splade-cocondenser-ensembledistil" + splade_threshold: float = 0.01 # Min weight to store in index + splade_onnx_path: Optional[str] = None # Custom ONNX model path + + # FTS fallback (disabled by default, available via --use-fts) + use_fts_fallback: bool = False # Use FTS instead of SPLADE + # Indexing/search optimizations global_symbol_index_enabled: bool = True # Enable project-wide symbol index fast path enable_merkle_detection: bool = True # Enable content-hash based incremental indexing diff --git a/codex-lens/src/codexlens/search/hybrid_search.py b/codex-lens/src/codexlens/search/hybrid_search.py index ff790431..e76b0ad4 100644 --- a/codex-lens/src/codexlens/search/hybrid_search.py +++ b/codex-lens/src/codexlens/search/hybrid_search.py @@ -8,7 +8,7 @@ from __future__ import annotations import logging import time -from concurrent.futures import ThreadPoolExecutor, as_completed +from concurrent.futures import ThreadPoolExecutor, TimeoutError as FuturesTimeoutError, as_completed from contextlib import contextmanager from pathlib import Path from typing import Any, Dict, List, Optional @@ -33,6 +33,8 @@ def timer(name: str, logger: logging.Logger, level: int = logging.DEBUG): from codexlens.config import Config from codexlens.entities import SearchResult from codexlens.search.ranking import ( + DEFAULT_WEIGHTS, + FTS_FALLBACK_WEIGHTS, apply_symbol_boost, cross_encoder_rerank, get_rrf_weights, @@ -54,12 +56,9 @@ class HybridSearchEngine: default_weights: Default RRF weights for each source """ - # Default RRF weights (vector: 60%, exact: 30%, fuzzy: 10%) - DEFAULT_WEIGHTS = { - "exact": 0.3, - "fuzzy": 0.1, - "vector": 0.6, - } + # NOTE: DEFAULT_WEIGHTS imported from ranking.py - single source of truth + # Default RRF weights: SPLADE-based hybrid (splade: 0.4, vector: 0.6) + # FTS fallback mode uses FTS_FALLBACK_WEIGHTS (exact: 0.3, fuzzy: 0.1, vector: 0.6) def __init__( self, @@ -75,10 +74,11 @@ class HybridSearchEngine: embedder: Optional embedder instance for embedding-based reranking """ self.logger = logging.getLogger(__name__) - self.weights = weights or self.DEFAULT_WEIGHTS.copy() + self.weights = weights or DEFAULT_WEIGHTS.copy() self._config = config self.embedder = embedder self.reranker: Any = None + self._use_gpu = config.embedding_use_gpu if config else True def search( self, @@ -124,6 +124,26 @@ class HybridSearchEngine: # Determine which backends to use backends = {} + + # Check if SPLADE is available + splade_available = False + # Respect config.enable_splade flag and use_fts_fallback flag + if self._config and getattr(self._config, 'use_fts_fallback', False): + # Config explicitly requests FTS fallback - disable SPLADE + splade_available = False + elif self._config and not getattr(self._config, 'enable_splade', True): + # Config explicitly disabled SPLADE + splade_available = False + else: + # Check if SPLADE dependencies are available + try: + from codexlens.semantic.splade_encoder import check_splade_available + ok, _ = check_splade_available() + if ok: + # SPLADE tables are in main index database, will check table existence in _search_splade + splade_available = True + except Exception: + pass if pure_vector: # Pure vector mode: only use vector search, no FTS fallback @@ -138,12 
+158,19 @@ class HybridSearchEngine: ) backends["exact"] = True else: - # Hybrid mode: always include exact search as baseline - backends["exact"] = True - if enable_fuzzy: - backends["fuzzy"] = True - if enable_vector: - backends["vector"] = True + # Hybrid mode: default to SPLADE if available, otherwise use FTS + if splade_available: + # Default: enable SPLADE, disable exact and fuzzy + backends["splade"] = True + if enable_vector: + backends["vector"] = True + else: + # Fallback mode: enable exact+fuzzy when SPLADE unavailable + backends["exact"] = True + if enable_fuzzy: + backends["fuzzy"] = True + if enable_vector: + backends["vector"] = True # Execute parallel searches with timer("parallel_search_total", self.logger): @@ -354,23 +381,40 @@ class HybridSearchEngine: ) future_to_source[future] = "vector" - # Collect results as they complete - for future in as_completed(future_to_source): - source = future_to_source[future] - elapsed_ms = (time.perf_counter() - submit_times[source]) * 1000 - timing_data[source] = elapsed_ms - try: - results = future.result() - # Tag results with source for debugging - tagged_results = tag_search_source(results, source) - results_map[source] = tagged_results - self.logger.debug( - "[TIMING] %s_search: %.2fms (%d results)", - source, elapsed_ms, len(results) - ) - except Exception as exc: - self.logger.error("Search failed for %s: %s", source, exc) - results_map[source] = [] + if backends.get("splade"): + submit_times["splade"] = time.perf_counter() + future = executor.submit( + self._search_splade, index_path, query, limit + ) + future_to_source[future] = "splade" + + # Collect results as they complete with timeout protection + try: + for future in as_completed(future_to_source, timeout=30.0): + source = future_to_source[future] + elapsed_ms = (time.perf_counter() - submit_times[source]) * 1000 + timing_data[source] = elapsed_ms + try: + results = future.result(timeout=10.0) + # Tag results with source for debugging + tagged_results = tag_search_source(results, source) + results_map[source] = tagged_results + self.logger.debug( + "[TIMING] %s_search: %.2fms (%d results)", + source, elapsed_ms, len(results) + ) + except (Exception, FuturesTimeoutError) as exc: + self.logger.error("Search failed for %s: %s", source, exc) + results_map[source] = [] + except FuturesTimeoutError: + self.logger.warning("Search timeout: some backends did not respond in time") + # Cancel remaining futures + for future in future_to_source: + future.cancel() + # Set empty results for sources that didn't complete + for source in backends: + if source not in results_map: + results_map[source] = [] # Log timing summary if timing_data: @@ -564,3 +608,113 @@ class HybridSearchEngine: except Exception as exc: self.logger.error("Vector search error: %s", exc) return [] + + def _search_splade( + self, index_path: Path, query: str, limit: int + ) -> List[SearchResult]: + """SPLADE sparse retrieval via inverted index. 
+ + Args: + index_path: Path to _index.db file + query: Natural language query string + limit: Maximum results + + Returns: + List of SearchResult ordered by SPLADE score + """ + try: + from codexlens.semantic.splade_encoder import get_splade_encoder, check_splade_available + from codexlens.storage.splade_index import SpladeIndex + import sqlite3 + import json + + # Check dependencies + ok, err = check_splade_available() + if not ok: + self.logger.debug("SPLADE not available: %s", err) + return [] + + # Use main index database (SPLADE tables are in _index.db, not separate _splade.db) + splade_index = SpladeIndex(index_path) + if not splade_index.has_index(): + self.logger.debug("SPLADE index not initialized") + return [] + + # Encode query to sparse vector + encoder = get_splade_encoder(use_gpu=self._use_gpu) + query_sparse = encoder.encode_text(query) + + # Search inverted index for top matches + raw_results = splade_index.search(query_sparse, limit=limit, min_score=0.0) + + if not raw_results: + return [] + + # Fetch chunk details from main index database + chunk_ids = [chunk_id for chunk_id, _ in raw_results] + score_map = {chunk_id: score for chunk_id, score in raw_results} + + # Query semantic_chunks table for full details + placeholders = ",".join("?" * len(chunk_ids)) + with sqlite3.connect(index_path) as conn: + conn.row_factory = sqlite3.Row + rows = conn.execute( + f""" + SELECT id, file_path, content, metadata + FROM semantic_chunks + WHERE id IN ({placeholders}) + """, + chunk_ids + ).fetchall() + + # Build SearchResult objects + results = [] + for row in rows: + chunk_id = row["id"] + file_path = row["file_path"] + content = row["content"] + metadata_json = row["metadata"] + metadata = json.loads(metadata_json) if metadata_json else {} + + score = score_map.get(chunk_id, 0.0) + + # Build excerpt (short preview) + excerpt = content[:200] + "..." if len(content) > 200 else content + + # Extract symbol information from metadata + symbol_name = metadata.get("symbol_name") + symbol_kind = metadata.get("symbol_kind") + start_line = metadata.get("start_line") + end_line = metadata.get("end_line") + + # Build Symbol object if we have symbol info + symbol = None + if symbol_name and symbol_kind and start_line and end_line: + try: + from codexlens.entities import Symbol + symbol = Symbol( + name=symbol_name, + kind=symbol_kind, + range=(start_line, end_line) + ) + except Exception: + pass + + results.append(SearchResult( + path=file_path, + score=score, + excerpt=excerpt, + content=content, + symbol=symbol, + metadata=metadata, + start_line=start_line, + end_line=end_line, + symbol_name=symbol_name, + symbol_kind=symbol_kind, + )) + + return results + + except Exception as exc: + self.logger.debug("SPLADE search error: %s", exc) + return [] diff --git a/codex-lens/src/codexlens/search/ranking.py b/codex-lens/src/codexlens/search/ranking.py index 34bd7719..dc53d651 100644 --- a/codex-lens/src/codexlens/search/ranking.py +++ b/codex-lens/src/codexlens/search/ranking.py @@ -1,7 +1,7 @@ """Ranking algorithms for hybrid search result fusion. Implements Reciprocal Rank Fusion (RRF) and score normalization utilities -for combining results from heterogeneous search backends (exact FTS, fuzzy FTS, vector search). +for combining results from heterogeneous search backends (SPLADE, exact FTS, fuzzy FTS, vector search). 
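+
+A weighted-RRF sketch (illustrative only; k=60 is the conventional RRF constant
+and is an assumption here, not necessarily the exact value or normalization used
+by this module):
+
+    score(d) = sum over active backends b of weights[b] / (k + rank_b(d))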
""" from __future__ import annotations @@ -14,6 +14,20 @@ from typing import Any, Dict, List from codexlens.entities import SearchResult, AdditionalLocation +# Default RRF weights for SPLADE-based hybrid search +DEFAULT_WEIGHTS = { + "splade": 0.4, # Replaces exact(0.3) + fuzzy(0.1) + "vector": 0.6, +} + +# Legacy weights for FTS fallback mode (when SPLADE unavailable) +FTS_FALLBACK_WEIGHTS = { + "exact": 0.3, + "fuzzy": 0.1, + "vector": 0.6, +} + + class QueryIntent(str, Enum): """Query intent for adaptive RRF weights (Python/TypeScript parity).""" @@ -87,15 +101,24 @@ def adjust_weights_by_intent( intent: QueryIntent, base_weights: Dict[str, float], ) -> Dict[str, float]: - """Map intent → weights (kept aligned with TypeScript mapping).""" + """Adjust RRF weights based on query intent.""" + # Check if using SPLADE or FTS mode + use_splade = "splade" in base_weights + if intent == QueryIntent.KEYWORD: - target = {"exact": 0.5, "fuzzy": 0.1, "vector": 0.4} + if use_splade: + target = {"splade": 0.6, "vector": 0.4} + else: + target = {"exact": 0.5, "fuzzy": 0.1, "vector": 0.4} elif intent == QueryIntent.SEMANTIC: - target = {"exact": 0.2, "fuzzy": 0.1, "vector": 0.7} + if use_splade: + target = {"splade": 0.3, "vector": 0.7} + else: + target = {"exact": 0.2, "fuzzy": 0.1, "vector": 0.7} else: target = dict(base_weights) - - # Preserve only keys that are present in base_weights (active backends). + + # Filter to active backends keys = list(base_weights.keys()) filtered = {k: float(target.get(k, 0.0)) for k in keys} return normalize_weights(filtered) diff --git a/codex-lens/src/codexlens/semantic/SPLADE_IMPLEMENTATION.md b/codex-lens/src/codexlens/semantic/SPLADE_IMPLEMENTATION.md new file mode 100644 index 00000000..cb3062e1 --- /dev/null +++ b/codex-lens/src/codexlens/semantic/SPLADE_IMPLEMENTATION.md @@ -0,0 +1,225 @@ +# SPLADE Encoder Implementation + +## Overview + +Created `splade_encoder.py` - A complete ONNX-optimized SPLADE sparse encoder for code search. + +## File Location + +`src/codexlens/semantic/splade_encoder.py` (474 lines) + +## Key Components + +### 1. Dependency Checking + +**Function**: `check_splade_available() -> Tuple[bool, Optional[str]]` +- Validates numpy, onnxruntime, optimum, transformers availability +- Returns (True, None) if all dependencies present +- Returns (False, error_message) with install instructions if missing + +### 2. Caching System + +**Global Cache**: Thread-safe singleton pattern +- `_splade_cache: Dict[str, SpladeEncoder]` - Global encoder cache +- `_cache_lock: threading.RLock()` - Thread safety lock + +**Factory Function**: `get_splade_encoder(...) -> SpladeEncoder` +- Cache key includes: model_name, gpu/cpu, max_length, sparsity_threshold +- Pre-loads model on first access +- Returns cached instance on subsequent calls + +**Cleanup Function**: `clear_splade_cache() -> None` +- Releases ONNX resources +- Clears model and tokenizer references +- Prevents memory leaks + +### 3. 
SpladeEncoder Class + +#### Initialization Parameters +- `model_name: str` - Default: "naver/splade-cocondenser-ensembledistil" +- `use_gpu: bool` - Enable GPU acceleration (default: True) +- `max_length: int` - Max sequence length (default: 512) +- `sparsity_threshold: float` - Min weight threshold (default: 0.01) +- `providers: Optional[List]` - Explicit ONNX providers (overrides use_gpu) + +#### Core Methods + +**`_load_model()`**: Lazy loading with GPU support +- Uses `optimum.onnxruntime.ORTModelForMaskedLM` +- Falls back to CPU if GPU unavailable +- Integrates with `gpu_support.get_optimal_providers()` +- Handles device_id options for DirectML/CUDA + +**`_splade_activation(logits, attention_mask)`**: Static method +- Formula: `log(1 + ReLU(logits)) * attention_mask` +- Input: (batch, seq_len, vocab_size) +- Output: (batch, seq_len, vocab_size) + +**`_max_pooling(splade_repr)`**: Static method +- Max pooling over sequence dimension +- Input: (batch, seq_len, vocab_size) +- Output: (batch, vocab_size) + +**`_to_sparse_dict(dense_vec)`**: Conversion helper +- Filters by sparsity_threshold +- Returns: `Dict[int, float]` mapping token_id to weight + +**`encode_text(text: str) -> Dict[int, float]`**: Single text encoding +- Tokenizes input with truncation/padding +- Forward pass through ONNX model +- Applies SPLADE activation + max pooling +- Returns sparse vector + +**`encode_batch(texts: List[str], batch_size: int = 32) -> List[Dict[int, float]]`**: Batch encoding +- Processes in batches for memory efficiency +- Returns list of sparse vectors + +#### Properties + +**`vocab_size: int`**: Vocabulary size (~30k for BERT) +- Cached after first model load +- Returns tokenizer length + +#### Debugging Methods + +**`get_token(token_id: int) -> str`** +- Converts token_id to human-readable string +- Uses tokenizer.decode() + +**`get_top_tokens(sparse_vec: Dict[int, float], top_k: int = 10) -> List[Tuple[str, float]]`** +- Extracts top-k highest-weight tokens +- Returns (token_string, weight) pairs +- Useful for understanding model focus + +## Design Patterns Followed + +### 1. From `onnx_reranker.py` +✓ ONNX loading with provider detection +✓ Lazy model initialization +✓ Thread-safe loading with RLock +✓ Signature inspection for backward compatibility +✓ Fallback for older Optimum versions +✓ Static helper methods for numerical operations + +### 2. From `embedder.py` +✓ Global cache with thread safety +✓ Factory function pattern (get_splade_encoder) +✓ Cache cleanup function (clear_splade_cache) +✓ GPU provider configuration +✓ Batch processing support + +### 3. 
From `gpu_support.py` +✓ `get_optimal_providers(use_gpu, with_device_options=True)` +✓ Device ID options for DirectML/CUDA +✓ Provider tuple format: (provider_name, options_dict) + +## SPLADE Algorithm + +### Activation Formula +```python +# Step 1: ReLU activation +relu_logits = max(0, logits) + +# Step 2: Log(1 + x) transformation +log_relu = log(1 + relu_logits) + +# Step 3: Apply attention mask +splade_repr = log_relu * attention_mask + +# Step 4: Max pooling over sequence +splade_vec = max(splade_repr, axis=sequence_length) + +# Step 5: Sparsification by threshold +sparse_dict = {token_id: weight for token_id, weight in enumerate(splade_vec) if weight > threshold} +``` + +### Output Format +- Sparse dictionary: `{token_id: weight}` +- Token IDs: 0 to vocab_size-1 (typically ~30,000) +- Weights: Float values > sparsity_threshold +- Interpretable: Can decode token_ids to strings + +## Integration Points + +### With `splade_index.py` +- `SpladeIndex.add_posting(chunk_id, sparse_vec: Dict[int, float])` +- `SpladeIndex.search(query_sparse: Dict[int, float])` +- Encoder produces the sparse vectors consumed by index + +### With Indexing Pipeline +```python +encoder = get_splade_encoder(use_gpu=True) + +# Single document +sparse_vec = encoder.encode_text("def main():\n print('hello')") +index.add_posting(chunk_id=1, sparse_vec=sparse_vec) + +# Batch indexing +texts = ["code chunk 1", "code chunk 2", ...] +sparse_vecs = encoder.encode_batch(texts, batch_size=64) +postings = [(chunk_id, vec) for chunk_id, vec in enumerate(sparse_vecs)] +index.add_postings_batch(postings) +``` + +### With Search Pipeline +```python +encoder = get_splade_encoder(use_gpu=True) +query_sparse = encoder.encode_text("authentication function") +results = index.search(query_sparse, limit=50, min_score=0.5) +``` + +## Dependencies + +Required packages: +- `numpy` - Numerical operations +- `onnxruntime` - ONNX model execution (CPU) +- `onnxruntime-gpu` - ONNX with GPU support (optional) +- `optimum[onnxruntime]` - Hugging Face ONNX optimization +- `transformers` - Tokenizer and model loading + +Install command: +```bash +# CPU only +pip install numpy onnxruntime optimum[onnxruntime] transformers + +# With GPU support +pip install numpy onnxruntime-gpu optimum[onnxruntime-gpu] transformers +``` + +## Testing Status + +✓ Python syntax validation passed +✓ Module import successful +✓ Dependency checking works correctly +✗ Full functional test pending (requires optimum installation) + +## Next Steps + +1. Install dependencies for functional testing +2. Create unit tests in `tests/semantic/test_splade_encoder.py` +3. Benchmark encoding performance (CPU vs GPU) +4. Integrate with codex-lens indexing pipeline +5. 
Add SPLADE option to semantic search configuration + +## Performance Considerations + +### Memory Usage +- Model size: ~100MB (ONNX optimized) +- Sparse vectors: ~100-500 non-zero entries per document +- Batch size: 32 recommended (adjust based on GPU memory) + +### Speed Benchmarks (Expected) +- CPU encoding: ~10-20 docs/sec +- GPU encoding (CUDA): ~100-200 docs/sec +- GPU encoding (DirectML): ~50-100 docs/sec + +### Sparsity Analysis +- Threshold 0.01: ~200-400 tokens per document +- Threshold 0.05: ~100-200 tokens per document +- Threshold 0.10: ~50-100 tokens per document + +## References + +- SPLADE paper: https://arxiv.org/abs/2107.05720 +- SPLADE v2: https://arxiv.org/abs/2109.10086 +- Naver model: https://huggingface.co/naver/splade-cocondenser-ensembledistil diff --git a/codex-lens/src/codexlens/semantic/splade_encoder.py b/codex-lens/src/codexlens/semantic/splade_encoder.py new file mode 100644 index 00000000..646c84bd --- /dev/null +++ b/codex-lens/src/codexlens/semantic/splade_encoder.py @@ -0,0 +1,474 @@ +"""ONNX-optimized SPLADE sparse encoder for code search. + +This module provides SPLADE (Sparse Lexical and Expansion) encoding using ONNX Runtime +for efficient sparse vector generation. SPLADE produces vocabulary-aligned sparse vectors +that combine the interpretability of BM25 with neural relevance modeling. + +Install (CPU): + pip install onnxruntime optimum[onnxruntime] transformers + +Install (GPU): + pip install onnxruntime-gpu optimum[onnxruntime-gpu] transformers +""" + +from __future__ import annotations + +import logging +import threading +from typing import Any, Dict, List, Optional, Tuple + +logger = logging.getLogger(__name__) + + +def check_splade_available() -> Tuple[bool, Optional[str]]: + """Check whether SPLADE dependencies are available. + + Returns: + Tuple of (available: bool, error_message: Optional[str]) + """ + try: + import numpy # noqa: F401 + except ImportError as exc: + return False, f"numpy not available: {exc}. Install with: pip install numpy" + + try: + import onnxruntime # noqa: F401 + except ImportError as exc: + return ( + False, + f"onnxruntime not available: {exc}. Install with: pip install onnxruntime", + ) + + try: + from optimum.onnxruntime import ORTModelForMaskedLM # noqa: F401 + except ImportError as exc: + return ( + False, + f"optimum[onnxruntime] not available: {exc}. Install with: pip install optimum[onnxruntime]", + ) + + try: + from transformers import AutoTokenizer # noqa: F401 + except ImportError as exc: + return ( + False, + f"transformers not available: {exc}. Install with: pip install transformers", + ) + + return True, None + + +# Global cache for SPLADE encoders (singleton pattern) +_splade_cache: Dict[str, "SpladeEncoder"] = {} +_cache_lock = threading.RLock() + + +def get_splade_encoder( + model_name: str = "naver/splade-cocondenser-ensembledistil", + use_gpu: bool = True, + max_length: int = 512, + sparsity_threshold: float = 0.01, +) -> "SpladeEncoder": + """Get or create cached SPLADE encoder (thread-safe singleton). + + This function provides significant performance improvement by reusing + SpladeEncoder instances across multiple searches, avoiding repeated model + loading overhead. 
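+
+    A minimal usage sketch (the query string is illustrative; the default model
+    is assumed to be available locally or downloadable on first use):
+
+        encoder = get_splade_encoder(use_gpu=False)
+        sparse_vec = encoder.encode_text("verify user credentials")
+        top_tokens = encoder.get_top_tokens(sparse_vec, top_k=5)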
+ + Args: + model_name: SPLADE model name (default: naver/splade-cocondenser-ensembledistil) + use_gpu: If True, use GPU acceleration when available + max_length: Maximum sequence length for tokenization + sparsity_threshold: Minimum weight to include in sparse vector + + Returns: + Cached SpladeEncoder instance for the given configuration + """ + global _splade_cache + + # Cache key includes all configuration parameters + cache_key = f"{model_name}:{'gpu' if use_gpu else 'cpu'}:{max_length}:{sparsity_threshold}" + + with _cache_lock: + encoder = _splade_cache.get(cache_key) + if encoder is not None: + return encoder + + # Create new encoder and cache it + encoder = SpladeEncoder( + model_name=model_name, + use_gpu=use_gpu, + max_length=max_length, + sparsity_threshold=sparsity_threshold, + ) + # Pre-load model to ensure it's ready + encoder._load_model() + _splade_cache[cache_key] = encoder + + return encoder + + +def clear_splade_cache() -> None: + """Clear the SPLADE encoder cache and release ONNX resources. + + This method ensures proper cleanup of ONNX model resources to prevent + memory leaks when encoders are no longer needed. + """ + global _splade_cache + with _cache_lock: + # Release ONNX resources before clearing cache + for encoder in _splade_cache.values(): + if encoder._model is not None: + del encoder._model + encoder._model = None + if encoder._tokenizer is not None: + del encoder._tokenizer + encoder._tokenizer = None + _splade_cache.clear() + + +class SpladeEncoder: + """ONNX-optimized SPLADE sparse encoder. + + Produces sparse vectors with vocabulary-aligned dimensions. + Output: Dict[int, float] mapping token_id to weight. + + SPLADE activation formula: + splade_repr = log(1 + ReLU(logits)) * attention_mask + splade_vec = max_pooling(splade_repr, axis=sequence_length) + + References: + - SPLADE: https://arxiv.org/abs/2107.05720 + - SPLADE v2: https://arxiv.org/abs/2109.10086 + """ + + DEFAULT_MODEL = "naver/splade-cocondenser-ensembledistil" + + def __init__( + self, + model_name: str = DEFAULT_MODEL, + use_gpu: bool = True, + max_length: int = 512, + sparsity_threshold: float = 0.01, + providers: Optional[List[Any]] = None, + ) -> None: + """Initialize SPLADE encoder. 
+ + Args: + model_name: SPLADE model name (default: naver/splade-cocondenser-ensembledistil) + use_gpu: If True, use GPU acceleration when available + max_length: Maximum sequence length for tokenization + sparsity_threshold: Minimum weight to include in sparse vector + providers: Explicit ONNX providers list (overrides use_gpu) + """ + self.model_name = (model_name or self.DEFAULT_MODEL).strip() + if not self.model_name: + raise ValueError("model_name cannot be blank") + + self.use_gpu = bool(use_gpu) + self.max_length = int(max_length) if max_length > 0 else 512 + self.sparsity_threshold = float(sparsity_threshold) + self.providers = providers + + self._tokenizer: Any | None = None + self._model: Any | None = None + self._vocab_size: int | None = None + self._lock = threading.RLock() + + def _load_model(self) -> None: + """Lazy load ONNX model and tokenizer.""" + if self._model is not None and self._tokenizer is not None: + return + + ok, err = check_splade_available() + if not ok: + raise ImportError(err) + + with self._lock: + if self._model is not None and self._tokenizer is not None: + return + + from inspect import signature + + from optimum.onnxruntime import ORTModelForMaskedLM + from transformers import AutoTokenizer + + if self.providers is None: + from .gpu_support import get_optimal_providers + + # Include device_id options for DirectML/CUDA selection when available + self.providers = get_optimal_providers( + use_gpu=self.use_gpu, with_device_options=True + ) + + # Some Optimum versions accept `providers`, others accept a single `provider` + # Prefer passing the full providers list, with a conservative fallback + model_kwargs: dict[str, Any] = {} + try: + params = signature(ORTModelForMaskedLM.from_pretrained).parameters + if "providers" in params: + model_kwargs["providers"] = self.providers + elif "provider" in params: + provider_name = "CPUExecutionProvider" + if self.providers: + first = self.providers[0] + provider_name = first[0] if isinstance(first, tuple) else str(first) + model_kwargs["provider"] = provider_name + except Exception: + model_kwargs = {} + + try: + self._model = ORTModelForMaskedLM.from_pretrained( + self.model_name, + **model_kwargs, + ) + logger.debug(f"SPLADE model loaded: {self.model_name}") + except TypeError: + # Fallback for older Optimum versions: retry without provider arguments + self._model = ORTModelForMaskedLM.from_pretrained(self.model_name) + logger.warning( + "Optimum version doesn't support provider parameters. " + "Upgrade optimum for GPU acceleration: pip install --upgrade optimum" + ) + + self._tokenizer = AutoTokenizer.from_pretrained(self.model_name, use_fast=True) + + # Cache vocabulary size + self._vocab_size = len(self._tokenizer) + logger.debug(f"SPLADE tokenizer loaded: vocab_size={self._vocab_size}") + + @staticmethod + def _splade_activation(logits: Any, attention_mask: Any) -> Any: + """Apply SPLADE activation function to model outputs. 
+ + Formula: log(1 + ReLU(logits)) * attention_mask + + Args: + logits: Model output logits (batch, seq_len, vocab_size) + attention_mask: Attention mask (batch, seq_len) + + Returns: + SPLADE representations (batch, seq_len, vocab_size) + """ + import numpy as np + + # ReLU activation + relu_logits = np.maximum(0, logits) + + # Log(1 + x) transformation + log_relu = np.log1p(relu_logits) + + # Apply attention mask (expand to match vocab dimension) + # attention_mask: (batch, seq_len) -> (batch, seq_len, 1) + mask_expanded = np.expand_dims(attention_mask, axis=-1) + + # Element-wise multiplication + splade_repr = log_relu * mask_expanded + + return splade_repr + + @staticmethod + def _max_pooling(splade_repr: Any) -> Any: + """Max pooling over sequence length dimension. + + Args: + splade_repr: SPLADE representations (batch, seq_len, vocab_size) + + Returns: + Pooled sparse vectors (batch, vocab_size) + """ + import numpy as np + + # Max pooling over sequence dimension (axis=1) + return np.max(splade_repr, axis=1) + + def _to_sparse_dict(self, dense_vec: Any) -> Dict[int, float]: + """Convert dense vector to sparse dictionary. + + Args: + dense_vec: Dense vector (vocab_size,) + + Returns: + Sparse dictionary {token_id: weight} with weights above threshold + """ + import numpy as np + + # Find non-zero indices above threshold + nonzero_indices = np.where(dense_vec > self.sparsity_threshold)[0] + + # Create sparse dictionary + sparse_dict = { + int(idx): float(dense_vec[idx]) + for idx in nonzero_indices + } + + return sparse_dict + + def encode_text(self, text: str) -> Dict[int, float]: + """Encode text to sparse vector {token_id: weight}. + + Args: + text: Input text to encode + + Returns: + Sparse vector as dictionary mapping token_id to weight + """ + self._load_model() + + if self._model is None or self._tokenizer is None: + raise RuntimeError("Model not loaded") + + import numpy as np + + # Tokenize input + encoded = self._tokenizer( + text, + padding=True, + truncation=True, + max_length=self.max_length, + return_tensors="np", + ) + + # Forward pass through model + outputs = self._model(**encoded) + + # Extract logits + if hasattr(outputs, "logits"): + logits = outputs.logits + elif isinstance(outputs, dict) and "logits" in outputs: + logits = outputs["logits"] + elif isinstance(outputs, (list, tuple)) and outputs: + logits = outputs[0] + else: + raise RuntimeError("Unexpected model output format") + + # Apply SPLADE activation + attention_mask = encoded["attention_mask"] + splade_repr = self._splade_activation(logits, attention_mask) + + # Max pooling over sequence length + splade_vec = self._max_pooling(splade_repr) + + # Convert to sparse dictionary (single item batch) + sparse_dict = self._to_sparse_dict(splade_vec[0]) + + return sparse_dict + + def encode_batch(self, texts: List[str], batch_size: int = 32) -> List[Dict[int, float]]: + """Batch encode texts to sparse vectors. 
+ + Args: + texts: List of input texts to encode + batch_size: Batch size for encoding (default: 32) + + Returns: + List of sparse vectors as dictionaries + """ + if not texts: + return [] + + self._load_model() + + if self._model is None or self._tokenizer is None: + raise RuntimeError("Model not loaded") + + import numpy as np + + results: List[Dict[int, float]] = [] + + # Process in batches + for i in range(0, len(texts), batch_size): + batch_texts = texts[i:i + batch_size] + + # Tokenize batch + encoded = self._tokenizer( + batch_texts, + padding=True, + truncation=True, + max_length=self.max_length, + return_tensors="np", + ) + + # Forward pass through model + outputs = self._model(**encoded) + + # Extract logits + if hasattr(outputs, "logits"): + logits = outputs.logits + elif isinstance(outputs, dict) and "logits" in outputs: + logits = outputs["logits"] + elif isinstance(outputs, (list, tuple)) and outputs: + logits = outputs[0] + else: + raise RuntimeError("Unexpected model output format") + + # Apply SPLADE activation + attention_mask = encoded["attention_mask"] + splade_repr = self._splade_activation(logits, attention_mask) + + # Max pooling over sequence length + splade_vecs = self._max_pooling(splade_repr) + + # Convert each vector to sparse dictionary + for vec in splade_vecs: + sparse_dict = self._to_sparse_dict(vec) + results.append(sparse_dict) + + return results + + @property + def vocab_size(self) -> int: + """Return vocabulary size (~30k for BERT-based models). + + Returns: + Vocabulary size (number of tokens in tokenizer) + """ + if self._vocab_size is not None: + return self._vocab_size + + self._load_model() + return self._vocab_size or 0 + + def get_token(self, token_id: int) -> str: + """Convert token_id to string (for debugging). + + Args: + token_id: Token ID to convert + + Returns: + Token string + """ + self._load_model() + + if self._tokenizer is None: + raise RuntimeError("Tokenizer not loaded") + + return self._tokenizer.decode([token_id]) + + def get_top_tokens(self, sparse_vec: Dict[int, float], top_k: int = 10) -> List[Tuple[str, float]]: + """Get top-k tokens with highest weights from sparse vector. + + Useful for debugging and understanding what the model is focusing on. + + Args: + sparse_vec: Sparse vector as {token_id: weight} + top_k: Number of top tokens to return + + Returns: + List of (token_string, weight) tuples, sorted by weight descending + """ + self._load_model() + + if not sparse_vec: + return [] + + # Sort by weight descending + sorted_items = sorted(sparse_vec.items(), key=lambda x: x[1], reverse=True) + + # Take top-k and convert token_ids to strings + top_items = sorted_items[:top_k] + + return [ + (self.get_token(token_id), weight) + for token_id, weight in top_items + ] diff --git a/codex-lens/src/codexlens/storage/migrations/migration_009_add_splade.py b/codex-lens/src/codexlens/storage/migrations/migration_009_add_splade.py new file mode 100644 index 00000000..c675233e --- /dev/null +++ b/codex-lens/src/codexlens/storage/migrations/migration_009_add_splade.py @@ -0,0 +1,103 @@ +""" +Migration 009: Add SPLADE sparse retrieval tables. 
+ +This migration introduces SPLADE (Sparse Lexical AnD Expansion) support: +- splade_metadata: Model configuration (model name, vocab size, ONNX path) +- splade_posting_list: Inverted index mapping token_id -> (chunk_id, weight) + +The SPLADE tables are designed for efficient sparse vector retrieval: +- Token-based lookup for query expansion +- Chunk-based deletion for index maintenance +- Maintains backward compatibility with existing FTS tables +""" + +import logging +from sqlite3 import Connection + +log = logging.getLogger(__name__) + + +def upgrade(db_conn: Connection) -> None: + """ + Adds SPLADE tables for sparse retrieval. + + Creates: + - splade_metadata: Stores model configuration and ONNX path + - splade_posting_list: Inverted index with token_id -> (chunk_id, weight) mappings + - Indexes for efficient token-based and chunk-based lookups + + Args: + db_conn: The SQLite database connection. + """ + cursor = db_conn.cursor() + + log.info("Creating splade_metadata table...") + cursor.execute( + """ + CREATE TABLE IF NOT EXISTS splade_metadata ( + id INTEGER PRIMARY KEY DEFAULT 1, + model_name TEXT NOT NULL, + vocab_size INTEGER NOT NULL, + onnx_path TEXT, + created_at REAL + ) + """ + ) + + log.info("Creating splade_posting_list table...") + cursor.execute( + """ + CREATE TABLE IF NOT EXISTS splade_posting_list ( + token_id INTEGER NOT NULL, + chunk_id INTEGER NOT NULL, + weight REAL NOT NULL, + PRIMARY KEY (token_id, chunk_id), + FOREIGN KEY (chunk_id) REFERENCES semantic_chunks(id) ON DELETE CASCADE + ) + """ + ) + + log.info("Creating indexes for splade_posting_list...") + # Index for efficient chunk-based lookups (deletion, updates) + cursor.execute( + """ + CREATE INDEX IF NOT EXISTS idx_splade_by_chunk + ON splade_posting_list(chunk_id) + """ + ) + + # Index for efficient term-based retrieval + cursor.execute( + """ + CREATE INDEX IF NOT EXISTS idx_splade_by_token + ON splade_posting_list(token_id) + """ + ) + + log.info("Migration 009 completed successfully") + + +def downgrade(db_conn: Connection) -> None: + """ + Removes SPLADE tables. + + Drops: + - splade_posting_list (and associated indexes) + - splade_metadata + + Args: + db_conn: The SQLite database connection. + """ + cursor = db_conn.cursor() + + log.info("Dropping SPLADE indexes...") + cursor.execute("DROP INDEX IF EXISTS idx_splade_by_chunk") + cursor.execute("DROP INDEX IF EXISTS idx_splade_by_token") + + log.info("Dropping splade_posting_list table...") + cursor.execute("DROP TABLE IF EXISTS splade_posting_list") + + log.info("Dropping splade_metadata table...") + cursor.execute("DROP TABLE IF EXISTS splade_metadata") + + log.info("Migration 009 downgrade completed successfully") diff --git a/codex-lens/src/codexlens/storage/splade_index.py b/codex-lens/src/codexlens/storage/splade_index.py new file mode 100644 index 00000000..65fa6b41 --- /dev/null +++ b/codex-lens/src/codexlens/storage/splade_index.py @@ -0,0 +1,432 @@ +"""SPLADE inverted index storage for sparse vector retrieval. + +This module implements SQLite-based inverted index for SPLADE sparse vectors, +enabling efficient sparse retrieval using dot-product scoring. 
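+
+Usage sketch (chunk ids and token weights below are illustrative; real postings
+come from the SPLADE encoder):
+
+    index = SpladeIndex(Path("_index.db"))
+    index.create_tables()
+    index.add_posting(chunk_id=1, sparse_vec={1012: 0.8, 2450: 0.3})
+    hits = index.search({1012: 0.9}, limit=10)  # [(chunk_id, score), ...]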
+""" + +from __future__ import annotations + +import logging +import sqlite3 +import threading +import time +from pathlib import Path +from typing import Dict, List, Optional, Tuple + +from codexlens.entities import SearchResult +from codexlens.errors import StorageError + +logger = logging.getLogger(__name__) + + +class SpladeIndex: + """SQLite-based inverted index for SPLADE sparse vectors. + + Stores sparse vectors as posting lists mapping token_id -> (chunk_id, weight). + Supports efficient dot-product retrieval using SQL joins. + """ + + def __init__(self, db_path: Path | str) -> None: + """Initialize SPLADE index. + + Args: + db_path: Path to SQLite database file. + """ + self.db_path = Path(db_path) + self.db_path.parent.mkdir(parents=True, exist_ok=True) + + # Thread-safe connection management + self._lock = threading.RLock() + self._local = threading.local() + + def _get_connection(self) -> sqlite3.Connection: + """Get or create a thread-local database connection.""" + conn = getattr(self._local, "conn", None) + if conn is None: + conn = sqlite3.connect(self.db_path, check_same_thread=False) + conn.row_factory = sqlite3.Row + conn.execute("PRAGMA journal_mode=WAL") + conn.execute("PRAGMA synchronous=NORMAL") + conn.execute("PRAGMA foreign_keys=ON") + conn.execute("PRAGMA mmap_size=30000000000") # 30GB limit + self._local.conn = conn + return conn + + def close(self) -> None: + """Close thread-local database connection.""" + with self._lock: + conn = getattr(self._local, "conn", None) + if conn is not None: + conn.close() + self._local.conn = None + + def __enter__(self) -> SpladeIndex: + """Context manager entry.""" + self.create_tables() + return self + + def __exit__(self, exc_type, exc, tb) -> None: + """Context manager exit.""" + self.close() + + def has_index(self) -> bool: + """Check if SPLADE tables exist in database. + + Returns: + True if tables exist, False otherwise. + """ + with self._lock: + conn = self._get_connection() + try: + cursor = conn.execute( + """ + SELECT name FROM sqlite_master + WHERE type='table' AND name='splade_posting_list' + """ + ) + return cursor.fetchone() is not None + except sqlite3.Error as e: + logger.error("Failed to check index existence: %s", e) + return False + + def create_tables(self) -> None: + """Create SPLADE schema if not exists. + + Note: The splade_posting_list table has a FOREIGN KEY constraint + referencing semantic_chunks(id). Ensure VectorStore.create_tables() + is called first to create the semantic_chunks table. 
+ """ + with self._lock: + conn = self._get_connection() + try: + # Inverted index for sparse vectors + conn.execute(""" + CREATE TABLE IF NOT EXISTS splade_posting_list ( + token_id INTEGER NOT NULL, + chunk_id INTEGER NOT NULL, + weight REAL NOT NULL, + PRIMARY KEY (token_id, chunk_id), + FOREIGN KEY (chunk_id) REFERENCES semantic_chunks(id) ON DELETE CASCADE + ) + """) + + # Indexes for efficient lookups + conn.execute(""" + CREATE INDEX IF NOT EXISTS idx_splade_by_chunk + ON splade_posting_list(chunk_id) + """) + conn.execute(""" + CREATE INDEX IF NOT EXISTS idx_splade_by_token + ON splade_posting_list(token_id) + """) + + # Model metadata + conn.execute(""" + CREATE TABLE IF NOT EXISTS splade_metadata ( + id INTEGER PRIMARY KEY DEFAULT 1, + model_name TEXT NOT NULL, + vocab_size INTEGER NOT NULL, + onnx_path TEXT, + created_at REAL + ) + """) + + conn.commit() + logger.debug("SPLADE schema created successfully") + except sqlite3.Error as e: + raise StorageError( + f"Failed to create SPLADE schema: {e}", + db_path=str(self.db_path), + operation="create_tables" + ) from e + + def add_posting(self, chunk_id: int, sparse_vec: Dict[int, float]) -> None: + """Add a single document to inverted index. + + Args: + chunk_id: Chunk ID (foreign key to semantic_chunks.id). + sparse_vec: Sparse vector as {token_id: weight} mapping. + """ + if not sparse_vec: + logger.warning("Empty sparse vector for chunk_id=%d, skipping", chunk_id) + return + + with self._lock: + conn = self._get_connection() + try: + # Insert all non-zero weights for this chunk + postings = [ + (token_id, chunk_id, weight) + for token_id, weight in sparse_vec.items() + if weight > 0 # Only store non-zero weights + ] + + if postings: + conn.executemany( + """ + INSERT OR REPLACE INTO splade_posting_list + (token_id, chunk_id, weight) + VALUES (?, ?, ?) + """, + postings + ) + conn.commit() + logger.debug( + "Added %d postings for chunk_id=%d", len(postings), chunk_id + ) + except sqlite3.Error as e: + raise StorageError( + f"Failed to add posting for chunk_id={chunk_id}: {e}", + db_path=str(self.db_path), + operation="add_posting" + ) from e + + def add_postings_batch( + self, postings: List[Tuple[int, Dict[int, float]]] + ) -> None: + """Batch insert postings for multiple chunks. + + Args: + postings: List of (chunk_id, sparse_vec) tuples. + """ + if not postings: + return + + with self._lock: + conn = self._get_connection() + try: + # Flatten all postings into single batch + batch_data = [] + for chunk_id, sparse_vec in postings: + for token_id, weight in sparse_vec.items(): + if weight > 0: # Only store non-zero weights + batch_data.append((token_id, chunk_id, weight)) + + if batch_data: + conn.executemany( + """ + INSERT OR REPLACE INTO splade_posting_list + (token_id, chunk_id, weight) + VALUES (?, ?, ?) + """, + batch_data + ) + conn.commit() + logger.debug( + "Batch inserted %d postings for %d chunks", + len(batch_data), + len(postings) + ) + except sqlite3.Error as e: + raise StorageError( + f"Failed to batch insert postings: {e}", + db_path=str(self.db_path), + operation="add_postings_batch" + ) from e + + def remove_chunk(self, chunk_id: int) -> int: + """Remove all postings for a chunk. + + Args: + chunk_id: Chunk ID to remove. + + Returns: + Number of deleted postings. 
+ """ + with self._lock: + conn = self._get_connection() + try: + cursor = conn.execute( + "DELETE FROM splade_posting_list WHERE chunk_id = ?", + (chunk_id,) + ) + conn.commit() + deleted = cursor.rowcount + logger.debug("Removed %d postings for chunk_id=%d", deleted, chunk_id) + return deleted + except sqlite3.Error as e: + raise StorageError( + f"Failed to remove chunk_id={chunk_id}: {e}", + db_path=str(self.db_path), + operation="remove_chunk" + ) from e + + def search( + self, + query_sparse: Dict[int, float], + limit: int = 50, + min_score: float = 0.0 + ) -> List[Tuple[int, float]]: + """Search for similar chunks using dot-product scoring. + + Implements efficient sparse dot-product via SQL JOIN: + score(q, d) = sum(q[t] * d[t]) for all tokens t + + Args: + query_sparse: Query sparse vector as {token_id: weight}. + limit: Maximum number of results. + min_score: Minimum score threshold. + + Returns: + List of (chunk_id, score) tuples, ordered by score descending. + """ + if not query_sparse: + logger.warning("Empty query sparse vector") + return [] + + with self._lock: + conn = self._get_connection() + try: + # Build VALUES clause for query terms + # Each term: (token_id, weight) + query_terms = [ + (token_id, weight) + for token_id, weight in query_sparse.items() + if weight > 0 + ] + + if not query_terms: + logger.warning("No non-zero query terms") + return [] + + # Create CTE for query terms using parameterized VALUES + # Build placeholders and params to prevent SQL injection + params = [] + placeholders = [] + for token_id, weight in query_terms: + placeholders.append("(?, ?)") + params.extend([token_id, weight]) + + values_placeholders = ", ".join(placeholders) + + sql = f""" + WITH query_terms(token_id, weight) AS ( + VALUES {values_placeholders} + ) + SELECT + p.chunk_id, + SUM(p.weight * q.weight) as score + FROM splade_posting_list p + INNER JOIN query_terms q ON p.token_id = q.token_id + GROUP BY p.chunk_id + HAVING score >= ? + ORDER BY score DESC + LIMIT ? + """ + + # Append min_score and limit to params + params.extend([min_score, limit]) + rows = conn.execute(sql, params).fetchall() + + results = [(row["chunk_id"], float(row["score"])) for row in rows] + logger.debug( + "SPLADE search: %d query terms, %d results", + len(query_terms), + len(results) + ) + return results + + except sqlite3.Error as e: + raise StorageError( + f"SPLADE search failed: {e}", + db_path=str(self.db_path), + operation="search" + ) from e + + def get_metadata(self) -> Optional[Dict]: + """Get SPLADE model metadata. + + Returns: + Dictionary with model_name, vocab_size, onnx_path, created_at, + or None if not set. + """ + with self._lock: + conn = self._get_connection() + try: + row = conn.execute( + """ + SELECT model_name, vocab_size, onnx_path, created_at + FROM splade_metadata + WHERE id = 1 + """ + ).fetchone() + + if row: + return { + "model_name": row["model_name"], + "vocab_size": row["vocab_size"], + "onnx_path": row["onnx_path"], + "created_at": row["created_at"] + } + return None + except sqlite3.Error as e: + logger.error("Failed to get metadata: %s", e) + return None + + def set_metadata( + self, + model_name: str, + vocab_size: int, + onnx_path: Optional[str] = None + ) -> None: + """Set SPLADE model metadata. + + Args: + model_name: SPLADE model name. + vocab_size: Vocabulary size (typically ~30k for BERT vocab). + onnx_path: Optional path to ONNX model file. 
+ """ + with self._lock: + conn = self._get_connection() + try: + current_time = time.time() + conn.execute( + """ + INSERT OR REPLACE INTO splade_metadata + (id, model_name, vocab_size, onnx_path, created_at) + VALUES (1, ?, ?, ?, ?) + """, + (model_name, vocab_size, onnx_path, current_time) + ) + conn.commit() + logger.info( + "Set SPLADE metadata: model=%s, vocab_size=%d", + model_name, + vocab_size + ) + except sqlite3.Error as e: + raise StorageError( + f"Failed to set metadata: {e}", + db_path=str(self.db_path), + operation="set_metadata" + ) from e + + def get_stats(self) -> Dict: + """Get index statistics. + + Returns: + Dictionary with total_postings, unique_tokens, unique_chunks. + """ + with self._lock: + conn = self._get_connection() + try: + row = conn.execute(""" + SELECT + COUNT(*) as total_postings, + COUNT(DISTINCT token_id) as unique_tokens, + COUNT(DISTINCT chunk_id) as unique_chunks + FROM splade_posting_list + """).fetchone() + + return { + "total_postings": row["total_postings"], + "unique_tokens": row["unique_tokens"], + "unique_chunks": row["unique_chunks"] + } + except sqlite3.Error as e: + logger.error("Failed to get stats: %s", e) + return { + "total_postings": 0, + "unique_tokens": 0, + "unique_chunks": 0 + } diff --git a/codex-lens/verify_watcher.py b/codex-lens/verify_watcher.py new file mode 100644 index 00000000..f64ff089 --- /dev/null +++ b/codex-lens/verify_watcher.py @@ -0,0 +1,117 @@ +#!/usr/bin/env python3 +"""Verification script for FileWatcher event filtering and debouncing.""" + +import time +from pathlib import Path +from codexlens.watcher.file_watcher import FileWatcher +from codexlens.watcher.events import WatcherConfig, FileEvent + +def test_should_index_file(): + """Test _should_index_file filtering logic.""" + print("Testing _should_index_file filtering...") + + # Create watcher instance + config = WatcherConfig() + watcher = FileWatcher( + root_path=Path("."), + config=config, + on_changes=lambda events: None, + ) + + # Test cases + test_cases = [ + # (path, expected_result, description) + (Path("test.py"), True, "Python file should be indexed"), + (Path("test.txt"), True, "Text file should be indexed"), + (Path("test.js"), True, "JavaScript file should be indexed"), + (Path("test.ts"), True, "TypeScript file should be indexed"), + (Path("src/test.py"), True, "Python file in subdirectory should be indexed"), + (Path(".git/config"), False, ".git files should be filtered"), + (Path("node_modules/pkg/index.js"), False, "node_modules should be filtered"), + (Path("__pycache__/test.pyc"), False, "__pycache__ should be filtered"), + (Path(".venv/lib/test.py"), False, ".venv should be filtered"), + (Path("test.unknown"), False, "Unknown extension should be filtered"), + (Path("README.md"), True, "Markdown file should be indexed"), + ] + + passed = 0 + failed = 0 + + for path, expected, description in test_cases: + result = watcher._should_index_file(path) + status = "✓" if result == expected else "✗" + + if result == expected: + passed += 1 + else: + failed += 1 + + print(f" {status} {description}") + print(f" Path: {path}, Expected: {expected}, Got: {result}") + + print(f"\nResults: {passed} passed, {failed} failed") + return failed == 0 + +def test_debounce_and_dedup(): + """Test event debouncing and deduplication.""" + print("\nTesting event debouncing and deduplication...") + + received_events = [] + + def on_changes(events): + received_events.append(events) + print(f" Received batch: {len(events)} events") + + # Create watcher with short 
debounce time for testing + config = WatcherConfig(debounce_ms=500) + watcher = FileWatcher( + root_path=Path("."), + config=config, + on_changes=on_changes, + ) + + # Simulate rapid events to same file (should be deduplicated) + from codexlens.watcher.events import ChangeType + + test_path = Path("test_file.py") + for i in range(5): + event = FileEvent( + path=test_path, + change_type=ChangeType.MODIFIED, + timestamp=time.time(), + ) + watcher._on_raw_event(event) + + # Wait for debounce + time.sleep(0.6) + + # Force flush to ensure we get the events + watcher._flush_events() + + if received_events: + batch = received_events[0] + # Should deduplicate 5 events to 1 + if len(batch) == 1: + print(" ✓ Deduplication working: 5 events reduced to 1") + return True + else: + print(f" ✗ Deduplication failed: expected 1 event, got {len(batch)}") + return False + else: + print(" ✗ No events received") + return False + +if __name__ == "__main__": + print("=" * 60) + print("FileWatcher Verification") + print("=" * 60) + + test1 = test_should_index_file() + test2 = test_debounce_and_dedup() + + print("\n" + "=" * 60) + if test1 and test2: + print("✓ All tests passed!") + else: + print("✗ Some tests failed") + print("=" * 60)
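
For readers tracing the `search()` path in the SPLADE storage diff above, the following is a minimal, pure-Python sketch of the scoring that the parameterized VALUES CTE and SQL JOIN compute: score(q, d) = sum of q[t] * d[t] over token ids shared by the query and the document, filtered by `min_score` and returned in descending score order. Only that scoring rule and the `limit`/`min_score` semantics come from the diff; the function name `splade_score` and the token ids/weights below are invented for illustration, and this is not how the indexed implementation runs (that happens inside SQLite).

```python
# Illustrative sketch only: a pure-Python equivalent of the dot-product scoring
# performed by the SQL JOIN in search(). Names and sample weights are made up.
from typing import Dict, List, Tuple


def splade_score(
    query: Dict[int, float],
    postings: List[Tuple[int, Dict[int, float]]],
    limit: int = 50,
    min_score: float = 0.0,
) -> List[Tuple[int, float]]:
    """score(q, d) = sum(q[t] * d[t]) over token ids shared by query and doc."""
    results: List[Tuple[int, float]] = []
    for chunk_id, doc in postings:
        shared = [w * doc[t] for t, w in query.items() if t in doc]
        score = sum(shared)
        # Mirror the SQL: a chunk only appears if it shares at least one token
        # with the query, and only if its score clears the min_score threshold.
        if shared and score >= min_score:
            results.append((chunk_id, score))
    results.sort(key=lambda r: r[1], reverse=True)
    return results[:limit]


# Two "chunks" as sparse {token_id: weight} vectors (hypothetical values).
docs = [
    (1, {101: 1.25, 2054: 0.75}),  # chunk 1
    (2, {101: 0.25, 7592: 1.5}),   # chunk 2
]
query = {101: 1.0, 7592: 0.5}
print(splade_score(query, docs))   # [(1, 1.25), (2, 1.0)]
```

Chunk 1 matches only token 101 (1.0 * 1.25 = 1.25), while chunk 2 matches tokens 101 and 7592 (1.0 * 0.25 + 0.5 * 1.5 = 1.0), so the results come back ordered 1 then 2, matching the `ORDER BY score DESC` behaviour of the SQL version.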