From 6d3f10d1d7b60d26ae244bde1f3f5b3e9b5b96fb Mon Sep 17 00:00:00 2001
From: catlog22
Date: Sun, 21 Dec 2025 21:45:04 +0800
Subject: [PATCH] feat: add line-based pagination to file reading; optimize
 multi-word query matching in smart search
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../dashboard-js/views/mcp-manager.js |   8 +-
 ccw/src/tools/read-file.ts            | 106 ++++++++++--
 ccw/src/tools/smart-search.ts         | 152 ++++++++++++++++--
 3 files changed, 237 insertions(+), 29 deletions(-)

diff --git a/ccw/src/templates/dashboard-js/views/mcp-manager.js b/ccw/src/templates/dashboard-js/views/mcp-manager.js
index 55248cc4..fb0a5c70 100644
--- a/ccw/src/templates/dashboard-js/views/mcp-manager.js
+++ b/ccw/src/templates/dashboard-js/views/mcp-manager.js
@@ -256,7 +256,7 @@ async function renderMcpManager() {
[hunk bodies lost in extraction: the one-line HTML template edits in renderMcpManager() (8 changed lines per the diffstat) and the diff/index header for ccw/src/tools/read-file.ts could not be recovered; the read-file.ts changes begin at the @@ -40,6 +42,8 @@ hunk below]
- + ; @@ -40,6 +42,8 @@ interface FileEntry { content?: string; truncated?: boolean; matches?: string[]; + totalLines?: number; + lineRange?: { start: number; end: number }; } interface ReadResult { @@ -123,23 +127,69 @@ function collectFiles( return files; } +interface ReadContentOptions { + maxLength: number; + offset?: number; + limit?: number; +} + +interface ReadContentResult { + content: string; + truncated: boolean; + totalLines?: number; + lineRange?: { start: number; end: number }; +} + /** - * Read file content with truncation + * Read file content with truncation and optional line-based pagination */ -function readFileContent(filePath: string, maxLength: number): { content: string; truncated: boolean } { +function readFileContent(filePath: string, options: ReadContentOptions): ReadContentResult { + const { maxLength, offset, limit } = options; + if (isBinaryFile(filePath)) { return { content: '[Binary file]', truncated: false }; } try { const content = readFileSync(filePath, 'utf8'); + const lines = content.split('\n'); + const totalLines = lines.length; + + // If offset/limit specified, use line-based pagination + if (offset !== undefined || limit !== undefined) { + const startLine = Math.min(offset ?? 0, totalLines); + const endLine = limit !== undefined ? Math.min(startLine + limit, totalLines) : totalLines; + const selectedLines = lines.slice(startLine, endLine); + const selectedContent = selectedLines.join('\n'); + + const actualEnd = endLine; + const hasMore = actualEnd < totalLines; + + let finalContent = selectedContent; + if (selectedContent.length > maxLength) { + finalContent = selectedContent.substring(0, maxLength) + `\n... (+${selectedContent.length - maxLength} chars)`; + } + + // Calculate actual line range (handle empty selection) + const actualLineEnd = selectedLines.length > 0 ? startLine + selectedLines.length - 1 : startLine; + + return { + content: finalContent, + truncated: hasMore || selectedContent.length > maxLength, + totalLines, + lineRange: { start: startLine, end: actualLineEnd }, + }; + } + + // Default behavior: truncate by character length if (content.length > maxLength) { return { content: content.substring(0, maxLength) + `\n... (+${content.length - maxLength} chars)`, - truncated: true + truncated: true, + totalLines, }; } - return { content, truncated: false }; + return { content, truncated: false, totalLines }; } catch (error) { return { content: `[Error: ${(error as Error).message}]`, truncated: false }; } @@ -171,15 +221,17 @@ function findMatches(content: string, pattern: string): string[] { // Tool schema for MCP export const schema: ToolSchema = { name: 'read_file', - description: `Read files with multi-file, directory, and regex support. + description: `Read files with multi-file, directory, regex support, and line-based pagination. Usage: - read_file(paths="file.ts") # Single file - read_file(paths=["a.ts", "b.ts"]) # Multiple files - read_file(paths="src/", pattern="*.ts") # Directory with pattern - read_file(paths="src/", contentPattern="TODO") # Search content + read_file(paths="file.ts") # Single file (full content) + read_file(paths="file.ts", offset=100, limit=50) # Lines 100-149 (0-based) + read_file(paths=["a.ts", "b.ts"]) # Multiple files + read_file(paths="src/", pattern="*.ts") # Directory with pattern + read_file(paths="src/", contentPattern="TODO") # Search content -Returns compact file list with optional content.`, +Supports both absolute and relative paths. Relative paths are resolved from project root. 
+Returns compact file list with optional content. Use offset/limit for large file pagination.`, inputSchema: { type: 'object', properties: { @@ -213,6 +265,16 @@ Returns compact file list with optional content.`, description: `Max number of files to return (default: ${MAX_FILES})`, default: MAX_FILES, }, + offset: { + type: 'number', + description: 'Line offset to start reading from (0-based, for single file only)', + minimum: 0, + }, + limit: { + type: 'number', + description: 'Number of lines to read (for single file only)', + minimum: 1, + }, }, required: ['paths'], }, @@ -232,6 +294,8 @@ export async function handler(params: Record): Promise): Promise= MAX_TOTAL_CONTENT) break; @@ -283,7 +351,15 @@ export async function handler(params: Record): Promise): Promise): Promise): Promise maxFiles) { message += ` (showing ${maxFiles} of ${totalFiles})`; } + if (useLinePagination && files.length > 0 && files[0].lineRange) { + const { start, end } = files[0].lineRange; + message += ` [lines ${start}-${end} of ${files[0].totalLines}]`; + } if (contentPattern) { message += ` matching "${contentPattern}"`; } diff --git a/ccw/src/tools/smart-search.ts b/ccw/src/tools/smart-search.ts index e18c8545..f609de77 100644 --- a/ccw/src/tools/smart-search.ts +++ b/ccw/src/tools/smart-search.ts @@ -45,6 +45,7 @@ const ParamsSchema = z.object({ // Search modifiers for ripgrep mode regex: z.boolean().default(true), // Use regex pattern matching (default: enabled) caseSensitive: z.boolean().default(true), // Case sensitivity (default: case-sensitive) + tokenize: z.boolean().default(true), // Tokenize multi-word queries for OR matching (default: enabled) // Fuzzy matching is implicit in hybrid mode (RRF fusion) }); @@ -96,6 +97,87 @@ function buildExcludeArgs(): string[] { return args; } +/** + * Tokenize query for multi-word OR matching + * Splits on whitespace and common delimiters, filters stop words and short tokens + * @param query - The search query + * @returns Array of tokens + */ +function tokenizeQuery(query: string): string[] { + // Stop words for filtering (common English + programming keywords) + const stopWords = new Set([ + 'the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been', 'being', + 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', + 'should', 'may', 'might', 'must', 'can', 'to', 'of', 'in', 'for', 'on', + 'with', 'at', 'by', 'from', 'as', 'into', 'through', 'and', 'but', 'if', + 'or', 'not', 'this', 'that', 'these', 'those', 'it', 'its', 'how', 'what', + 'where', 'when', 'why', 'which', 'who', 'whom', + ]); + + // Split on whitespace and common delimiters, keep meaningful tokens + const tokens = query + .split(/[\s,;:]+/) + .map(token => token.trim()) + .filter(token => { + // Keep tokens that are: + // - At least 2 characters long + // - Not a stop word (case-insensitive) + // - Or look like identifiers (contain underscore/camelCase) + if (token.length < 2) return false; + if (stopWords.has(token.toLowerCase()) && !token.includes('_') && !/[A-Z]/.test(token)) { + return false; + } + return true; + }); + + return tokens; +} + +/** + * Score results based on token match count for ranking + * @param results - Search results + * @param tokens - Query tokens + * @returns Results with match scores + */ +function scoreByTokenMatch(results: ExactMatch[], tokens: string[]): ExactMatch[] { + if (tokens.length <= 1) return results; + + // Create case-insensitive patterns for each token + const tokenPatterns = tokens.map(t => { + const escaped = t.replace(/[.*+?^${}()|[\]\\]/g, 
'\\$&'); + return new RegExp(escaped, 'i'); + }); + + return results.map(r => { + const content = r.content || ''; + const file = r.file || ''; + const searchText = `${file} ${content}`; + + // Count how many tokens match + let matchCount = 0; + for (const pattern of tokenPatterns) { + if (pattern.test(searchText)) { + matchCount++; + } + } + + // Calculate match ratio (0 to 1) + const matchRatio = matchCount / tokens.length; + + return { + ...r, + matchScore: matchRatio, + matchCount, + }; + }).sort((a, b) => { + // Sort by match ratio (descending), then by line number + if (b.matchScore !== a.matchScore) { + return b.matchScore - a.matchScore; + } + return (a.line || 0) - (b.line || 0); + }); +} + interface Classification { mode: string; confidence: number; @@ -107,6 +189,8 @@ interface ExactMatch { line: number; column: number; content: string; + matchScore?: number; // Token match ratio (0-1) for multi-word queries + matchCount?: number; // Number of tokens matched } interface RelationshipInfo { @@ -162,6 +246,9 @@ interface SearchMetadata { index_status?: 'indexed' | 'not_indexed' | 'partial'; fallback_history?: string[]; suggested_weights?: Record; + // Tokenization metadata (ripgrep mode) + tokens?: string[]; // Query tokens used for multi-word search + tokenized?: boolean; // Whether tokenization was applied // Pagination metadata pagination?: PaginationInfo; // Init action specific @@ -373,8 +460,9 @@ function checkToolAvailability(toolName: string): boolean { /** * Build ripgrep command arguments + * Supports tokenized multi-word queries with OR matching * @param params - Search parameters - * @returns Command and arguments + * @returns Command, arguments, and tokens used */ function buildRipgrepCommand(params: { query: string; @@ -384,8 +472,9 @@ function buildRipgrepCommand(params: { includeHidden: boolean; regex?: boolean; caseSensitive?: boolean; -}): { command: string; args: string[] } { - const { query, paths = ['.'], contextLines = 0, maxResults = 10, includeHidden = false, regex = false, caseSensitive = true } = params; + tokenize?: boolean; +}): { command: string; args: string[]; tokens: string[] } { + const { query, paths = ['.'], contextLines = 0, maxResults = 10, includeHidden = false, regex = false, caseSensitive = true, tokenize = true } = params; const args = [ '-n', @@ -415,16 +504,33 @@ function buildRipgrepCommand(params: { args.push('--hidden'); } - // Regex mode (-e) vs fixed string mode (-F) - if (regex) { - args.push('-e', query); + // Tokenize query for multi-word OR matching + const tokens = tokenize ? 
tokenizeQuery(query) : [query]; + + if (tokens.length > 1) { + // Multi-token: use multiple -e patterns (OR matching) + // Each token is escaped for regex safety unless regex mode is enabled + for (const token of tokens) { + if (regex) { + args.push('-e', token); + } else { + // Escape regex special chars for literal matching + const escaped = token.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); + args.push('-e', escaped); + } + } } else { - args.push('-F', query); + // Single token or no tokenization: use original behavior + if (regex) { + args.push('-e', query); + } else { + args.push('-F', query); + } } args.push(...paths); - return { command: 'rg', args }; + return { command: 'rg', args, tokens }; } /** @@ -578,9 +684,10 @@ async function executeAutoMode(params: Params): Promise { /** * Mode: ripgrep - Fast literal string matching using ripgrep * No index required, fallback to CodexLens if ripgrep unavailable + * Supports tokenized multi-word queries with OR matching and result ranking */ async function executeRipgrepMode(params: Params): Promise { - const { query, paths = [], contextLines = 0, maxResults = 10, includeHidden = false, path = '.', regex = true, caseSensitive = true } = params; + const { query, paths = [], contextLines = 0, maxResults = 10, includeHidden = false, path = '.', regex = true, caseSensitive = true, tokenize = true } = params; if (!query) { return { @@ -648,7 +755,7 @@ async function executeRipgrepMode(params: Params): Promise { } // Use ripgrep - const { command, args } = buildRipgrepCommand({ + const { command, args, tokens } = buildRipgrepCommand({ query, paths: paths.length > 0 ? paths : [path], contextLines, @@ -656,6 +763,7 @@ async function executeRipgrepMode(params: Params): Promise { includeHidden, regex, caseSensitive, + tokenize, }); return new Promise((resolve) => { @@ -704,15 +812,21 @@ async function executeRipgrepMode(params: Params): Promise { // If we have results despite the error, return them as partial success const isWindowsDeviceError = stderr.includes('os error 1') || stderr.includes('函数不正确'); - if (code === 0 || code === 1 || (isWindowsDeviceError && results.length > 0)) { + // Apply token-based scoring and sorting for multi-word queries + // Results matching more tokens are ranked higher (exact matches first) + const scoredResults = tokens.length > 1 ? scoreByTokenMatch(results, tokens) : results; + + if (code === 0 || code === 1 || (isWindowsDeviceError && scoredResults.length > 0)) { resolve({ success: true, - results, + results: scoredResults, metadata: { mode: 'ripgrep', backend: 'ripgrep', - count: results.length, + count: scoredResults.length, query, + tokens: tokens.length > 1 ? 
tokens : undefined, // Include tokens in metadata for debugging + tokenized: tokens.length > 1, ...(isWindowsDeviceError && { warning: 'Some Windows device files were skipped' }), }, }); @@ -1310,12 +1424,17 @@ export const schema: ToolSchema = { smart_search(query="auth", limit=10, offset=0) # first page smart_search(query="auth", limit=10, offset=10) # second page +**Multi-Word Search (ripgrep mode with tokenization):** + smart_search(query="CCW_PROJECT_ROOT CCW_ALLOWED_DIRS", mode="ripgrep") # tokenized OR matching + smart_search(query="auth login user", mode="ripgrep") # matches any token, ranks by match count + smart_search(query="exact phrase", mode="ripgrep", tokenize=false) # disable tokenization + **Regex Search (ripgrep mode):** smart_search(query="class.*Builder") # auto-detects regex pattern smart_search(query="def.*\\(.*\\):") # find function definitions smart_search(query="import.*from", caseSensitive=false) # case-insensitive -**Modes:** auto (intelligent routing), hybrid (semantic+fuzzy), exact (FTS), ripgrep (fast), priority (fallback chain)`, +**Modes:** auto (intelligent routing), hybrid (semantic+fuzzy), exact (FTS), ripgrep (fast with tokenization), priority (fallback chain)`, inputSchema: { type: 'object', properties: { @@ -1402,6 +1521,11 @@ export const schema: ToolSchema = { description: 'Case-sensitive search (default: true). Set to false for case-insensitive matching.', default: true, }, + tokenize: { + type: 'boolean', + description: 'Tokenize multi-word queries for OR matching (ripgrep mode). Default: true. Results are ranked by token match count (exact matches first).', + default: true, + }, }, required: [], },
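
Reviewer note (not part of the patch): a minimal standalone sketch of the two behaviours introduced above, for anyone who wants to poke at the semantics without wiring up the MCP tools. The demo* helpers and the abbreviated stop-word list are illustrative stand-ins, not the exported API; they mirror tokenizeQuery, buildRipgrepCommand's pattern handling, and the read_file offset/limit slice as defined in this patch.

// Illustrative TypeScript sketch only; names prefixed with demo* are assumptions.

const DEMO_STOP_WORDS = new Set(['the', 'a', 'an', 'is', 'how', 'to', 'of', 'in', 'for', 'and', 'or']);

// Mirrors tokenizeQuery: split on whitespace/delimiters, drop short tokens and
// plain stop words, but keep identifier-looking tokens (underscore or uppercase).
function demoTokenize(query: string): string[] {
  return query
    .split(/[\s,;:]+/)
    .map((t) => t.trim())
    .filter((t) =>
      t.length >= 2 &&
      (!DEMO_STOP_WORDS.has(t.toLowerCase()) || t.includes('_') || /[A-Z]/.test(t))
    );
}

// Mirrors buildRipgrepCommand's pattern handling: a multi-token query becomes one
// escaped -e pattern per token (OR matching); a single token keeps the -F / -e behaviour.
function demoRipgrepPatternArgs(query: string, regex = false): string[] {
  const tokens = demoTokenize(query);
  if (tokens.length > 1) {
    return tokens.flatMap((t) => ['-e', regex ? t : t.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')]);
  }
  return regex ? ['-e', query] : ['-F', query];
}

// Mirrors the read_file offset/limit slice: 0-based offset, `limit` lines,
// hasMore signals that the caller should request the next page.
function demoPaginate(lines: string[], offset = 0, limit?: number): { page: string[]; hasMore: boolean } {
  const start = Math.min(offset, lines.length);
  const end = limit !== undefined ? Math.min(start + limit, lines.length) : lines.length;
  return { page: lines.slice(start, end), hasMore: end < lines.length };
}

// demoRipgrepPatternArgs('how to read CCW_PROJECT_ROOT')
//   -> ['-e', 'read', '-e', 'CCW_PROJECT_ROOT']
// demoPaginate(['l0', 'l1', 'l2', 'l3'], 1, 2)
//   -> { page: ['l1', 'l2'], hasMore: true }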