# Docstring + LLM Hybrid Strategy Design

## 1. Background and Goals

### 1.1 Current Problems

The existing `llm_enhancer.py` implementation has the following issues:

1. Ignores existing documentation: the LLM is called indiscriminately for all code, even when a high-quality docstring already exists
2. Wasted cost: regenerating information that already exists increases API spend and processing time
3. Inconsistent quality: LLM-generated content may be less accurate than the author-written docstring
4. Lost author intent: design decisions, usage examples, and other key information in the docstring are discarded

### 1.2 Design Goals

Implement a smart hybrid strategy that combines the strengths of docstrings and LLMs:

1. Prefer the docstring: treat it as the most authoritative source of information
2. Use the LLM as a supplement: fill in the parts where the docstring is missing or of insufficient quality
3. Smart quality assessment: automatically judge docstring quality to decide whether LLM enhancement is needed
4. Cost optimization: reduce unnecessary LLM calls and lower API spend
5. Information fusion: merge docstring content and LLM-generated content coherently

## 2. Technical Architecture

### 2.1 Overall Flow

```
Code Symbol
    ↓
[Docstring Extractor] ← extract the docstring
    ↓
[Quality Evaluator]   ← assess docstring quality
    ↓
    ├─ High Quality → Use Docstring Directly
    │                 + LLM Generate Keywords Only
    │
    ├─ Medium Quality → LLM Refine & Enhance
    │                   (docstring as the base)
    │
    └─ Low/No Docstring → LLM Full Generation
                          (existing flow)
    ↓
[Metadata Merger]     ← merge docstring and LLM content
    ↓
Final SemanticMetadata
```

### 2.2 Core Components

```python
from dataclasses import dataclass
from enum import Enum
from typing import Optional

class DocstringQuality(Enum):
    """Docstring quality levels."""
    MISSING = "missing"           # no docstring
    LOW = "low"                   # low quality: <10 chars or placeholder-only
    MEDIUM = "medium"             # medium quality: basic description but incomplete
    HIGH = "high"                 # high quality: detailed and structured

@dataclass
class DocstringMetadata:
    """Metadata extracted from a docstring."""
    raw_text: str
    quality: DocstringQuality
    summary: Optional[str] = None       # extracted summary
    parameters: Optional[dict] = None   # parameter descriptions
    returns: Optional[str] = None       # return-value description
    examples: Optional[str] = None      # usage examples
    notes: Optional[str] = None         # notes and caveats
```

## 3. Detailed Implementation

### 3.1 Docstring Extraction and Parsing

```python
import re
from typing import List, Optional

class DocstringExtractor:
    """Docstring extractor."""

    # Docstring style regexes
    GOOGLE_STYLE_PATTERN = re.compile(
        r'Args:|Returns:|Raises:|Examples:|Note:',
        re.MULTILINE
    )

    NUMPY_STYLE_PATTERN = re.compile(
        r'Parameters\n-+|Returns\n-+|Examples\n-+',
        re.MULTILINE
    )

    def extract_from_code(self, content: str, symbol: Symbol) -> Optional[str]:
        """Extract the docstring from source code."""

        lines = content.splitlines()
        start_line = symbol.range[0] - 1  # 0-indexed

        # Look for the first string literal after the function definition,
        # usually on the next line or within a few lines
        for i in range(start_line + 1, min(start_line + 10, len(lines))):
            line = lines[i].strip()

            # Python triple-quoted string
            if line.startswith('"""') or line.startswith("'''"):
                return self._extract_multiline_docstring(lines, i)

        return None

    def _extract_multiline_docstring(
        self,
        lines: List[str],
        start_idx: int
    ) -> str:
        """Extract a multi-line docstring."""

        quote_char = '"""' if lines[start_idx].strip().startswith('"""') else "'''"
        docstring_lines = []

        # Single-line docstring?
        first_line = lines[start_idx].strip()
        if first_line.count(quote_char) == 2:
            # e.g. """This is a docstring."""
            return first_line.strip(quote_char).strip()

        # Multi-line docstring
        for i in range(start_idx, len(lines)):
            line = lines[i]

            if i == start_idx:
                # First line: remove the opening quotes
                docstring_lines.append(line.strip().lstrip(quote_char))
            elif quote_char in line:
                # Closing line: remove the closing quotes
                docstring_lines.append(line.strip().rstrip(quote_char))
                break
            else:
                docstring_lines.append(line.strip())

        return '\n'.join(docstring_lines).strip()

    def parse_docstring(self, raw_docstring: str) -> DocstringMetadata:
        """Parse a docstring into structured information."""

        if not raw_docstring:
            return DocstringMetadata(
                raw_text="",
                quality=DocstringQuality.MISSING
            )

        # Assess quality
        quality = self._evaluate_quality(raw_docstring)

        # Extract the individual sections
        metadata = DocstringMetadata(
            raw_text=raw_docstring,
            quality=quality,
        )

        # Extract the summary (first line or first paragraph)
        metadata.summary = self._extract_summary(raw_docstring)

        # For Google or NumPy style, extract structured content
        if self.GOOGLE_STYLE_PATTERN.search(raw_docstring):
            self._parse_google_style(raw_docstring, metadata)
        elif self.NUMPY_STYLE_PATTERN.search(raw_docstring):
            self._parse_numpy_style(raw_docstring, metadata)

        return metadata

    def _evaluate_quality(self, docstring: str) -> DocstringQuality:
        """Assess docstring quality."""

        if not docstring or len(docstring.strip()) == 0:
            return DocstringQuality.MISSING

        # Placeholder check
        placeholders = ['todo', 'fixme', 'tbd', 'placeholder', '...']
        if any(p in docstring.lower() for p in placeholders):
            return DocstringQuality.LOW

        # Length check
        if len(docstring.strip()) < 10:
            return DocstringQuality.LOW

        # Structured-content check
        has_structure = (
            self.GOOGLE_STYLE_PATTERN.search(docstring) or
            self.NUMPY_STYLE_PATTERN.search(docstring)
        )

        # Is there enough descriptive text?
        word_count = len(docstring.split())

        if has_structure and word_count >= 20:
            return DocstringQuality.HIGH
        elif word_count >= 10:
            return DocstringQuality.MEDIUM
        else:
            return DocstringQuality.LOW

    def _extract_summary(self, docstring: str) -> str:
        """Extract the summary (first non-empty line)."""

        lines = docstring.split('\n')
        for line in lines:
            if line.strip():
                return line.strip()

        return ""

    def _parse_google_style(self, docstring: str, metadata: DocstringMetadata):
        """Parse a Google-style docstring."""

        # Args section
        args_match = re.search(r'Args:(.*?)(?=Returns:|Raises:|Examples:|Note:|\Z)', docstring, re.DOTALL)
        if args_match:
            metadata.parameters = self._parse_args_section(args_match.group(1))

        # Returns section
        returns_match = re.search(r'Returns:(.*?)(?=Raises:|Examples:|Note:|\Z)', docstring, re.DOTALL)
        if returns_match:
            metadata.returns = returns_match.group(1).strip()

        # Examples section
        examples_match = re.search(r'Examples:(.*?)(?=Note:|\Z)', docstring, re.DOTALL)
        if examples_match:
            metadata.examples = examples_match.group(1).strip()

    def _parse_args_section(self, args_text: str) -> dict:
        """Parse the parameter list."""

        params = {}
        # Matches "param_name (type): description" or "param_name: description"
        pattern = re.compile(r'(\w+)\s*(?:\(([^)]+)\))?\s*:\s*(.+)')

        for line in args_text.split('\n'):
            match = pattern.search(line.strip())
            if match:
                param_name, param_type, description = match.groups()
                params[param_name] = {
                    'type': param_type,
                    'description': description.strip()
                }

        return params
```
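
The `Args:` parsing regex can be exercised in isolation. A minimal, self-contained sketch (the sample `args_text` is illustrative, not from the design):

```python
import re

# Same pattern as _parse_args_section above:
# matches "name (type): description" or "name: description"
pattern = re.compile(r'(\w+)\s*(?:\(([^)]+)\))?\s*:\s*(.+)')

args_text = """
    items (list): List of item objects with a price attribute.
    discount (float): Discount percentage (0-1). Defaults to 0.
"""

params = {}
for line in args_text.split('\n'):
    match = pattern.search(line.strip())
    if match:
        name, ptype, desc = match.groups()
        params[name] = {'type': ptype, 'description': desc.strip()}

print(params['items']['type'])     # list
print(params['discount']['type'])  # float
```

Note that the optional `(type)` group comes back as `None` for untyped entries, so downstream code should not assume every parameter has a type.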

### 3.2 Smart Hybrid Strategy Engine

````python
import json
from typing import Dict, List

class HybridEnhancer:
    """Hybrid docstring/LLM enhancer."""

    def __init__(
        self,
        llm_enhancer: LLMEnhancer,
        docstring_extractor: DocstringExtractor
    ):
        self.llm_enhancer = llm_enhancer
        self.docstring_extractor = docstring_extractor

    def enhance_with_strategy(
        self,
        file_data: FileData,
        symbols: List[Symbol]
    ) -> Dict[str, SemanticMetadata]:
        """Pick an enhancement strategy based on docstring quality."""

        results = {}

        for symbol in symbols:
            # 1. Extract and parse the docstring
            raw_docstring = self.docstring_extractor.extract_from_code(
                file_data.content, symbol
            )
            doc_metadata = self.docstring_extractor.parse_docstring(raw_docstring or "")

            # 2. Choose a strategy based on quality
            semantic_metadata = self._apply_strategy(
                file_data, symbol, doc_metadata
            )

            results[symbol.name] = semantic_metadata

        return results

    def _apply_strategy(
        self,
        file_data: FileData,
        symbol: Symbol,
        doc_metadata: DocstringMetadata
    ) -> SemanticMetadata:
        """Apply the hybrid strategy."""

        quality = doc_metadata.quality

        if quality == DocstringQuality.HIGH:
            # High quality: use the docstring directly; only generate keywords with the LLM
            return self._use_docstring_with_llm_keywords(symbol, doc_metadata)

        elif quality == DocstringQuality.MEDIUM:
            # Medium quality: let the LLM refine and enhance it
            return self._refine_with_llm(file_data, symbol, doc_metadata)

        else:  # LOW or MISSING
            # Low quality or missing: generate entirely with the LLM
            return self._full_llm_generation(file_data, symbol)

    def _use_docstring_with_llm_keywords(
        self,
        symbol: Symbol,
        doc_metadata: DocstringMetadata
    ) -> SemanticMetadata:
        """Strategy 1: use the docstring; the LLM only generates keywords."""

        # Use the docstring's summary directly
        summary = doc_metadata.summary or doc_metadata.raw_text[:200]

        # Generate keywords with the LLM
        keywords = self._generate_keywords_only(summary, symbol.name)

        # Infer the purpose from the docstring
        purpose = self._infer_purpose_from_docstring(doc_metadata)

        return SemanticMetadata(
            summary=summary,
            keywords=keywords,
            purpose=purpose,
            file_path=symbol.file_path if hasattr(symbol, 'file_path') else None,
            symbol_name=symbol.name,
            llm_tool="hybrid_docstring_primary",
        )

    def _refine_with_llm(
        self,
        file_data: FileData,
        symbol: Symbol,
        doc_metadata: DocstringMetadata
    ) -> SemanticMetadata:
        """Strategy 2: let the LLM refine and enhance the docstring."""

        prompt = f"""
PURPOSE: Refine and enhance an existing docstring for better semantic search
TASK:
- Review the existing docstring
- Generate a concise summary (1-2 sentences) that captures the core purpose
- Extract 8-12 relevant keywords for search
- Identify the functional category/purpose

EXISTING DOCSTRING:
{doc_metadata.raw_text}

CODE CONTEXT:
Function: {symbol.name}
```{file_data.language}
{self._get_symbol_code(file_data.content, symbol)}
```

OUTPUT: JSON format {{"summary": "refined summary based on docstring and code", "keywords": ["keyword1", "keyword2", ...], "purpose": "category"}}
"""

        response = self.llm_enhancer._invoke_ccw_cli(prompt, tool='gemini')
        if response['success']:
            data = json.loads(self.llm_enhancer._extract_json(response['stdout']))
            return SemanticMetadata(
                summary=data.get('summary', doc_metadata.summary),
                keywords=data.get('keywords', []),
                purpose=data.get('purpose', 'unknown'),
                file_path=file_data.path,
                symbol_name=symbol.name,
                llm_tool="hybrid_llm_refined",
            )

        # Fallback: use the docstring
        return self._use_docstring_with_llm_keywords(symbol, doc_metadata)

    def _full_llm_generation(
        self,
        file_data: FileData,
        symbol: Symbol
    ) -> SemanticMetadata:
        """Strategy 3: full LLM generation (the existing flow)."""

        # Reuse the existing LLM enhancer
        code_snippet = self._get_symbol_code(file_data.content, symbol)

        results = self.llm_enhancer.enhance_files([
            FileData(
                path=f"{file_data.path}:{symbol.name}",
                content=code_snippet,
                language=file_data.language
            )
        ])

        return results.get(f"{file_data.path}:{symbol.name}", SemanticMetadata(
            summary="",
            keywords=[],
            purpose="unknown",
            file_path=file_data.path,
            symbol_name=symbol.name,
            llm_tool="hybrid_llm_full",
        ))

    def _generate_keywords_only(self, summary: str, symbol_name: str) -> List[str]:
        """Generate keywords only (a cheap, fast LLM call)."""

        prompt = f"""
PURPOSE: Generate search keywords for a code function
TASK: Extract 5-8 relevant keywords from the summary

Summary: {summary}
Function Name: {symbol_name}

OUTPUT: Comma-separated keywords
"""

        response = self.llm_enhancer._invoke_ccw_cli(prompt, tool='gemini')
        if response['success']:
            keywords_str = response['stdout'].strip()
            return [k.strip() for k in keywords_str.split(',')]

        # Fallback: extract keywords heuristically from the summary
        return self._extract_keywords_heuristic(summary)

    def _extract_keywords_heuristic(self, text: str) -> List[str]:
        """Heuristic keyword extraction (no LLM required)."""

        # Naive implementation: pick out longer lowercase words
        import re
        words = re.findall(r'\b[a-z]{4,}\b', text.lower())

        # Filter out common stopwords
        stopwords = {'this', 'that', 'with', 'from', 'have', 'will', 'your', 'their'}
        keywords = [w for w in words if w not in stopwords]

        return list(set(keywords))[:8]

    def _infer_purpose_from_docstring(self, doc_metadata: DocstringMetadata) -> str:
        """Infer the purpose from the docstring (no LLM required)."""

        summary = (doc_metadata.summary or doc_metadata.raw_text).lower()

        # Simple rule-based matching
        if 'authenticate' in summary or 'login' in summary:
            return 'auth'
        elif 'validate' in summary or 'check' in summary:
            return 'validation'
        elif 'parse' in summary or 'format' in summary:
            return 'data_processing'
        elif 'api' in summary or 'endpoint' in summary:
            return 'api'
        elif 'database' in summary or 'query' in summary:
            return 'data'
        elif 'test' in summary:
            return 'test'

        return 'util'

    def _get_symbol_code(self, content: str, symbol: Symbol) -> str:
        """Extract the code for a symbol."""

        lines = content.splitlines()
        start, end = symbol.range
        return '\n'.join(lines[start-1:end])
````

### 3.3 Cost-Optimization Statistics

```python
@dataclass
class EnhancementStats:
    """Enhancement statistics."""
    total_symbols: int = 0
    used_docstring_only: int = 0      # docstring used as-is
    llm_keywords_only: int = 0        # LLM generated keywords only
    llm_refined: int = 0              # LLM refined the docstring
    llm_full_generation: int = 0      # LLM generated everything
    total_llm_calls: int = 0
    estimated_cost_savings: float = 0.0  # savings vs. using the LLM for everything

class CostOptimizedEnhancer(HybridEnhancer):
    """Enhancer with cost tracking."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.stats = EnhancementStats()

    def enhance_with_strategy(
        self,
        file_data: FileData,
        symbols: List[Symbol]
    ) -> Dict[str, SemanticMetadata]:
        """Enhance and track cost."""

        self.stats.total_symbols += len(symbols)
        results = super().enhance_with_strategy(file_data, symbols)

        # Count how often each strategy was used
        for metadata in results.values():
            if metadata.llm_tool == "hybrid_docstring_primary":
                self.stats.used_docstring_only += 1
                self.stats.llm_keywords_only += 1
                self.stats.total_llm_calls += 1
            elif metadata.llm_tool == "hybrid_llm_refined":
                self.stats.llm_refined += 1
                self.stats.total_llm_calls += 1
            elif metadata.llm_tool == "hybrid_llm_full":
                self.stats.llm_full_generation += 1
                self.stats.total_llm_calls += 1

        # Estimate savings relative to full generation for every symbol,
        # assuming a keywords-only call costs 20% of a full call (saves 80%)
        saved_units = self.stats.llm_keywords_only * 0.8
        self.stats.estimated_cost_savings = (
            saved_units / self.stats.total_symbols
            if self.stats.total_symbols > 0 else 0.0
        )

        return results

    def print_stats(self):
        """Print the statistics."""

        if self.stats.total_symbols == 0:
            return

        print("=== Enhancement Statistics ===")
        print(f"Total Symbols: {self.stats.total_symbols}")
        print(f"Used Docstring (with LLM keywords): {self.stats.used_docstring_only} ({self.stats.used_docstring_only/self.stats.total_symbols*100:.1f}%)")
        print(f"LLM Refined Docstring: {self.stats.llm_refined} ({self.stats.llm_refined/self.stats.total_symbols*100:.1f}%)")
        print(f"LLM Full Generation: {self.stats.llm_full_generation} ({self.stats.llm_full_generation/self.stats.total_symbols*100:.1f}%)")
        print(f"Total LLM Calls: {self.stats.total_llm_calls}")
        print(f"Estimated Cost Savings: {self.stats.estimated_cost_savings*100:.1f}%")
```

## 4. Configuration Options

```python
@dataclass
class HybridEnhancementConfig:
    """Hybrid-enhancement configuration."""

    # Enable the hybrid strategy (False falls back to LLM-only mode)
    enable_hybrid: bool = True

    # Quality thresholds
    use_docstring_threshold: DocstringQuality = DocstringQuality.HIGH
    refine_docstring_threshold: DocstringQuality = DocstringQuality.MEDIUM

    # Whether to generate keywords for high-quality docstrings
    generate_keywords_for_docstring: bool = True

    # LLM settings
    llm_tool: str = "gemini"
    llm_timeout: int = 300000

    # Cost optimization
    batch_size: int = 5              # batch size
    skip_test_files: bool = True     # skip test files (usually few docstrings)

    # Debug options
    log_strategy_decisions: bool = False  # log strategy decisions
```

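The quality thresholds above imply an ordering on `DocstringQuality`, which plain `Enum` members do not have. A minimal, self-contained sketch of how the config could drive strategy selection; the `RANK` map and `choose_strategy` helper are illustrative, not part of the design:

```python
from enum import Enum

class DocstringQuality(Enum):
    MISSING = "missing"
    LOW = "low"
    MEDIUM = "medium"
    HIGH = "high"

# Enum members are unordered, so rank them explicitly.
RANK = {
    DocstringQuality.MISSING: 0,
    DocstringQuality.LOW: 1,
    DocstringQuality.MEDIUM: 2,
    DocstringQuality.HIGH: 3,
}

def choose_strategy(quality,
                    use_docstring=DocstringQuality.HIGH,
                    refine=DocstringQuality.MEDIUM):
    """Map a docstring quality to one of the three strategies,
    honoring the configured thresholds."""
    if RANK[quality] >= RANK[use_docstring]:
        return "docstring_primary"
    if RANK[quality] >= RANK[refine]:
        return "llm_refine"
    return "llm_full"

print(choose_strategy(DocstringQuality.HIGH))     # docstring_primary
print(choose_strategy(DocstringQuality.MEDIUM))   # llm_refine
print(choose_strategy(DocstringQuality.MISSING))  # llm_full
```

Lowering `use_docstring_threshold` to `MEDIUM` would route medium-quality docstrings straight to the docstring-primary strategy, trading accuracy for cost.
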
## 5. Testing Strategy

### 5.1 Unit Tests

```python
import pytest

class TestDocstringExtractor:
    """Tests for docstring extraction."""

    def test_extract_google_style(self):
        """Extract a Google-style docstring."""
        code = '''
def calculate_total(items, discount=0):
    """Calculate total price with optional discount.

    This function processes a list of items and applies
    a discount if specified.

    Args:
        items (list): List of item objects with price attribute.
        discount (float): Discount percentage (0-1). Defaults to 0.

    Returns:
        float: Total price after discount.

    Examples:
        >>> calculate_total([item1, item2], discount=0.1)
        90.0
    """
    total = sum(item.price for item in items)
    return total * (1 - discount)
'''
        extractor = DocstringExtractor()
        symbol = Symbol(name='calculate_total', kind='function', range=(1, 18))
        docstring = extractor.extract_from_code(code, symbol)

        assert docstring is not None
        metadata = extractor.parse_docstring(docstring)

        assert metadata.quality == DocstringQuality.HIGH
        assert 'Calculate total price' in metadata.summary
        assert metadata.parameters is not None
        assert 'items' in metadata.parameters
        assert metadata.returns is not None
        assert metadata.examples is not None

    def test_extract_low_quality_docstring(self):
        """Recognize a low-quality docstring."""
        code = '''
def process():
    """TODO"""
    pass
'''
        extractor = DocstringExtractor()
        symbol = Symbol(name='process', kind='function', range=(1, 3))
        docstring = extractor.extract_from_code(code, symbol)

        metadata = extractor.parse_docstring(docstring)
        assert metadata.quality == DocstringQuality.LOW

class TestHybridEnhancer:
    """Tests for the hybrid enhancer."""

    def test_high_quality_docstring_strategy(self):
        """High-quality docstrings should use the docstring-primary strategy."""

        extractor = DocstringExtractor()
        llm_enhancer = LLMEnhancer(LLMConfig(enabled=True))
        hybrid = HybridEnhancer(llm_enhancer, extractor)

        # Simulated high-quality docstring
        doc_metadata = DocstringMetadata(
            raw_text="Validate user credentials against database.",
            quality=DocstringQuality.HIGH,
            summary="Validate user credentials against database."
        )

        symbol = Symbol(name='validate_user', kind='function', range=(1, 10))

        result = hybrid._use_docstring_with_llm_keywords(symbol, doc_metadata)

        # Should use the docstring's summary
        assert result.summary == doc_metadata.summary
        # Should have keywords (LLM-generated or heuristic)
        assert len(result.keywords) > 0

    def test_cost_optimization(self):
        """Check the cost-optimization effect."""

        enhancer = CostOptimizedEnhancer(
            llm_enhancer=LLMEnhancer(LLMConfig(enabled=False)),  # Mock
            docstring_extractor=DocstringExtractor()
        )

        # Simulate 10 symbols, 5 of which have high-quality docstrings.
        # Expectation: 5 keywords-only calls and 5 full LLM generations.
        # Still 10 calls total, but cheaper (keywords-only calls cost less).

        # A real test would mock the LLM calls
        pass
```

### 5.2 Integration Tests

```python
class TestHybridEnhancementPipeline:
    """Test the full hybrid-enhancement pipeline."""

    def test_full_pipeline(self):
        """Full flow: code -> docstring extraction -> quality assessment -> strategy selection -> enhancement."""

        code = '''
def authenticate_user(username, password):
    """Authenticate user with username and password.

    Args:
        username (str): User's username
        password (str): User's password

    Returns:
        bool: True if authenticated, False otherwise
    """
    # ... implementation
    pass

def helper_func(x):
    # No docstring
    return x * 2
'''

        file_data = FileData(path='auth.py', content=code, language='python')
        symbols = [
            Symbol(name='authenticate_user', kind='function', range=(1, 11)),
            Symbol(name='helper_func', kind='function', range=(13, 15)),
        ]

        extractor = DocstringExtractor()
        llm_enhancer = LLMEnhancer(LLMConfig(enabled=True))
        hybrid = CostOptimizedEnhancer(llm_enhancer, extractor)

        results = hybrid.enhance_with_strategy(file_data, symbols)

        # authenticate_user should use its docstring
        assert results['authenticate_user'].llm_tool == "hybrid_docstring_primary"

        # helper_func should be fully LLM-generated
        assert results['helper_func'].llm_tool == "hybrid_llm_full"

        # Statistics
        assert hybrid.stats.total_symbols == 2
        assert hybrid.stats.used_docstring_only >= 1
        assert hybrid.stats.llm_full_generation >= 1
```

## 6. Implementation Roadmap

**Phase 1: Infrastructure (1 week)**

- Design the data structures (DocstringMetadata, DocstringQuality)
- Implement DocstringExtractor (extraction and parsing)
- Support Python docstrings (Google/NumPy/reStructuredText styles)
- Unit tests

**Phase 2: Quality Assessment (1 week)**

- Implement the quality-assessment algorithm
- Tune the heuristic rules
- Test docstrings of varying quality
- Adjust threshold parameters

**Phase 3: Hybrid Strategy (1-2 weeks)**

- Implement HybridEnhancer
- Implement the three strategies (docstring-only, refine, full-llm)
- Strategy-selection logic
- Integration tests

**Phase 4: Cost Optimization (1 week)**

- Implement CostOptimizedEnhancer
- Statistics and monitoring
- Batch-processing optimization
- Performance tests

**Phase 5: Multi-Language Support (1-2 weeks)**

- JavaScript/TypeScript JSDoc
- Java Javadoc
- Docstring formats for other languages

**Phase 6: Integration and Deployment (1 week)**

- Integrate with the existing llm_enhancer
- Expose CLI options
- Config-file support
- Documentation and examples

Total estimated time: 6-8 weeks

## 7. Performance and Cost Analysis

### 7.1 Expected Cost Savings

Scenario analysis (1000 functions):

| Docstring quality | Share | LLM call strategy | Relative cost |
|---|---|---|---|
| High (detailed docstring) | 30% | generate keywords only | 20% |
| Medium (basic docstring) | 40% | refine and enhance | 60% |
| Low/Missing | 30% | full generation | 100% |

Total cost:

- Pure LLM mode: 1000 × 100% = 1000 units
- Hybrid mode: 300 × 20% + 400 × 60% + 300 × 100% = 60 + 240 + 300 = 600 units
- Savings: 40%
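
The arithmetic above can be verified in a few lines; the bucket counts and relative costs are taken directly from the table:

```python
# (symbol count, relative cost per call) for each quality bucket
buckets = [(300, 0.20), (400, 0.60), (300, 1.00)]

pure_llm = sum(count for count, _ in buckets) * 1.00  # every call at full cost
hybrid = sum(count * cost for count, cost in buckets)

print(int(pure_llm))                   # 1000
print(int(hybrid))                     # 600
print(f"{1 - hybrid / pure_llm:.0%}")  # 40%
```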

### 7.2 Quality Comparison

| Metric | Pure LLM mode | Hybrid mode |
|---|---|---|
| Accuracy | medium (possible hallucinations) | high (docstring is authoritative) |
| Consistency | depends on the prompt | high (preserves the author's style) |
| Coverage | high (full coverage) | 98%+ |
| Cost | high | low (saves 40%) |
| Speed | slow (all files) | faster (fewer LLM calls) |

## 8. Potential Problems and Solutions

### 8.1 Problem: Stale Docstrings

Symptom: the code has been modified but the docstring was not updated, so the information is inaccurate.

Solution:

```python
import logging

logger = logging.getLogger(__name__)

class DocstringFreshnessChecker:
    """Check that a docstring is consistent with the code."""

    def check_freshness(
        self,
        symbol: Symbol,
        code: str,
        doc_metadata: DocstringMetadata
    ) -> bool:
        """Check whether the docstring still matches the code."""

        # Check 1: does the parameter list match?
        if doc_metadata.parameters:
            actual_params = self._extract_actual_parameters(code)
            documented_params = set(doc_metadata.parameters.keys())

            if actual_params != documented_params:
                logger.warning(
                    f"Parameter mismatch in {symbol.name}: "
                    f"code has {actual_params}, doc has {documented_params}"
                )
                return False

        # Check 2: verify consistency with the LLM
        # TODO: build the verification prompt

        return True
```
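
`_extract_actual_parameters` is referenced above but not specified. For Python code, one possible implementation parses the function with the standard-library `ast` module; the helper name and the self/cls filtering are assumptions, not part of the design:

```python
import ast

def extract_actual_parameters(func_source: str) -> set:
    """Collect a Python function's argument names via ast.
    Illustrative sketch of _extract_actual_parameters."""
    tree = ast.parse(func_source)
    func = next(n for n in ast.walk(tree)
                if isinstance(n, (ast.FunctionDef, ast.AsyncFunctionDef)))
    args = func.args
    names = [a.arg for a in args.posonlyargs + args.args + args.kwonlyargs]
    if args.vararg:
        names.append(args.vararg.arg)
    if args.kwarg:
        names.append(args.kwarg.arg)
    # Drop self/cls so methods compare cleanly against their docstring
    return {n for n in names if n not in ('self', 'cls')}

print(sorted(extract_actual_parameters(
    "def authenticate_user(username, password):\n    pass\n")))
# ['password', 'username']
```

This only covers Python; other languages would need their own parameter extraction, in line with the per-language extractors in section 8.3.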

### 8.2 Problem: Mixed Docstring Styles

Symptom: a single project mixes several docstring styles (Google, NumPy, custom).

Solution:

```python
class MultiStyleDocstringParser:
    """Parser that supports multiple docstring styles."""

    def parse(self, docstring: str) -> DocstringMetadata:
        """Auto-detect and parse different styles."""

        # Try each parser in turn
        for parser in [
            GoogleStyleParser(),
            NumpyStyleParser(),
            ReStructuredTextParser(),
            SimpleParser(),  # fallback
        ]:
            try:
                metadata = parser.parse(docstring)
                if metadata.quality != DocstringQuality.LOW:
                    return metadata
            except Exception:
                continue

        # If every parser fails, return the simple parser's result
        return SimpleParser().parse(docstring)
```

### 8.3 Problem: Per-Language Extraction Differences

Symptom: docstring format and position differ between languages.

Solution:

```python
class LanguageSpecificExtractor:
    """Dispatch to a language-specific docstring extractor."""

    def extract(self, language: str, code: str, symbol: Symbol) -> Optional[str]:
        """Pick the right extractor for the language."""

        extractors = {
            'python': PythonDocstringExtractor(),
            'javascript': JSDocExtractor(),
            'typescript': TSDocExtractor(),
            'java': JavadocExtractor(),
        }

        extractor = extractors.get(language, GenericExtractor())
        return extractor.extract(code, symbol)

class JSDocExtractor:
    """JSDoc extractor for JavaScript/TypeScript."""

    def extract(self, code: str, symbol: Symbol) -> Optional[str]:
        """Extract a JSDoc comment."""

        lines = code.splitlines()
        start_line = symbol.range[0] - 1

        # Search upward for a /** ... */ comment
        for i in range(start_line - 1, max(0, start_line - 20), -1):
            if '*/' in lines[i]:
                # Found the closing marker; extract the block above it
                return self._extract_jsdoc_block(lines, i)

        return None
```
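
`_extract_jsdoc_block` is referenced but not shown. A minimal, self-contained sketch; the marker-stripping rules are assumptions (it requires Python 3.9+ for `removeprefix`/`removesuffix`):

```python
def extract_jsdoc_block(lines, end_idx):
    """Walk upward from the line containing '*/' to the '/**' opener,
    then strip comment markers to recover the JSDoc text."""
    start_idx = end_idx
    while start_idx >= 0 and '/**' not in lines[start_idx]:
        start_idx -= 1
    if start_idx < 0:
        return None  # no matching opener found

    text_lines = []
    for raw in lines[start_idx:end_idx + 1]:
        line = raw.strip()
        line = line.removeprefix('/**').removesuffix('*/')
        line = line.strip().lstrip('*').strip()
        if line:
            text_lines.append(line)
    return '\n'.join(text_lines)

sample = [
    "/**",
    " * Add two numbers.",
    " * @param {number} a - first operand",
    " */",
    "function add(a, b) { return a + b; }",
]
print(extract_jsdoc_block(sample, 3).splitlines()[0])  # Add two numbers.
```

A fuller version would also parse `@param`/`@returns` tags into the same `DocstringMetadata` structure used for Python.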

## 9. Configuration Example

### 9.1 Config File

```yaml
# .codexlens/hybrid_enhancement.yaml

hybrid_enhancement:
  enabled: true

  # Quality thresholds
  quality_thresholds:
    use_docstring: high      # high/medium/low
    refine_docstring: medium

  # LLM options
  llm:
    tool: gemini
    fallback: qwen
    timeout_ms: 300000
    batch_size: 5

  # Cost optimization
  cost_optimization:
    generate_keywords_for_docstring: true
    skip_test_files: true
    skip_private_methods: false

  # Language support
  languages:
    python:
      styles: [google, numpy, sphinx]
    javascript:
      styles: [jsdoc]
    java:
      styles: [javadoc]

  # Monitoring
  logging:
    log_strategy_decisions: false
    log_cost_savings: true
```

### 9.2 CLI Usage

```bash
# Enhance with the hybrid strategy
codex-lens enhance . --hybrid --tool gemini

# Show cost statistics
codex-lens enhance . --hybrid --show-stats

# Only generate keywords for high-quality docstrings
codex-lens enhance . --hybrid --keywords-only

# Disable hybrid mode (fall back to LLM-only)
codex-lens enhance . --no-hybrid --tool gemini
```

## 10. Success Metrics

1. Cost savings: 40%+ lower API-call cost than LLM-only mode
2. Accuracy: >95% metadata accuracy for symbols that use their docstring
3. Coverage: 98%+ of symbols have semantic metadata (from the docstring or LLM-generated)
4. Speed: 30%+ faster overall processing (fewer LLM calls)
5. User satisfaction: docstring information is preserved, earning high developer approval

## 11. References