mirror of
https://github.com/catlog22/Claude-Code-Workflow.git
synced 2026-03-11 17:21:03 +08:00
feat: Implement DeepWiki generator and CLI integration
- Added `deepwiki_generator.py` for generating documentation from source code. - Integrated symbol extraction and markdown generation for supported file types. - Implemented database migration for legacy timestamp formats in DeepWikiStore. - Enhanced debug logging for better traceability during conversation and store operations. - Updated dependencies in `PKG-INFO` and `requires.txt` for compatibility. - Added new tests for the DeepWiki generator and storage functionalities. - Refactored existing code for improved readability and maintainability.
This commit is contained in:
@@ -9,8 +9,9 @@ Requires-Python: >=3.10
|
||||
Description-Content-Type: text/markdown
|
||||
License-File: LICENSE
|
||||
Requires-Dist: typer~=0.9.0
|
||||
Requires-Dist: click<9,>=8.0.0
|
||||
Requires-Dist: rich~=13.0.0
|
||||
Requires-Dist: pydantic~=2.0.0
|
||||
Requires-Dist: pydantic>=2.5.0
|
||||
Requires-Dist: tree-sitter~=0.20.0
|
||||
Requires-Dist: tree-sitter-python~=0.25.0
|
||||
Requires-Dist: tree-sitter-javascript~=0.25.0
|
||||
@@ -20,16 +21,16 @@ Requires-Dist: watchdog~=3.0.0
|
||||
Requires-Dist: ast-grep-py~=0.40.0
|
||||
Provides-Extra: semantic
|
||||
Requires-Dist: numpy~=1.26.0; extra == "semantic"
|
||||
Requires-Dist: fastembed~=0.2.0; extra == "semantic"
|
||||
Requires-Dist: fastembed~=0.2.1; extra == "semantic"
|
||||
Requires-Dist: hnswlib~=0.8.0; extra == "semantic"
|
||||
Provides-Extra: semantic-gpu
|
||||
Requires-Dist: numpy~=1.26.0; extra == "semantic-gpu"
|
||||
Requires-Dist: fastembed~=0.2.0; extra == "semantic-gpu"
|
||||
Requires-Dist: fastembed~=0.2.1; extra == "semantic-gpu"
|
||||
Requires-Dist: hnswlib~=0.8.0; extra == "semantic-gpu"
|
||||
Requires-Dist: onnxruntime-gpu~=1.15.0; extra == "semantic-gpu"
|
||||
Provides-Extra: semantic-directml
|
||||
Requires-Dist: numpy~=1.26.0; extra == "semantic-directml"
|
||||
Requires-Dist: fastembed~=0.2.0; extra == "semantic-directml"
|
||||
Requires-Dist: fastembed~=0.2.1; extra == "semantic-directml"
|
||||
Requires-Dist: hnswlib~=0.8.0; extra == "semantic-directml"
|
||||
Requires-Dist: onnxruntime-directml~=1.15.0; extra == "semantic-directml"
|
||||
Provides-Extra: reranker-onnx
|
||||
|
||||
@@ -36,6 +36,7 @@ src/codexlens/indexing/symbol_extractor.py
|
||||
src/codexlens/lsp/__init__.py
|
||||
src/codexlens/lsp/handlers.py
|
||||
src/codexlens/lsp/keepalive_bridge.py
|
||||
src/codexlens/lsp/lsp-servers.json
|
||||
src/codexlens/lsp/lsp_bridge.py
|
||||
src/codexlens/lsp/lsp_graph_builder.py
|
||||
src/codexlens/lsp/providers.py
|
||||
@@ -97,6 +98,8 @@ src/codexlens/semantic/reranker/legacy.py
|
||||
src/codexlens/semantic/reranker/litellm_reranker.py
|
||||
src/codexlens/semantic/reranker/onnx_reranker.py
|
||||
src/codexlens/storage/__init__.py
|
||||
src/codexlens/storage/deepwiki_models.py
|
||||
src/codexlens/storage/deepwiki_store.py
|
||||
src/codexlens/storage/dir_index.py
|
||||
src/codexlens/storage/file_cache.py
|
||||
src/codexlens/storage/global_index.py
|
||||
@@ -117,6 +120,8 @@ src/codexlens/storage/migrations/migration_006_enhance_relationships.py
|
||||
src/codexlens/storage/migrations/migration_007_add_graph_neighbors.py
|
||||
src/codexlens/storage/migrations/migration_008_add_merkle_hashes.py
|
||||
src/codexlens/storage/migrations/migration_010_add_multi_vector_chunks.py
|
||||
src/codexlens/tools/__init__.py
|
||||
src/codexlens/tools/deepwiki_generator.py
|
||||
src/codexlens/watcher/__init__.py
|
||||
src/codexlens/watcher/events.py
|
||||
src/codexlens/watcher/file_watcher.py
|
||||
@@ -129,6 +134,7 @@ tests/test_astgrep_binding.py
|
||||
tests/test_binary_searcher.py
|
||||
tests/test_cascade_strategies.py
|
||||
tests/test_chain_search.py
|
||||
tests/test_cli_help.py
|
||||
tests/test_cli_hybrid_search.py
|
||||
tests/test_cli_output.py
|
||||
tests/test_clustering_strategies.py
|
||||
@@ -136,6 +142,8 @@ tests/test_code_extractor.py
|
||||
tests/test_config.py
|
||||
tests/test_config_cascade.py
|
||||
tests/test_config_staged_env_overrides.py
|
||||
tests/test_deepwiki_store.py
|
||||
tests/test_deepwiki_types.py
|
||||
tests/test_dual_fts.py
|
||||
tests/test_embedder.py
|
||||
tests/test_embedding_backend_availability.py
|
||||
|
||||
@@ -1,2 +1,2 @@
|
||||
[console_scripts]
|
||||
codexlens-lsp = codexlens.lsp:main
|
||||
codexlens-lsp = codexlens.lsp.server:main
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
typer~=0.9.0
|
||||
click<9,>=8.0.0
|
||||
rich~=13.0.0
|
||||
pydantic~=2.0.0
|
||||
pydantic>=2.5.0
|
||||
tree-sitter~=0.20.0
|
||||
tree-sitter-python~=0.25.0
|
||||
tree-sitter-javascript~=0.25.0
|
||||
@@ -43,17 +44,17 @@ transformers~=4.36.0
|
||||
|
||||
[semantic]
|
||||
numpy~=1.26.0
|
||||
fastembed~=0.2.0
|
||||
fastembed~=0.2.1
|
||||
hnswlib~=0.8.0
|
||||
|
||||
[semantic-directml]
|
||||
numpy~=1.26.0
|
||||
fastembed~=0.2.0
|
||||
fastembed~=0.2.1
|
||||
hnswlib~=0.8.0
|
||||
onnxruntime-directml~=1.15.0
|
||||
|
||||
[semantic-gpu]
|
||||
numpy~=1.26.0
|
||||
fastembed~=0.2.0
|
||||
fastembed~=0.2.1
|
||||
hnswlib~=0.8.0
|
||||
onnxruntime-gpu~=1.15.0
|
||||
|
||||
@@ -251,6 +251,11 @@ class DeepWikiStore:
|
||||
except sqlite3.OperationalError:
|
||||
pass # Column already exists
|
||||
|
||||
# Legacy migration: some earlier DeepWiki DBs stored timestamps as TEXT (ISO strings).
|
||||
# better-sqlite3 + JS code expects numeric (REAL) seconds, so ensure timestamp columns
|
||||
# have REAL affinity by rebuilding affected tables when needed.
|
||||
self._migrate_text_timestamps_to_real(conn)
|
||||
|
||||
conn.commit()
|
||||
except sqlite3.DatabaseError as exc:
|
||||
raise StorageError(
|
||||
@@ -270,6 +275,193 @@ class DeepWikiStore:
|
||||
"""
|
||||
return str(Path(path).resolve()).replace("\\", "/")
|
||||
|
||||
def _migrate_text_timestamps_to_real(self, conn: sqlite3.Connection) -> None:
    """Migrate legacy TEXT timestamp columns to REAL affinity.

    SQLite's type system is dynamic, but column affinity influences how values are
    stored and returned. Older DeepWiki databases used TEXT timestamps (often ISO
    strings). The current schema uses REAL epoch seconds. Each table listed below is
    handed to ``_rebuild_table_with_timestamp_conversion``, which rebuilds it with
    REAL columns and converts existing values during the copy when needed.

    After each table has been handled, its secondary indexes are (re)created with
    ``CREATE INDEX IF NOT EXISTS``: dropping the renamed legacy table during a
    rebuild also drops the indexes that were attached to it.

    Args:
        conn: Open SQLite connection. This method does not commit; the caller
            is expected to do so.
    """
    # One entry per table:
    # (table, create_sql, timestamp_columns, required_timestamp_columns, index_sqls)
    table_specs = (
        (
            "deepwiki_files",
            """
            CREATE TABLE deepwiki_files (
                id INTEGER PRIMARY KEY,
                path TEXT UNIQUE NOT NULL,
                content_hash TEXT NOT NULL,
                last_indexed REAL NOT NULL,
                symbols_count INTEGER DEFAULT 0,
                docs_generated INTEGER DEFAULT 0,
                staleness_score REAL DEFAULT 0.0,
                last_checked_commit TEXT,
                last_checked_at REAL,
                staleness_factors TEXT
            )
            """,
            {"last_indexed", "last_checked_at"},
            {"last_indexed"},
            (
                "CREATE INDEX IF NOT EXISTS idx_deepwiki_files_path ON deepwiki_files(path)",
                "CREATE INDEX IF NOT EXISTS idx_deepwiki_files_hash ON deepwiki_files(content_hash)",
            ),
        ),
        (
            "deepwiki_docs",
            """
            CREATE TABLE deepwiki_docs (
                id INTEGER PRIMARY KEY,
                path TEXT UNIQUE NOT NULL,
                content_hash TEXT NOT NULL,
                symbols TEXT DEFAULT '[]',
                generated_at REAL NOT NULL,
                llm_tool TEXT
            )
            """,
            {"generated_at"},
            {"generated_at"},
            (
                "CREATE INDEX IF NOT EXISTS idx_deepwiki_docs_path ON deepwiki_docs(path)",
            ),
        ),
        (
            "deepwiki_symbols",
            """
            CREATE TABLE deepwiki_symbols (
                id INTEGER PRIMARY KEY,
                name TEXT NOT NULL,
                type TEXT NOT NULL,
                source_file TEXT NOT NULL,
                doc_file TEXT NOT NULL,
                anchor TEXT NOT NULL,
                start_line INTEGER NOT NULL,
                end_line INTEGER NOT NULL,
                created_at REAL,
                updated_at REAL,
                staleness_score REAL DEFAULT 0.0,
                last_checked_commit TEXT,
                last_checked_at REAL,
                staleness_factors TEXT,
                UNIQUE(name, source_file)
            )
            """,
            {"created_at", "updated_at", "last_checked_at"},
            set(),
            (
                "CREATE INDEX IF NOT EXISTS idx_deepwiki_symbols_name ON deepwiki_symbols(name)",
                "CREATE INDEX IF NOT EXISTS idx_deepwiki_symbols_source ON deepwiki_symbols(source_file)",
                "CREATE INDEX IF NOT EXISTS idx_deepwiki_symbols_doc ON deepwiki_symbols(doc_file)",
            ),
        ),
        (
            "generation_progress",
            """
            CREATE TABLE generation_progress (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                symbol_key TEXT NOT NULL UNIQUE,
                file_path TEXT NOT NULL,
                symbol_name TEXT NOT NULL,
                symbol_type TEXT NOT NULL,
                layer INTEGER NOT NULL,
                source_hash TEXT NOT NULL,
                status TEXT NOT NULL DEFAULT 'pending',
                attempts INTEGER DEFAULT 0,
                last_tool TEXT,
                last_error TEXT,
                generated_at REAL,
                created_at REAL,
                updated_at REAL
            )
            """,
            {"generated_at", "created_at", "updated_at"},
            set(),
            (
                "CREATE INDEX IF NOT EXISTS idx_progress_status ON generation_progress(status)",
                "CREATE INDEX IF NOT EXISTS idx_progress_file ON generation_progress(file_path)",
                "CREATE INDEX IF NOT EXISTS idx_progress_hash ON generation_progress(source_hash)",
            ),
        ),
    )

    for table, create_sql, ts_columns, required_columns, index_sqls in table_specs:
        self._rebuild_table_with_timestamp_conversion(
            conn,
            table=table,
            create_sql=create_sql,
            timestamp_columns=ts_columns,
            required_timestamp_columns=required_columns,
        )

        # The rebuild helper silently skips tables that do not exist; do the
        # same here, since CREATE INDEX on a missing table would raise.
        table_row = conn.execute(
            "SELECT 1 FROM sqlite_master WHERE type = 'table' AND name = ?",
            (table,),
        ).fetchone()
        if table_row is None:
            continue

        for index_sql in index_sqls:
            conn.execute(index_sql)
|
||||
|
||||
def _rebuild_table_with_timestamp_conversion(
    self,
    conn: sqlite3.Connection,
    *,
    table: str,
    create_sql: str,
    timestamp_columns: set[str],
    required_timestamp_columns: set[str],
) -> None:
    """Rebuild ``table`` so its timestamp columns are stored as REAL.

    Does nothing when the table does not exist or when none of the given
    timestamp columns is declared with TEXT affinity. Otherwise the table is
    renamed to ``<table>__old_ts``, recreated from ``create_sql``, the rows are
    copied across (only columns present in both the old and the new table), and
    the renamed table is dropped.

    Args:
        conn: Open SQLite connection; no commit is issued here.
        table: Name of the table to inspect and, if needed, rebuild. It is
            interpolated into SQL, so it must be a trusted identifier.
        create_sql: ``CREATE TABLE`` statement that recreates ``table`` with the
            current schema.
        timestamp_columns: Columns whose values are converted with
            ``_sql_timestamp_to_real`` during the copy.
        required_timestamp_columns: Timestamp columns that must not end up NULL;
            values that convert to NULL are replaced with the current time.
    """
    # PRAGMA table_info rows are (cid, name, type, notnull, dflt_value, pk).
    # Index by position so this works with both tuple rows and sqlite3.Row.
    info = conn.execute(f"PRAGMA table_info({table})").fetchall()
    if not info:
        return

    declared_types = {row[1]: str(row[2] or "").strip().upper() for row in info}

    def has_text_affinity(declared: str) -> bool:
        # SQLite affinity rules: "INT" anywhere wins (INTEGER affinity);
        # otherwise CHAR / CLOB / TEXT anywhere gives TEXT affinity.
        if "INT" in declared:
            return False
        return any(token in declared for token in ("CHAR", "CLOB", "TEXT"))

    needs_migration = any(
        has_text_affinity(declared_types[col])
        for col in timestamp_columns
        if col in declared_types
    )
    if not needs_migration:
        return

    old_table = f"{table}__old_ts"
    conn.execute(f"ALTER TABLE {table} RENAME TO {old_table}")
    conn.execute(create_sql)

    # The renamed table keeps the columns already read above.
    old_cols = set(declared_types)
    new_cols = [row[1] for row in conn.execute(f"PRAGMA table_info({table})").fetchall()]
    common_cols = [c for c in new_cols if c in old_cols]

    select_exprs: list[str] = []
    for col in common_cols:
        if col not in timestamp_columns:
            select_exprs.append(col)
            continue
        expr = self._sql_timestamp_to_real(col)
        if col in required_timestamp_columns:
            # NOT NULL target column: fall back to "now" for unparseable values.
            expr = f"COALESCE({expr}, CAST(strftime('%s','now') AS REAL))"
        select_exprs.append(f"{expr} AS {col}")

    cols_sql = ", ".join(common_cols)
    select_sql = ", ".join(select_exprs)
    conn.execute(
        f"INSERT INTO {table} ({cols_sql}) SELECT {select_sql} FROM {old_table}"
    )
    conn.execute(f"DROP TABLE {old_table}")
|
||||
|
||||
def _sql_timestamp_to_real(self, col: str) -> str:
    """Return a SQL expression that converts column ``col`` to epoch seconds (REAL).

    The generated ``CASE`` expression handles, in order:

    - ``NULL``: stays ``NULL``.
    - integer / real values: cast to REAL.
    - strings shaped like ``YYYY-MM-DD...`` (ISO date or datetime): the first 19
      characters are parsed with ``strftime('%s', ...)``; anything after the
      seconds (fractions, UTC offset) is ignored.
    - other strings starting with a digit (numeric strings): cast to REAL.
    - anything else: attempted as a datetime; yields ``NULL`` when unparseable.

    Args:
        col: Column name, interpolated verbatim into the SQL; must be a trusted
            identifier.

    Returns:
        A parenthesised SQL expression string.
    """
    # The ISO-date test must come BEFORE the generic "starts with a digit" test:
    # '2024-01-01T00:00:00' also starts with a digit and would otherwise be
    # CAST to 2024.0 instead of being parsed as a date.
    iso_date_glob = "[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]*"
    return f"""(
        CASE
            WHEN {col} IS NULL THEN NULL
            WHEN typeof({col}) IN ('integer', 'real') THEN CAST({col} AS REAL)
            WHEN trim({col}) GLOB '{iso_date_glob}'
                THEN CAST(strftime('%s', replace(substr(trim({col}), 1, 19), 'T', ' ')) AS REAL)
            WHEN trim({col}) GLOB '[0-9]*' THEN CAST({col} AS REAL)
            ELSE CAST(strftime('%s', replace(substr({col}, 1, 19), 'T', ' ')) AS REAL)
        END
    )"""
|
||||
|
||||
# === File Operations ===
|
||||
|
||||
def add_file(
|
||||
|
||||
@@ -1,22 +1,21 @@
|
||||
"""DeepWiki document generation tools.
|
||||
|
||||
|
||||
This module provides tools for generating documentation from source code.
|
||||
"""
|
||||
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
|
||||
import hashlib
|
||||
import logging
|
||||
import re
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Optional, Protocol
|
||||
from typing import Dict, List, Optional, Protocol, Any
|
||||
|
||||
from codexlens.storage.deepwiki_store import DeepWikiStore
|
||||
from codexlens.storage.deepwiki_models import DeepWikiSymbol
|
||||
from codexlens.errors import StorageError
|
||||
from codexlens.indexing.symbol_extractor import SymbolExtractor
|
||||
from codexlens.parsers.factory import ParserFactory
|
||||
from codexlens.errors import StorageError
|
||||
from codexlens.storage.deepwiki_models import DeepWikiSymbol
|
||||
from codexlens.storage.deepwiki_store import DeepWikiStore
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -24,7 +23,7 @@ logger = logging.getLogger(__name__)
|
||||
# Default timeout for AI generation (30 seconds)
|
||||
AI_TIMEOUT = 30
|
||||
# HTML metadata markers for documentation
|
||||
SYMBOL_START_MARKER = "<!-- deepwiki-symbol-start name=\"symbol_name}\" -->"
|
||||
SYMBOL_START_MARKER = '<!-- deepwiki-symbol-start name="{symbol_name}" -->'
|
||||
SYMBOL_END_MARKER = "<!-- deepwiki-symbol-end -->"
|
||||
|
||||
|
||||
@@ -48,8 +47,8 @@ class MockMarkdownGenerator(MarkdownGenerator):
|
||||
"""Mock Markdown generator for testing."""
|
||||
|
||||
def generate(self, symbol: DeepWikiSymbol, source_code: str) -> str:
    """Generate mock Markdown documentation.

    Args:
        symbol: Symbol being documented; its ``name`` and ``type`` attributes
            are used for the headings.
        source_code: Source text placed verbatim inside a fenced code block.

    Returns:
        Markdown with an H1 for the name, an H2 for the type and the source in
        a fenced block, separated by real newlines.
    """
    # Use real newline escapes: "\\n" would emit a literal backslash-n and
    # collapse the whole document onto one line.
    return f"# {symbol.name}\n\n## {symbol.type}\n\n```\n{source_code}\n```"
|
||||
|
||||
|
||||
class DeepWikiGenerator:
|
||||
@@ -60,382 +59,168 @@ class DeepWikiGenerator:
|
||||
"""
|
||||
|
||||
DEFAULT_DB_PATH = DeepWikiStore.DEFAULT_DB_PATH
|
||||
SUPPORT_extensions = [".py", ".ts", ".tsx", ".js", ".jsx", ".java", ".go", ".rs", ".swift"]
|
||||
SUPPORTED_EXTENSIONS = [
|
||||
".py",
|
||||
".ts",
|
||||
".tsx",
|
||||
".js",
|
||||
".jsx",
|
||||
".java",
|
||||
".go",
|
||||
".rs",
|
||||
".swift",
|
||||
]
|
||||
AI_TIMEOUT: int = 30 # Timeout for AI generation
|
||||
MAX_SYMBOLS_PER_FILE: int = 100 # Batch size for processing large files
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
db_path: Path | None = None,
|
||||
store: DeepWikiStore = markdown_generator: MarkdownGenerator | None, None,
|
||||
store: DeepWikiStore | None = None,
|
||||
markdown_generator: MarkdownGenerator | None = None,
|
||||
max_symbols_per_file: int = 100,
|
||||
ai_timeout: int = 30,
|
||||
) -> None:
|
||||
self.markdown_generator = MockMarkdownGenerator()
|
||||
self.store = store
|
||||
self._extractor = Symbol_extractor()
|
||||
|
||||
else:
|
||||
self._extractor = SymbolExtractor()
|
||||
if file_path not in _should_process_file:
|
||||
self._extractor.extract_symbols(file_path)
|
||||
if symbols:
|
||||
logger.debug(f"Found {len(symbols)} symbols in {file_path}")
|
||||
else:
|
||||
logger.debug(f"No symbols found in {file_path}")
|
||||
return []
|
||||
# Extract symbols from the file
|
||||
for symbol in symbols:
|
||||
try:
|
||||
file_type = Parser_factory.get_parser(file_path.suffix)
|
||||
if file_type is None:
|
||||
logger.warning(f"Unsupported file type: {file_path}")
|
||||
continue
|
||||
symbols.append(symbols)
|
||||
doc_path = self._generate_docs(symbol)
|
||||
doc_path.mkdir(doc_path, exist_ok=True)
|
||||
for symbol in symbols:
|
||||
doc_path = self._generate_markdown(symbol, source_code)
|
||||
doc.write(doc(doc_id)
|
||||
logger.debug(f"Generated docs for {len(symbols)} symbols in {file_path}")
|
||||
self._store.save_symbol(symbol, doc_path, doc_content, doc_path)
|
||||
self._store.update_file_stats(existing_file.path, symbols_count)
|
||||
self._store.update_file_stats(
|
||||
existing_file.path,
|
||||
symbols_count=len(existing_file.symbols),
|
||||
new_symbols_count=len(symbols),
|
||||
docs_generated += 1
|
||||
)
|
||||
else:
|
||||
# Skip unchanged files (skip update)
|
||||
logger.debug(f"Skipped {len(unchanged_files)} unchanged symbols")
|
||||
logger.debug(f"No symbols found in {file_path}, skipping update")
|
||||
except Exception as e:
|
||||
logger.error(f"Error extracting symbols from {file_path}: {e}")
|
||||
raise StorageError(f"Failed to extract symbols from {file_path}")
|
||||
try:
|
||||
symbol_extractor = SymbolExtractor()
|
||||
symbols = []
|
||||
continue
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to initialize symbol extractor: {e}")
|
||||
raise StorageError(f"Failed to initialize symbol extractor for {file_path}")
|
||||
# Return empty list
|
||||
doc_paths = []
|
||||
for doc_path in doc_paths:
|
||||
try:
|
||||
doc_path.mkdir(doc_path, parents=True, exist_ok=True)
|
||||
for file in files:
|
||||
if not file_path.endswith in support_extensions:
|
||||
continue
|
||||
source_file = file_path
|
||||
source_content = file_path.read_bytes()
|
||||
content_hash = self._calculate_file_hash(file_path)
|
||||
return hash_obj.hexdigest()
|
||||
file_hash = existing_hash
|
||||
if existing_hash == new_hash:
|
||||
logger.debug(
|
||||
f"File unchanged: {file_path}. Skipping (hash match)"
|
||||
)
|
||||
return existing_file
|
||||
# Get language from file path
|
||||
language = self._get_language(file_path)
|
||||
if language is None:
|
||||
language = file_path.suffix
|
||||
# Default to Python if it is other extension
|
||||
language_map = {
|
||||
".ts": "TypeScript",
|
||||
".tsx": "TypeScript React",
|
||||
".js": "JavaScript",
|
||||
".jsx": "JavaScript React",
|
||||
".java": "Java",
|
||||
".go": "Go",
|
||||
".rs": "Rust",
|
||||
".swift": "Swift",
|
||||
}
|
||||
return language
|
||||
file_type = None
|
||||
except ValueError("Unsupported file type: {file_path}")
|
||||
logger.warning(f"Unsupported file type: {file_path}, skipping")
|
||||
continue
|
||||
source_file = file_path
|
||||
source_code = file.read_text()
|
||||
if source_code:
|
||||
try:
|
||||
source_code = file.read_bytes(). hash_obj = hashlib.sha256(source_code.encode("utf-8")
|
||||
return hash_obj.hexdigest()
|
||||
else:
|
||||
return ""
|
||||
# Determine language from file extension
|
||||
file_ext = file_extension.lower().find(f".py, ..ts, .tsx)
|
||||
if file_ext in SUPPORT_extensions:
|
||||
for ext in self.Suffix_lower():
|
||||
logger.debug(f"Unsupported file extension: {file_path}, skipping file")
|
||||
return None
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Error determining language for {file_path}: {e}")
|
||||
return None, else:
|
||||
return self.suffix_lower() if ext == SUPPORT_extensions:
|
||||
else:
|
||||
return None
|
||||
else:
|
||||
# Check if it is markdown generator exists
|
||||
if markdown_generator:
|
||||
logger.debug("No markdown generator provided, using mock")
|
||||
return None
|
||||
# Check if tool exists
|
||||
if tool:
|
||||
logger.debug(f"Tool not available for {tool}")
|
||||
return None
|
||||
# Extract symbols using regex for tree-sitter
|
||||
language_map = self.Language_map
|
||||
return language_map
|
||||
|
||||
# Read all symbols from the database file
|
||||
file_path = path
|
||||
# Get parser factory
|
||||
if file_path not in support_extensions:
|
||||
logger.debug(f"Unsupported file type: {file_path}, skipping")
|
||||
return []
|
||||
else:
|
||||
logger.debug(f"Extracted {len(symbols)} symbols from {file_path}")
|
||||
return symbols
|
||||
|
||||
def _generate_markdown(self, symbol: DeepWikiSymbol, source_code: str) -> str:
|
||||
"""Generate Markdown documentation for a symbol.
|
||||
|
||||
Args:
|
||||
symbol: The symbol information
|
||||
source_code: The source code content
|
||||
|
||||
Returns:
|
||||
Generated Markdown documentation
|
||||
"""
|
||||
def _generate_markdown(
|
||||
self, symbol: DeepWikiSymbol, source_code: str
|
||||
) -> str:
|
||||
"""Generate mock Markdown documentation."""
|
||||
return f"# {symbol.name}\n\n## {symbol.type}\n\n{source_code}\n```\n```
|
||||
Initializes the DeepWikiGenerator.
|
||||
"""
|
||||
if store:
|
||||
self.store = store
|
||||
else:
|
||||
self.store = DeepWikiStore(db_path or self.DEFAULT_DB_PATH)
|
||||
|
||||
if markdown_generator:
|
||||
self.markdown_generator = markdown_generator
|
||||
else:
|
||||
logger.debug("No markdown generator provided, using mock")
|
||||
self.markdown_generator = MockMarkdownGenerator()
|
||||
|
||||
self._extractor = SymbolExtractor()
|
||||
self.max_symbols_per_file = max_symbols_per_file
|
||||
self.ai_timeout = ai_timeout
|
||||
self._docs_dir = Path("docs") # Default docs directory
|
||||
|
||||
doc_path.mkdir(self.docs_dir, parents=True, exist_ok=True)
|
||||
for file in files:
|
||||
if not file_path.endswith in support_extensions:
|
||||
continue
|
||||
source_content = file.read_bytes()
|
||||
doc_content = f.read_text()
|
||||
# Add content to markdown
|
||||
markdown = f"<!-- deepwiki-symbol-start name=\"{symbol.name}\" -->\n{markdown_content}\n{markdown}
|
||||
|
||||
# Calculate anchor ( generate a_anchor(symbol)
|
||||
anchor_line = symbol.line_range[0]
|
||||
doc_path = self._docs_dir / docs_path
|
||||
source_file = os.path.join(source_file, relative_path,)
|
||||
return line_range
|
||||
elif markdown is None:
|
||||
anchor = ""
|
||||
|
||||
{markdown}
|
||||
|
||||
{markdown}
|
||||
# Add anchor link to the from doc file
|
||||
# Calculate doc file hash
|
||||
file_hash = hashlib.sha256(file_content.encode("utf-8")
|
||||
content_hash = existing_hash
|
||||
file_path = source_file
|
||||
if existing_file is None:
|
||||
return None
|
||||
source_file = source_file
|
||||
file_path = str(source_file)
|
||||
for f in symbols:
|
||||
if file_changed
|
||||
logger.info(
|
||||
f"Generated docs for {len(symbols)} symbols in {file_path}"
|
||||
)
|
||||
logger.debug(
|
||||
f"Updated {len(changed_files)} files - {len(changed_symbols)} "
|
||||
)
|
||||
logger.debug(
|
||||
f"Updated {len(unchanged_files)} files: {len(unchanged_symbols)} "
|
||||
)
|
||||
logger.debug(
|
||||
f"unchanged files: {len(unchanged_files)} (unchanged)"
|
||||
)
|
||||
else:
|
||||
logger.debug(
|
||||
f"Processed {len(files)} files, {len(files)} changed symbols, {len(changed_symbols)}"
|
||||
)
|
||||
logger.debug(f"Processed {len(files)} files in {len(files)} changes:")
|
||||
f"Total files changed: {len(changed_files)}, "
|
||||
f" file changes: {len(changed_files)}", "len(changed_symbols)} symbols, {len(changed_symbols)}, new_docs_generated: {len(changed_symbols)}"
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
# Save stats
|
||||
stats["total_files"] = total_files
|
||||
stats["total_symbols"] = total_symbols
|
||||
stats["total_changed_symbols"] = changed_symbols_count
|
||||
stats["unchanged_files"] = unchanged_files_count
|
||||
stats["total_changed_files"] = changed_files
|
||||
logger.info(
|
||||
f"Generation complete - {len(files)} files, {len(symbols)} symbols, {len(changed_files)} changed symbols: files_changed}"
|
||||
f" file changes ({len(changed_files)} changed symbols count} symbols"
|
||||
}
|
||||
f"unchanged files: {len(unchanged_files)} (unchanged_files_count}")
|
||||
stats["unchanged_files"] = unchanged_files
|
||||
stats["unchanged_files"] = unchanged_files
|
||||
logger.info(
|
||||
f"generation complete - {len(files)} files, {len(symbols)} symbols, {len(changed_files)} changed symbols, {len(changed_symbols)} docs generated"
|
||||
}
|
||||
else:
|
||||
stats["unchanged_files"] = len(unchanged_files)
|
||||
stats["unchanged_symbols"] = len(unchanged_symbols)
|
||||
stats["total_symbols"] = total_symbols
|
||||
stats["total_docs_generated"] = total_docs_generated
|
||||
stats["total_changed_files"] = changed_files_count
|
||||
stats["total_changed_files"] = unchanged_files_count
|
||||
return stats
|
||||
def _calculate_file_hash(self, file_path: Path) -> str:
    """Calculate the SHA256 hash of a file's content.

    The file is read in fixed-size chunks so large files are never loaded into
    memory in one piece.

    Args:
        file_path: File whose bytes are hashed.

    Returns:
        The hex digest, or an empty string when the file cannot be read (the
        error is logged rather than raised).
    """
    digest = hashlib.sha256()
    try:
        with file_path.open("rb") as handle:
            # iter() with a sentinel stops at the first empty read (EOF).
            for chunk in iter(lambda: handle.read(1024 * 1024), b""):
                digest.update(chunk)
    except OSError as e:  # IOError is an alias of OSError
        logger.error(f"Error reading file for hash calculation: {file_path}: {e}")
        return ""
    return digest.hexdigest()
|
||||
|
||||
def _get_language(self, file_path: Path) -> str | None:
|
||||
"""Determine language from file extension."""
|
||||
ext = file_path.suffix.lower()
|
||||
if ext not in self.SUPPORTED_EXTENSIONS:
|
||||
logger.debug(f"Unsupported file extension: {file_path}, skipping file")
|
||||
return None
|
||||
|
||||
language_map = {
|
||||
".py": "Python",
|
||||
".ts": "TypeScript",
|
||||
".tsx": "TypeScript React",
|
||||
".js": "JavaScript",
|
||||
".jsx": "JavaScript React",
|
||||
".java": "Java",
|
||||
".go": "Go",
|
||||
".rs": "Rust",
|
||||
".swift": "Swift",
|
||||
}
|
||||
finally:
|
||||
return self.close()
|
||||
def run(self, path: str, output_dir: Optional[str] = None, db_path: Optional[Path] = None, force: bool = False,
|
||||
max_symbols_per_file: int = 100,
|
||||
ai_timeout: int = AI_TIMEOUT,
|
||||
backend: str = "fastembed",
|
||||
model: str = "code",
|
||||
max_workers: int = 1,
|
||||
json_mode: bool = False,
|
||||
verbose: bool = False,
|
||||
) -> None:
|
||||
return language_map.get(ext)
|
||||
|
||||
def _should_process_file(self, file_path: Path, force: bool) -> bool:
    """Decide whether ``file_path`` needs to be (re)processed.

    Args:
        file_path: Source file under consideration.
        force: When true, the hash comparison is skipped entirely.

    Returns:
        ``True`` when ``force`` is set, or when the store has no record whose
        ``content_hash`` equals the file's current hash. ``False`` when the
        hash could not be computed (empty string) or when it matches the
        stored record.
    """
    if not force:
        current_digest = self._calculate_file_hash(file_path)
        if not current_digest:
            # An empty digest means the hash could not be computed.
            return False

        stored_record = self.store.get_file(str(file_path))
        if stored_record and stored_record.content_hash == current_digest:
            logger.debug(f"File unchanged: {file_path}. Skipping (hash match)")
            return False

    return True
|
||||
|
||||
def _generate_markdown_for_symbol(self, symbol: DeepWikiSymbol, source_code: str) -> str:
    """Generate markdown for ``symbol`` and wrap it with the DeepWiki markers.

    Args:
        symbol: Symbol to document; its ``name`` is substituted into
            ``SYMBOL_START_MARKER``.
        source_code: Source text passed through to the markdown generator.

    Returns:
        The start marker, the generated markdown and ``SYMBOL_END_MARKER``,
        each on its own line.
    """
    markdown_content = self.markdown_generator.generate(symbol, source_code)
    start_marker = SYMBOL_START_MARKER.format(symbol_name=symbol.name)
    # Join with real newlines; "\\n" in the f-string would write a literal
    # backslash-n between the markers and the content.
    return f"{start_marker}\n{markdown_content}\n{SYMBOL_END_MARKER}"
|
||||
|
||||
def run(self, path: str, output_dir: Optional[str] = None, force: bool = False) -> Dict[str, Any]:
|
||||
"""
|
||||
Initialize DeepWiki store and generator, and scan the source.
|
||||
|
||||
Args:
|
||||
path: Path to the source directory
|
||||
db_path: Optional database path ( defaults to DEFAULT_DB_PATH)
|
||||
force: Force full reindex ( ignoring file hashes
|
||||
markdown_generator: Optional generator for markdown. If None, use Mock.
|
||||
backend: backend or "fastembed"
|
||||
model: model = "code"
|
||||
max_workers: Maximum concurrent API calls for AI generation
|
||||
max_symbols_per_file: maximum symbols to process per file (batch processing)
|
||||
ai_timeout: timeout for AI generation
|
||||
max_file_size: maximum file size to read in MB before processing ( chunks
|
||||
|
||||
Returns:
|
||||
Generator result with stats dict[str, Any]:
|
||||
"""
|
||||
source_root = Path(path)
|
||||
if output_dir:
|
||||
self._docs_dir = Path(output_dir)
|
||||
|
||||
<system_warning>
|
||||
This task has subtasks - please focus on the current work. You start by reading the task files and completing summaries.
|
||||
|
||||
* Reading the `workflow/.lite-plan/implement-deepwiki-2026-03-05/TODO_LIST.md` for I'll the plan file and get started.
|
||||
|
||||
* Mark TASK 003 as completed.
|
||||
* Update TODO_list by checking the off the "Done when" checkboxes and completed sections
|
||||
* Generate completion summary with links to relevant files
|
||||
* Update main task JSON status to "completed"
|
||||
* * Read more context from previous tasks and understand what was completed
|
||||
* Read plan.json to get tech stack info ( verify implementation approach
|
||||
|
||||
* * Now I'll implement the deepWiki generator. in `codex-lens/src/codexlens/tools/` directory. add CLI commands. and generate commands to.
|
||||
|
||||
I'll write the file `deepwiki_generator.py` with the generator implementation.
|
||||
|
||||
I'll add the `deepwiki` command group to the CLI module.
|
||||
I'll test the implementation after
|
||||
update the TODO list accordingly to the instructions.
|
||||
* * Generate a completion summary in the `.summaries` directory
|
||||
|
||||
* Let me know if you wants to context or questions about the implementation.* I'll adjust the plan as necessary.* * Now, let me read the plan.json file to check the current plan structure: if it exists: need to create it. * let me check the completion status in the TODO list. Let me update the completion time and check if there's a status history to and update it task JSON status.
|
||||
|
||||
* Finally, I'll create a summary file and documenting the completion.I need to create the tools directory first. then create the generator file. Here's the full implementation: Now let me add the CLI commands to and test the implementation. Let me proceed with the tests.
|
||||
|
||||
I I'll verify that `deepwiki generate` command completes successfully
|
||||
The `deepwiki_index` table contains symbol entries after the first run
|
||||
A second run with unchanged source results in 0 new database writes.
|
||||
|
||||
Finally, I'll generate a summary file, document the implementation.
|
||||
* Generate a completion summary in the summaries directory
|
||||
* Update the TODO list to I progress tracking
|
||||
* Mark the task as completed
|
||||
* Update the main task JSON status to "completed" (if applicable, set completion timestamps)
|
||||
|
||||
Let me start by creating the tools directory and `__init__.py` file: and read the existing `deepwiki_store.py` file to understand the database structure and models, and methods available from the store. The as properties as the file tracking, symbol extraction, and documentation generation.Then it will integrate the AI service for generating the actual markdown. for each symbol. Finally, I'll update the stats in the store to track progress, display progress information in the console, and and table output, and log the completion status for each file.
|
||||
|
||||
total_symbols = len(symbols)
|
||||
total_changed_files = len(changed_files)
|
||||
total_unchanged_files = len(unchanged_files)
|
||||
total_docs_generated = len(docs)
|
||||
|
||||
total_changed_symbols += len(changed_symbols)
|
||||
total_docs_generated += docs
|
||||
|
||||
# Clean up removed symbols
|
||||
for symbol in removed_symbols:
|
||||
self.store.delete_symbols_for_file(file_path)
|
||||
for doc in docs:
|
||||
self.store.delete_doc(doc_id)
|
||||
# Remove dangling references
|
||||
for doc in docs:
|
||||
self.store.delete_symbols_for_file(file_path)
|
||||
self.store.delete_file(file_path)
|
||||
|
||||
# Remove empty docs directory if needed
|
||||
docs_dir.mkdir(self.docs_dir, exist_ok=True)
|
||||
os.makedirs(doc_path, parents=True, exist_ok=True)
|
||||
# Generate markdown for each symbol
|
||||
for symbol in symbols:
|
||||
markdown = self._generate_markdown(symbol, source_code)
|
||||
doc_path = self._docs_dir / docs_path
|
||||
doc_content = f"# {symbol.name}\n\n{markdown_content}\n\n # write to database
|
||||
try:
|
||||
self.store.save_symbol(symbol, doc_path, doc_content)
|
||||
doc_id = doc.id
|
||||
logger.debug(f"Generated documentation for symbol: {symbol.name}")
|
||||
total_generated += 1
|
||||
total_symbols += 1
|
||||
total_changed_files.append(file_path)
|
||||
else:
|
||||
logger.debug(f"Skipped {len(unchanged_files)} unchanged symbols")
|
||||
|
||||
# Clean up removed symbols
|
||||
for file_path in removed_files:
|
||||
for doc in docs:
|
||||
self.store.delete_symbols_for_file(file_path)
|
||||
# Delete the doc files for removed files
|
||||
self._cleanup_removed_docs()
|
||||
for doc in docs
|
||||
doc_path.unlink(missing=True)
|
||||
|
||||
return stats
|
||||
|
||||
return total_symbols, total_changed_files, total_changed_symbols, total_docs_generated, total_unchanged_files, len(unchanged_files)
|
||||
|
||||
}
|
||||
|
||||
def _cleanup_removed_docs(self) -> None:
|
||||
for doc in docs:
|
||||
doc_path.unlink(missing=True)
|
||||
try:
|
||||
os.remove(doc_path)
|
||||
except OSError:
|
||||
pass
|
||||
else:
|
||||
logger.warning(f"Error removing doc file: {doc_path}: {e}")
|
||||
continue
|
||||
self.close()
|
||||
logger.info(
|
||||
f"DeepWiki generation complete - {len(files)} files, {len(symbols)} symbols"
|
||||
)
|
||||
self.store.close()
|
||||
return {
|
||||
"total_files": total_files,
|
||||
"total_symbols": total_symbols,
|
||||
"total_changed_files": total_changed_files,
|
||||
"total_changed_symbols": total_changed_symbols,
|
||||
"total_docs_generated": total_docs_generated,
|
||||
"total_unchanged_files": total_unchanged_files,
|
||||
stats = {
|
||||
"total_files": 0,
|
||||
"total_symbols": 0,
|
||||
"total_changed_files": 0,
|
||||
"total_changed_symbols": 0,
|
||||
"total_docs_generated": 0,
|
||||
"total_unchanged_files": 0,
|
||||
}
|
||||
|
||||
files_to_process = [p for p in source_root.rglob("*") if p.is_file() and p.suffix in self.SUPPORTED_EXTENSIONS]
|
||||
stats["total_files"] = len(files_to_process)
|
||||
|
||||
changed_files_count = 0
|
||||
unchanged_files_count = 0
|
||||
|
||||
for file_path in files_to_process:
|
||||
if not self._should_process_file(file_path, force):
|
||||
unchanged_files_count += 1
|
||||
continue
|
||||
|
||||
changed_files_count += 1
|
||||
try:
|
||||
source_code = file_path.read_text("utf-8")
|
||||
symbols = self._extractor.extract_symbols(source_code, file_path.suffix, str(file_path))
|
||||
|
||||
if not symbols:
|
||||
logger.debug(f"No symbols found in {file_path}")
|
||||
continue
|
||||
|
||||
logger.debug(f"Found {len(symbols)} symbols in {file_path}")
|
||||
stats["total_symbols"] += len(symbols)
|
||||
docs_generated_count = 0
|
||||
|
||||
for symbol in symbols:
|
||||
# Generate documentation
|
||||
doc_content = self._generate_markdown_for_symbol(symbol, source_code)
|
||||
|
||||
# Define doc path
|
||||
relative_path = file_path.relative_to(source_root)
|
||||
doc_path = (self._docs_dir / relative_path).with_suffix(".md")
|
||||
doc_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Save symbol and doc
|
||||
self.store.save_symbol(symbol, str(doc_path), doc_content)
|
||||
docs_generated_count += 1
|
||||
|
||||
stats["total_docs_generated"] += docs_generated_count
|
||||
stats["total_changed_symbols"] += len(symbols)
|
||||
|
||||
# Update file stats in DB
|
||||
content_hash = self._calculate_file_hash(file_path)
|
||||
self.store.update_file_stats(str(file_path), len(symbols), content_hash)
|
||||
logger.debug(f"Generated docs for {len(symbols)} symbols in {file_path}")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error processing file {file_path}: {e}")
|
||||
raise StorageError(f"Failed to process {file_path}") from e
|
||||
|
||||
stats["total_changed_files"] = changed_files_count
|
||||
stats["total_unchanged_files"] = unchanged_files_count
|
||||
|
||||
logger.info(f"Generation complete. Stats: {stats}")
|
||||
return stats
|
||||
|
||||
def close(self):
|
||||
"""Close the store connection."""
|
||||
self.store.close()
|
||||
|
||||
@@ -40,12 +40,13 @@ class MockMarkdownGenerator:
|
||||
|
||||
def generate(self, symbol: DeepWikiSymbol, source_code: str) -> str:
|
||||
"""Generate mock Markdown documentation."""
|
||||
return f"""{SYMBOL_START_TEMPLATE.format(name=symbol.name, type=symbol.symbol_type)}
|
||||
start_line, end_line = symbol.line_range
|
||||
return f"""{SYMBOL_START_TEMPLATE.format(name=symbol.name, type=symbol.type)}
|
||||
|
||||
## `{symbol.name}`
|
||||
|
||||
**Type**: {symbol.symbol_type}
|
||||
**Location**: `{symbol.source_file}:{symbol.line_start}-{symbol.line_end}`
|
||||
**Type**: {symbol.type}
|
||||
**Location**: `{symbol.source_file}:{start_line}-{end_line}`
|
||||
|
||||
```{symbol.source_file.split('.')[-1] if '.' in symbol.source_file else 'text'}
|
||||
{source_code}
|
||||
@@ -190,12 +191,11 @@ class DeepWikiGenerator:
|
||||
# Create symbol record
|
||||
symbol = DeepWikiSymbol(
|
||||
name=sym["name"],
|
||||
symbol_type=sym["type"],
|
||||
type=sym["type"],
|
||||
source_file=str(file_path),
|
||||
doc_file=f".deepwiki/{file_path.stem}.md",
|
||||
anchor=f"#{sym['name'].lower()}",
|
||||
line_start=sym["line_start"],
|
||||
line_end=sym["line_end"],
|
||||
line_range=(sym["line_start"], sym["line_end"]),
|
||||
)
|
||||
|
||||
# Generate markdown
|
||||
@@ -205,8 +205,13 @@ class DeepWikiGenerator:
|
||||
self.store.add_symbol(symbol)
|
||||
docs_generated += 1
|
||||
|
||||
# Update file hash
|
||||
self.store.update_file_hash(str(file_path), current_hash)
|
||||
# Track file hash + metadata for incremental updates and staleness checks.
|
||||
self.store.add_file(
|
||||
file_path=str(file_path),
|
||||
content_hash=current_hash,
|
||||
symbols_count=len(raw_symbols),
|
||||
docs_generated=docs_generated > 0,
|
||||
)
|
||||
|
||||
logger.info(f"Generated docs for {docs_generated} symbols in {file_path}")
|
||||
return {
|
||||
|
||||
Reference in New Issue
Block a user