"""Code chunking strategies for semantic search.

This module provides various chunking strategies for breaking down source code
into semantic chunks suitable for embedding and search.

Lightweight Mode:
    The ChunkConfig supports a `skip_token_count` option for performance optimization.
    When enabled, token counting uses a fast character-based estimation (char/4)
    instead of expensive tiktoken encoding.

    Use cases for lightweight mode:
    - Large-scale indexing where speed is critical
    - Scenarios where approximate token counts are acceptable
    - Memory-constrained environments
    - Initial prototyping and development

    Example:
        # Default mode (accurate tiktoken encoding)
        config = ChunkConfig()
        chunker = Chunker(config)

        # Lightweight mode (fast char/4 estimation)
        config = ChunkConfig(skip_token_count=True)
        chunker = Chunker(config)
        chunks = chunker.chunk_file(content, symbols, path, language)
"""

from __future__ import annotations

from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional, Tuple

from codexlens.entities import SemanticChunk, Symbol
from codexlens.parsers.tokenizer import get_default_tokenizer


@dataclass
class ChunkConfig:
    """Configuration for chunking strategies."""
    max_chunk_size: int = 1000  # Max characters per chunk
    overlap: int = 200  # Overlap for sliding window (increased from 100 for better context)
    strategy: str = "auto"  # Chunking strategy: auto, symbol, sliding_window, hybrid
    min_chunk_size: int = 50  # Minimum chunk size
    skip_token_count: bool = False  # Skip expensive token counting (use char/4 estimate)


class Chunker:
    """Chunk code files for semantic embedding."""

    def __init__(self, config: ChunkConfig | None = None) -> None:
        self.config = config or ChunkConfig()
        self._tokenizer = get_default_tokenizer()

    def _estimate_token_count(self, text: str) -> int:
        """Estimate token count based on config.

        If skip_token_count is True, uses character-based estimation (char/4).
        Otherwise, uses accurate tiktoken encoding.

        Args:
            text: Text to count tokens for

        Returns:
            Estimated token count
        """
        if self.config.skip_token_count:
            # Fast character-based estimation: ~4 chars per token
            return max(1, len(text) // 4)
        return self._tokenizer.count_tokens(text)

    def chunk_by_symbol(
        self,
        content: str,
        symbols: List[Symbol],
        file_path: str | Path,
        language: str,
        symbol_token_counts: Optional[dict[str, int]] = None,
    ) -> List[SemanticChunk]:
        """Chunk code by extracted symbols (functions, classes).

        Each symbol becomes one chunk with its full content.
        Large symbols exceeding max_chunk_size are recursively split using sliding window.

        Args:
            content: Source code content
            symbols: List of extracted symbols
            file_path: Path to source file
            language: Programming language
            symbol_token_counts: Optional dict mapping symbol names to token counts
        """
        chunks: List[SemanticChunk] = []
        lines = content.splitlines(keepends=True)

        for symbol in symbols:
            start_line, end_line = symbol.range
            # Convert to 0-indexed
            start_idx = max(0, start_line - 1)
            end_idx = min(len(lines), end_line)

            chunk_content = "".join(lines[start_idx:end_idx])
            if len(chunk_content.strip()) < self.config.min_chunk_size:
                continue

            # Check if symbol content exceeds max_chunk_size
            if len(chunk_content) > self.config.max_chunk_size:
                # Create line mapping for correct line number tracking
                line_mapping = list(range(start_line, end_line + 1))

                # Use sliding window to split large symbol
                sub_chunks = self.chunk_sliding_window(
                    chunk_content,
                    file_path=file_path,
                    language=language,
                    line_mapping=line_mapping
                )

                # Update sub_chunks with parent symbol metadata
                for sub_chunk in sub_chunks:
                    sub_chunk.metadata["symbol_name"] = symbol.name
                    sub_chunk.metadata["symbol_kind"] = symbol.kind
                    sub_chunk.metadata["strategy"] = "symbol_split"
                    sub_chunk.metadata["parent_symbol_range"] = (start_line, end_line)

                chunks.extend(sub_chunks)
            else:
                # Calculate token count if not provided
                token_count = None
                if symbol_token_counts and symbol.name in symbol_token_counts:
                    token_count = symbol_token_counts[symbol.name]
                else:
                    token_count = self._estimate_token_count(chunk_content)

                chunks.append(SemanticChunk(
                    content=chunk_content,
                    embedding=None,
                    metadata={
                        "file": str(file_path),
                        "language": language,
                        "symbol_name": symbol.name,
                        "symbol_kind": symbol.kind,
                        "start_line": start_line,
                        "end_line": end_line,
                        "strategy": "symbol",
                        "token_count": token_count,
                    }
                ))

        return chunks

    def chunk_sliding_window(
        self,
        content: str,
        file_path: str | Path,
        language: str,
        line_mapping: Optional[List[int]] = None,
    ) -> List[SemanticChunk]:
        """Chunk code using sliding window approach.

        Used for files without clear symbol boundaries or very long functions.

        Args:
            content: Source code content
            file_path: Path to source file
            language: Programming language
            line_mapping: Optional list mapping content line indices to original line numbers
                         (1-indexed). If provided, line_mapping[i] is the original line number
                         for the i-th line in content.
        """
        chunks: List[SemanticChunk] = []
        lines = content.splitlines(keepends=True)

        if not lines:
            return chunks

        # Calculate lines per chunk based on average line length
        avg_line_len = len(content) / max(len(lines), 1)
        lines_per_chunk = max(10, int(self.config.max_chunk_size / max(avg_line_len, 1)))
        overlap_lines = max(2, int(self.config.overlap / max(avg_line_len, 1)))
        # Ensure overlap is less than chunk size to prevent infinite loop
        overlap_lines = min(overlap_lines, lines_per_chunk - 1)

        start = 0
        chunk_idx = 0

        while start < len(lines):
            end = min(start + lines_per_chunk, len(lines))
            chunk_content = "".join(lines[start:end])

            if len(chunk_content.strip()) >= self.config.min_chunk_size:
                token_count = self._estimate_token_count(chunk_content)

                # Calculate correct line numbers
                if line_mapping:
                    # Use line mapping to get original line numbers
                    start_line = line_mapping[start]
                    end_line = line_mapping[end - 1]
                else:
                    # Default behavior: treat content as starting at line 1
                    start_line = start + 1
                    end_line = end

                chunks.append(SemanticChunk(
                    content=chunk_content,
                    embedding=None,
                    metadata={
                        "file": str(file_path),
                        "language": language,
                        "chunk_index": chunk_idx,
                        "start_line": start_line,
                        "end_line": end_line,
                        "strategy": "sliding_window",
                        "token_count": token_count,
                    }
                ))
                chunk_idx += 1

            # Move window, accounting for overlap
            step = lines_per_chunk - overlap_lines
            if step <= 0:
                step = 1  # Failsafe to prevent infinite loop
            start += step

            # Break if we've reached the end
            if end >= len(lines):
                break

        return chunks

    def chunk_file(
        self,
        content: str,
        symbols: List[Symbol],
        file_path: str | Path,
        language: str,
        symbol_token_counts: Optional[dict[str, int]] = None,
    ) -> List[SemanticChunk]:
        """Chunk a file using the best strategy.

        Uses symbol-based chunking if symbols available,
        falls back to sliding window for files without symbols.

        Args:
            content: Source code content
            symbols: List of extracted symbols
            file_path: Path to source file
            language: Programming language
            symbol_token_counts: Optional dict mapping symbol names to token counts
        """
        if symbols:
            return self.chunk_by_symbol(content, symbols, file_path, language, symbol_token_counts)
        return self.chunk_sliding_window(content, file_path, language)

class DocstringExtractor:
    """Extract docstrings from source code."""

    @staticmethod
    def extract_python_docstrings(content: str) -> List[Tuple[str, int, int]]:
        """Extract Python docstrings with their line ranges.

        Returns: List of (docstring_content, start_line, end_line) tuples
        """
        docstrings: List[Tuple[str, int, int]] = []
        lines = content.splitlines(keepends=True)

        i = 0
        while i < len(lines):
            line = lines[i]
            stripped = line.strip()
            if stripped.startswith('"""') or stripped.startswith("'''"):
                quote_type = '"""' if stripped.startswith('"""') else "'''"
                start_line = i + 1

                if stripped.count(quote_type) >= 2:
                    docstring_content = line
                    end_line = i + 1
                    docstrings.append((docstring_content, start_line, end_line))
                    i += 1
                    continue

                docstring_lines = [line]
                i += 1
                while i < len(lines):
                    docstring_lines.append(lines[i])
                    if quote_type in lines[i]:
                        break
                    i += 1

                end_line = i + 1
                docstring_content = "".join(docstring_lines)
                docstrings.append((docstring_content, start_line, end_line))

            i += 1

        return docstrings

    @staticmethod
    def extract_jsdoc_comments(content: str) -> List[Tuple[str, int, int]]:
        """Extract JSDoc comments with their line ranges.

        Returns: List of (comment_content, start_line, end_line) tuples
        """
        comments: List[Tuple[str, int, int]] = []
        lines = content.splitlines(keepends=True)

        i = 0
        while i < len(lines):
            line = lines[i]
            stripped = line.strip()

            if stripped.startswith('/**'):
                start_line = i + 1
                comment_lines = [line]
                i += 1

                while i < len(lines):
                    comment_lines.append(lines[i])
                    if '*/' in lines[i]:
                        break
                    i += 1

                end_line = i + 1
                comment_content = "".join(comment_lines)
                comments.append((comment_content, start_line, end_line))

            i += 1

        return comments

    @classmethod
    def extract_docstrings(
        cls,
        content: str,
        language: str
    ) -> List[Tuple[str, int, int]]:
        """Extract docstrings based on language.

        Returns: List of (docstring_content, start_line, end_line) tuples
        """
        if language == "python":
            return cls.extract_python_docstrings(content)
        elif language in {"javascript", "typescript"}:
            return cls.extract_jsdoc_comments(content)
        return []


class HybridChunker:
    """Hybrid chunker that prioritizes docstrings before symbol-based chunking.

    Composition-based strategy that:
    1. Extracts docstrings as dedicated chunks
    2. For remaining code, uses base chunker (symbol or sliding window)
    """

    def __init__(
        self,
        base_chunker: Chunker | None = None,
        config: ChunkConfig | None = None
    ) -> None:
        """Initialize hybrid chunker.

        Args:
            base_chunker: Chunker to use for non-docstring content
            config: Configuration for chunking
        """
        self.config = config or ChunkConfig()
        self.base_chunker = base_chunker or Chunker(self.config)
        self.docstring_extractor = DocstringExtractor()

    def _get_excluded_line_ranges(
        self,
        docstrings: List[Tuple[str, int, int]]
    ) -> set[int]:
        """Get set of line numbers that are part of docstrings."""
        excluded_lines: set[int] = set()
        for _, start_line, end_line in docstrings:
            for line_num in range(start_line, end_line + 1):
                excluded_lines.add(line_num)
        return excluded_lines

    def _filter_symbols_outside_docstrings(
        self,
        symbols: List[Symbol],
        excluded_lines: set[int]
    ) -> List[Symbol]:
        """Filter symbols to exclude those completely within docstrings."""
        filtered: List[Symbol] = []
        for symbol in symbols:
            start_line, end_line = symbol.range
            symbol_lines = set(range(start_line, end_line + 1))
            if not symbol_lines.issubset(excluded_lines):
                filtered.append(symbol)
        return filtered

    def _find_parent_symbol(
        self,
        start_line: int,
        end_line: int,
        symbols: List[Symbol],
    ) -> Optional[Symbol]:
        """Find the smallest symbol range that fully contains a docstring span."""
        candidates: List[Symbol] = []
        for symbol in symbols:
            sym_start, sym_end = symbol.range
            if sym_start <= start_line and end_line <= sym_end:
                candidates.append(symbol)
        if not candidates:
            return None
        return min(candidates, key=lambda s: (s.range[1] - s.range[0], s.range[0]))

    def chunk_file(
        self,
        content: str,
        symbols: List[Symbol],
        file_path: str | Path,
        language: str,
        symbol_token_counts: Optional[dict[str, int]] = None,
    ) -> List[SemanticChunk]:
        """Chunk file using hybrid strategy.

        Extracts docstrings first, then chunks remaining code.

        Args:
            content: Source code content
            symbols: List of extracted symbols
            file_path: Path to source file
            language: Programming language
            symbol_token_counts: Optional dict mapping symbol names to token counts
        """
        chunks: List[SemanticChunk] = []

        # Step 1: Extract docstrings as dedicated chunks
        docstrings: List[Tuple[str, int, int]] = []
        if language == "python":
            # Fast path: avoid expensive docstring extraction if delimiters are absent.
            if '"""' in content or "'''" in content:
                docstrings = self.docstring_extractor.extract_docstrings(content, language)
        elif language in {"javascript", "typescript"}:
            if "/**" in content:
                docstrings = self.docstring_extractor.extract_docstrings(content, language)
        else:
            docstrings = self.docstring_extractor.extract_docstrings(content, language)

        # Fast path: no docstrings -> delegate to base chunker directly.
        if not docstrings:
            if symbols:
                base_chunks = self.base_chunker.chunk_by_symbol(
                    content, symbols, file_path, language, symbol_token_counts
                )
            else:
                base_chunks = self.base_chunker.chunk_sliding_window(content, file_path, language)

            for chunk in base_chunks:
                chunk.metadata["strategy"] = "hybrid"
                chunk.metadata["chunk_type"] = "code"
            return base_chunks

        for docstring_content, start_line, end_line in docstrings:
            if len(docstring_content.strip()) >= self.config.min_chunk_size:
                parent_symbol = self._find_parent_symbol(start_line, end_line, symbols)
                # Use base chunker's token estimation method
                token_count = self.base_chunker._estimate_token_count(docstring_content)
                metadata = {
                    "file": str(file_path),
                    "language": language,
                    "chunk_type": "docstring",
                    "start_line": start_line,
                    "end_line": end_line,
                    "strategy": "hybrid",
                    "token_count": token_count,
                }
                if parent_symbol is not None:
                    metadata["parent_symbol"] = parent_symbol.name
                    metadata["parent_symbol_kind"] = parent_symbol.kind
                    metadata["parent_symbol_range"] = parent_symbol.range
                chunks.append(SemanticChunk(
                    content=docstring_content,
                    embedding=None,
                    metadata=metadata
                ))

        # Step 2: Get line ranges occupied by docstrings
        excluded_lines = self._get_excluded_line_ranges(docstrings)

        # Step 3: Filter symbols to exclude docstring-only ranges
        filtered_symbols = self._filter_symbols_outside_docstrings(symbols, excluded_lines)

        # Step 4: Chunk remaining content using base chunker
        if filtered_symbols:
            base_chunks = self.base_chunker.chunk_by_symbol(
                content, filtered_symbols, file_path, language, symbol_token_counts
            )
            for chunk in base_chunks:
                chunk.metadata["strategy"] = "hybrid"
                chunk.metadata["chunk_type"] = "code"
                chunks.append(chunk)
        else:
            lines = content.splitlines(keepends=True)
            remaining_lines: List[str] = []

            for i, line in enumerate(lines, start=1):
                if i not in excluded_lines:
                    remaining_lines.append(line)

            if remaining_lines:
                remaining_content = "".join(remaining_lines)
                if len(remaining_content.strip()) >= self.config.min_chunk_size:
                    base_chunks = self.base_chunker.chunk_sliding_window(
                        remaining_content, file_path, language
                    )
                    for chunk in base_chunks:
                        chunk.metadata["strategy"] = "hybrid"
                        chunk.metadata["chunk_type"] = "code"
                        chunks.append(chunk)

        return chunks