Files

129 lines
5.2 KiB
Python

"""Pydantic entity models for CodexLens."""
from __future__ import annotations
import math
from enum import Enum
from typing import Any, Dict, List, Optional, Tuple
from pydantic import BaseModel, Field, field_validator
class Symbol(BaseModel):
"""A code symbol discovered in a file."""
name: str = Field(..., min_length=1)
kind: str = Field(..., min_length=1)
range: Tuple[int, int] = Field(..., description="(start_line, end_line), 1-based inclusive")
file: Optional[str] = Field(default=None, description="Full path to the file containing this symbol")
@field_validator("range")
@classmethod
def validate_range(cls, value: Tuple[int, int]) -> Tuple[int, int]:
if len(value) != 2:
raise ValueError("range must be a (start_line, end_line) tuple")
start_line, end_line = value
if start_line < 1 or end_line < 1:
raise ValueError("range lines must be >= 1")
if end_line < start_line:
raise ValueError("end_line must be >= start_line")
return value
class SemanticChunk(BaseModel):
"""A semantically meaningful chunk of content, optionally embedded."""
content: str = Field(..., min_length=1)
embedding: Optional[List[float]] = Field(default=None, description="Vector embedding for semantic search")
metadata: Dict[str, Any] = Field(default_factory=dict)
id: Optional[int] = Field(default=None, description="Database row ID")
file_path: Optional[str] = Field(default=None, description="Source file path")
@field_validator("embedding")
@classmethod
def validate_embedding(cls, value: Optional[List[float]]) -> Optional[List[float]]:
if value is None:
return value
if not value:
raise ValueError("embedding cannot be empty when provided")
norm = math.sqrt(sum(x * x for x in value))
epsilon = 1e-10
if norm < epsilon:
raise ValueError("embedding cannot be a zero vector")
return value
class IndexedFile(BaseModel):
"""An indexed source file with symbols and optional semantic chunks."""
path: str = Field(..., min_length=1)
language: str = Field(..., min_length=1)
symbols: List[Symbol] = Field(default_factory=list)
chunks: List[SemanticChunk] = Field(default_factory=list)
relationships: List["CodeRelationship"] = Field(default_factory=list)
@field_validator("path", "language")
@classmethod
def strip_and_validate_nonempty(cls, value: str) -> str:
cleaned = value.strip()
if not cleaned:
raise ValueError("value cannot be blank")
return cleaned
class RelationshipType(str, Enum):
"""Types of code relationships."""
CALL = "calls"
INHERITS = "inherits"
IMPORTS = "imports"
class CodeRelationship(BaseModel):
"""A relationship between code symbols (e.g., function calls, inheritance)."""
source_symbol: str = Field(..., min_length=1, description="Name of source symbol")
target_symbol: str = Field(..., min_length=1, description="Name of target symbol")
relationship_type: RelationshipType = Field(..., description="Type of relationship (call, inherits, etc.)")
source_file: str = Field(..., min_length=1, description="File path containing source symbol")
target_file: Optional[str] = Field(default=None, description="File path containing target (None if same file)")
source_line: int = Field(..., ge=1, description="Line number where relationship occurs (1-based)")
class AdditionalLocation(BaseModel):
"""A pointer to another location where a similar result was found.
Used for grouping search results with similar scores and content,
where the primary result is stored in SearchResult and secondary
locations are stored in this model.
"""
path: str = Field(..., min_length=1)
score: float = Field(..., ge=0.0)
start_line: Optional[int] = Field(default=None, description="Start line of the result (1-based)")
end_line: Optional[int] = Field(default=None, description="End line of the result (1-based)")
symbol_name: Optional[str] = Field(default=None, description="Name of matched symbol")
class SearchResult(BaseModel):
"""A unified search result for lexical or semantic search."""
path: str = Field(..., min_length=1)
score: float = Field(..., ge=0.0)
excerpt: Optional[str] = None
content: Optional[str] = Field(default=None, description="Full content of matched code block")
symbol: Optional[Symbol] = None
chunk: Optional[SemanticChunk] = None
metadata: Dict[str, Any] = Field(default_factory=dict)
# Additional context for complete code blocks
start_line: Optional[int] = Field(default=None, description="Start line of code block (1-based)")
end_line: Optional[int] = Field(default=None, description="End line of code block (1-based)")
symbol_name: Optional[str] = Field(default=None, description="Name of matched symbol/function/class")
symbol_kind: Optional[str] = Field(default=None, description="Kind of symbol (function/class/method)")
# Field for grouping similar results
additional_locations: List["AdditionalLocation"] = Field(
default_factory=list,
description="Other locations for grouped results with similar scores and content."
)