Claude-Code-Workflow/ccw/src/core/session-clustering-service.ts


/**
* Session Clustering Service
* Intelligently groups related sessions into clusters using multi-dimensional similarity analysis
*/
import { CoreMemoryStore, SessionCluster, ClusterMember, SessionMetadataCache } from './core-memory-store.js';
import { CliHistoryStore } from '../tools/cli-history-store.js';
import { StoragePaths } from '../config/storage-paths.js';
import { readdirSync, readFileSync, statSync, existsSync } from 'fs';
import { join } from 'path';
// Clustering dimension weights
const WEIGHTS = {
fileOverlap: 0.2,
temporalProximity: 0.15,
keywordSimilarity: 0.15,
vectorSimilarity: 0.3,
intentAlignment: 0.2,
};
// Clustering threshold (0.4 = moderate similarity required)
const CLUSTER_THRESHOLD = 0.4;
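// Worked example (hypothetical component scores): fileOverlap=0.5, temporalProximity=1.0,
// keywordSimilarity=0.4, vectorSimilarity=0.6, intentAlignment=0.3 gives
// 0.5*0.2 + 1.0*0.15 + 0.4*0.15 + 0.6*0.3 + 0.3*0.2 = 0.55, which clears CLUSTER_THRESHOLD (0.4)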
export interface ClusteringOptions {
scope?: 'all' | 'recent' | 'unclustered';
timeRange?: { start: string; end: string };
minClusterSize?: number;
}
export interface ClusteringResult {
clustersCreated: number;
sessionsProcessed: number;
sessionsClustered: number;
}
export class SessionClusteringService {
private coreMemoryStore: CoreMemoryStore;
private cliHistoryStore: CliHistoryStore;
private projectPath: string;
constructor(projectPath: string) {
this.projectPath = projectPath;
this.coreMemoryStore = new CoreMemoryStore(projectPath);
this.cliHistoryStore = new CliHistoryStore(projectPath);
}
/**
* Collect all session sources
*/
async collectSessions(options?: ClusteringOptions): Promise<SessionMetadataCache[]> {
const sessions: SessionMetadataCache[] = [];
// 1. Core Memories
const memories = this.coreMemoryStore.getMemories({ archived: false, limit: 1000 });
for (const memory of memories) {
const cached = this.coreMemoryStore.getSessionMetadata(memory.id);
if (cached) {
sessions.push(cached);
} else {
const metadata = this.extractMetadata(memory, 'core_memory');
sessions.push(metadata);
}
}
// 2. CLI History
const history = this.cliHistoryStore.getHistory({ limit: 1000 });
for (const exec of history.executions) {
const cached = this.coreMemoryStore.getSessionMetadata(exec.id);
if (cached) {
sessions.push(cached);
} else {
const conversation = this.cliHistoryStore.getConversation(exec.id);
if (conversation) {
const metadata = this.extractMetadata(conversation, 'cli_history');
sessions.push(metadata);
}
}
}
// 3. Workflow Sessions (WFS-*)
const workflowSessions = await this.parseWorkflowSessions();
sessions.push(...workflowSessions);
// Apply scope filter
if (options?.scope === 'recent') {
// Last 30 days
const cutoff = new Date();
cutoff.setDate(cutoff.getDate() - 30);
const cutoffStr = cutoff.toISOString();
return sessions.filter(s => (s.created_at || '') >= cutoffStr);
} else if (options?.scope === 'unclustered') {
// Only sessions not in any cluster
return sessions.filter(s => {
const clusters = this.coreMemoryStore.getSessionClusters(s.session_id);
return clusters.length === 0;
});
}
return sessions;
}
/**
* Extract metadata from a session
*/
extractMetadata(session: any, type: 'core_memory' | 'workflow' | 'cli_history' | 'native'): SessionMetadataCache {
let content = '';
let title = '';
let created_at = '';
if (type === 'core_memory') {
content = session.content || '';
created_at = session.created_at;
// Extract title from first line
const lines = content.split('\n');
title = lines[0].replace(/^#+\s*/, '').trim().substring(0, 100);
} else if (type === 'cli_history') {
// Extract from conversation turns
const turns = session.turns || [];
if (turns.length > 0) {
content = turns.map((t: any) => t.prompt).join('\n');
title = turns[0].prompt.substring(0, 100);
created_at = session.created_at || turns[0].timestamp;
}
} else if (type === 'workflow') {
content = session.content || '';
title = session.title || 'Workflow Session';
created_at = session.created_at || '';
}
const summary = content.substring(0, 200).trim();
const keywords = this.extractKeywords(content);
const file_patterns = this.extractFilePatterns(content);
const token_estimate = Math.ceil(content.length / 4);
return {
session_id: session.id,
session_type: type,
title,
summary,
keywords,
token_estimate,
file_patterns,
created_at,
last_accessed: new Date().toISOString(),
access_count: 0
};
}
/**
* Extract keywords from content
*/
private extractKeywords(content: string): string[] {
const keywords = new Set<string>();
// 1. File paths (src/xxx, .ts, .js, etc)
const filePathRegex = /(?:^|\s|["'`])((?:\.\/|\.\.\/|\/)?[\w-]+(?:\/[\w-]+)*\.[\w]+)(?:\s|["'`]|$)/g;
let match;
while ((match = filePathRegex.exec(content)) !== null) {
keywords.add(match[1]);
}
// 2. Function/Class names (camelCase, PascalCase)
const camelCaseRegex = /\b([A-Z][a-z]+(?:[A-Z][a-z]+)+|[a-z]+[A-Z][a-z]+(?:[A-Z][a-z]+)*)\b/g;
while ((match = camelCaseRegex.exec(content)) !== null) {
keywords.add(match[1]);
}
// 3. Technical terms (common frameworks/libraries/concepts)
const techTerms = [
// Frameworks
'react', 'vue', 'angular', 'typescript', 'javascript', 'node', 'express',
// Auth
'auth', 'authentication', 'jwt', 'oauth', 'session', 'token',
// Data
'api', 'rest', 'graphql', 'database', 'sql', 'mongodb', 'redis',
// Testing
'test', 'testing', 'jest', 'mocha', 'vitest',
// Development
'refactor', 'refactoring', 'optimization', 'performance',
'bug', 'fix', 'error', 'issue', 'debug',
// CCW-specific terms
'cluster', 'clustering', 'memory', 'hook', 'service', 'context',
'workflow', 'skill', 'prompt', 'embedding', 'vector', 'semantic',
'dashboard', 'view', 'route', 'command', 'cli', 'mcp'
];
const lowerContent = content.toLowerCase();
for (const term of techTerms) {
if (lowerContent.includes(term)) {
keywords.add(term);
}
}
// 4. Generic word extraction (words >= 4 chars, not stopwords)
const stopwords = new Set([
'the', 'and', 'for', 'that', 'this', 'with', 'from', 'have', 'will',
'are', 'was', 'were', 'been', 'being', 'what', 'when', 'where', 'which',
'there', 'their', 'they', 'them', 'then', 'than', 'into', 'some', 'such',
'only', 'also', 'just', 'more', 'most', 'other', 'after', 'before'
]);
const wordRegex = /\b([a-z]{4,})\b/g;
let wordMatch;
while ((wordMatch = wordRegex.exec(lowerContent)) !== null) {
const word = wordMatch[1];
if (!stopwords.has(word)) {
keywords.add(word);
}
}
// Return up to 20 keywords (file paths and identifiers first, then tech terms and generic words)
return Array.from(keywords).slice(0, 20);
}
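// Example (illustrative input): "Fix JWT auth bug in src/core/auth-service.ts" would yield
// keywords such as "src/core/auth-service.ts" (file path), "auth", "jwt", "bug", "fix" (tech terms),
// and generic words like "core" and "service"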
/**
* Extract file patterns from content
*/
private extractFilePatterns(content: string): string[] {
const patterns = new Set<string>();
// Extract directory patterns (src/xxx/, lib/xxx/)
const dirRegex = /\b((?:src|lib|test|dist|build|public|components|utils|services|config|core|tools)(?:\/[\w-]+)*)\//g;
let match;
while ((match = dirRegex.exec(content)) !== null) {
patterns.add(match[1] + '/**');
}
// Extract file extension patterns
const extRegex = /\.(\w+)(?:\s|$|["'`])/g;
const extensions = new Set<string>();
while ((match = extRegex.exec(content)) !== null) {
extensions.add(match[1]);
}
// Add extension patterns
if (extensions.size > 0) {
patterns.add(`**/*.{${Array.from(extensions).join(',')}}`);
}
return Array.from(patterns).slice(0, 10);
}
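// Example (illustrative input): content mentioning "src/core/session-clustering-service.ts" yields
// the directory pattern "src/core/**" and the extension pattern "**/*.{ts}"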
/**
* Calculate relevance score between two sessions
*/
calculateRelevance(session1: SessionMetadataCache, session2: SessionMetadataCache): number {
const fileScore = this.calculateFileOverlap(session1, session2);
const temporalScore = this.calculateTemporalProximity(session1, session2);
const keywordScore = this.calculateSemanticSimilarity(session1, session2);
const vectorScore = this.calculateVectorSimilarity(session1, session2);
const intentScore = this.calculateIntentAlignment(session1, session2);
return (
fileScore * WEIGHTS.fileOverlap +
temporalScore * WEIGHTS.temporalProximity +
keywordScore * WEIGHTS.keywordSimilarity +
vectorScore * WEIGHTS.vectorSimilarity +
intentScore * WEIGHTS.intentAlignment
);
}
/**
* Calculate file path overlap score (Jaccard similarity)
*/
private calculateFileOverlap(s1: SessionMetadataCache, s2: SessionMetadataCache): number {
const files1 = new Set(s1.file_patterns || []);
const files2 = new Set(s2.file_patterns || []);
if (files1.size === 0 || files2.size === 0) return 0;
const intersection = new Set([...files1].filter(f => files2.has(f)));
const union = new Set([...files1, ...files2]);
return intersection.size / union.size;
}
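// Jaccard example (hypothetical patterns): {"src/core/**", "**/*.{ts}"} vs {"src/core/**", "src/tools/**"}
// share 1 of 3 distinct patterns, scoring 1/3 ≈ 0.33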
/**
* Calculate temporal proximity score
* 24h: 1.0, 7d: 0.7, 30d: 0.4, >30d: 0.1
*/
private calculateTemporalProximity(s1: SessionMetadataCache, s2: SessionMetadataCache): number {
if (!s1.created_at || !s2.created_at) return 0.1;
const t1 = new Date(s1.created_at).getTime();
const t2 = new Date(s2.created_at).getTime();
const diffMs = Math.abs(t1 - t2);
const diffHours = diffMs / (1000 * 60 * 60);
if (diffHours <= 24) return 1.0;
if (diffHours <= 24 * 7) return 0.7;
if (diffHours <= 24 * 30) return 0.4;
return 0.1;
}
/**
* Calculate semantic similarity using keyword overlap (Jaccard similarity)
*/
private calculateSemanticSimilarity(s1: SessionMetadataCache, s2: SessionMetadataCache): number {
const kw1 = new Set(s1.keywords || []);
const kw2 = new Set(s2.keywords || []);
if (kw1.size === 0 || kw2.size === 0) return 0;
const intersection = new Set([...kw1].filter(k => kw2.has(k)));
const union = new Set([...kw1, ...kw2]);
return intersection.size / union.size;
}
/**
* Calculate intent alignment score
* Based on title/summary keyword matching
*/
private calculateIntentAlignment(s1: SessionMetadataCache, s2: SessionMetadataCache): number {
const text1 = ((s1.title || '') + ' ' + (s1.summary || '')).toLowerCase();
const text2 = ((s2.title || '') + ' ' + (s2.summary || '')).toLowerCase();
if (!text1 || !text2) return 0;
// Simple bag-of-words overlap (Jaccard) over title + summary text
const words1 = text1.split(/\s+/).filter(w => w.length > 3);
const words2 = text2.split(/\s+/).filter(w => w.length > 3);
const set1 = new Set(words1);
const set2 = new Set(words2);
const intersection = new Set([...set1].filter(w => set2.has(w)));
const union = new Set([...set1, ...set2]);
return intersection.size / union.size;
}
/**
* Calculate vector similarity using pre-computed embeddings from memory_chunks
* Returns cosine similarity between the two sessions' averaged chunk embeddings
*/
private calculateVectorSimilarity(s1: SessionMetadataCache, s2: SessionMetadataCache): number {
const embedding1 = this.getSessionEmbedding(s1.session_id);
const embedding2 = this.getSessionEmbedding(s2.session_id);
// Graceful fallback if no embeddings available
if (!embedding1 || !embedding2) {
return 0;
}
return this.cosineSimilarity(embedding1, embedding2);
}
/**
* Get session embedding by averaging all chunk embeddings
*/
private getSessionEmbedding(sessionId: string): number[] | null {
const chunks = this.coreMemoryStore.getChunks(sessionId);
if (chunks.length === 0) {
return null;
}
// Filter chunks that have embeddings
const embeddedChunks = chunks.filter(chunk => chunk.embedding && chunk.embedding.length > 0);
if (embeddedChunks.length === 0) {
return null;
}
// Convert Buffer embeddings to number arrays and calculate average
const embeddings = embeddedChunks.map(chunk => {
// Copy into a fresh ArrayBuffer before viewing as Float32Array (a Buffer slice may not be 4-byte aligned)
const buffer = chunk.embedding!;
const aligned = buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength);
const float32Array = new Float32Array(aligned);
return Array.from(float32Array);
});
// Check all embeddings have same dimension
const dimension = embeddings[0].length;
if (!embeddings.every(emb => emb.length === dimension)) {
console.warn(`[VectorSimilarity] Inconsistent embedding dimensions for session ${sessionId}`);
return null;
}
// Calculate average embedding
const avgEmbedding = new Array(dimension).fill(0);
for (const embedding of embeddings) {
for (let i = 0; i < dimension; i++) {
avgEmbedding[i] += embedding[i];
}
}
for (let i = 0; i < dimension; i++) {
avgEmbedding[i] /= embeddings.length;
}
return avgEmbedding;
}
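// Example (hypothetical 4-dim chunks): [0.2, 0.4, 0.0, 0.8] and [0.6, 0.0, 0.4, 0.0] average to
// [0.4, 0.2, 0.2, 0.4]; each Buffer is read as packed float32 values (4 bytes per dimension)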
/**
* Calculate cosine similarity between two vectors
*/
private cosineSimilarity(a: number[], b: number[]): number {
if (a.length !== b.length) {
console.warn('[VectorSimilarity] Vector dimension mismatch');
return 0;
}
let dotProduct = 0;
let normA = 0;
let normB = 0;
for (let i = 0; i < a.length; i++) {
dotProduct += a[i] * b[i];
normA += a[i] * a[i];
normB += b[i] * b[i];
}
normA = Math.sqrt(normA);
normB = Math.sqrt(normB);
if (normA === 0 || normB === 0) {
return 0;
}
return dotProduct / (normA * normB);
}
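// Worked example: a = [1, 0], b = [1, 1] → dot = 1, |a| = 1, |b| = √2, similarity = 1/√2 ≈ 0.707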
/**
* Find the most relevant existing cluster for a set of session IDs
* Returns the cluster with highest session overlap
*/
private findExistingClusterForSessions(sessionIds: string[]): SessionCluster | null {
if (sessionIds.length === 0) return null;
const clusterCounts = new Map<string, number>();
let maxCount = 0;
let bestClusterId: string | null = null;
for (const sessionId of sessionIds) {
const clusters = this.coreMemoryStore.getSessionClusters(sessionId);
for (const cluster of clusters) {
if (cluster.status !== 'active') continue;
const count = (clusterCounts.get(cluster.id) || 0) + 1;
clusterCounts.set(cluster.id, count);
if (count > maxCount) {
maxCount = count;
bestClusterId = cluster.id;
}
}
}
if (bestClusterId) {
return this.coreMemoryStore.getCluster(bestClusterId);
}
return null;
}
/**
* Determine if a new cluster should merge with an existing one
* Merges when more than 70% of the new cluster's sessions already belong to the existing cluster
*/
private shouldMergeWithExisting(newClusterSessions: SessionMetadataCache[], existingCluster: SessionCluster): boolean {
const MERGE_THRESHOLD = 0.7;
const existingMembers = this.coreMemoryStore.getClusterMembers(existingCluster.id);
const newSessionIds = new Set(newClusterSessions.map(s => s.session_id));
const existingSessionIds = new Set(existingMembers.map(m => m.session_id));
if (newSessionIds.size === 0) return false;
const intersection = new Set([...newSessionIds].filter(id => existingSessionIds.has(id)));
const overlapRatio = intersection.size / newSessionIds.size;
return overlapRatio > MERGE_THRESHOLD;
}
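// Example (hypothetical sessions): a new 4-session cluster sharing 3 sessions with an existing cluster
// has overlap 3/4 = 0.75 > 0.7 and merges; sharing only 2 gives 0.5 and a separate cluster is created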
/**
* Run auto-clustering algorithm
* Optimized to prevent duplicate clusters by checking existing clusters first
*/
async autocluster(options?: ClusteringOptions): Promise<ClusteringResult> {
// 1. Collect sessions based on the requested scope (no scope filter = all sources)
const allSessions = await this.collectSessions(options);
console.log(`[Clustering] Collected ${allSessions.length} sessions (scope: ${options?.scope || 'all'})`);
// 2. Filter out already-clustered sessions to prevent duplicates
const sessions = allSessions.filter(s => {
const clusters = this.coreMemoryStore.getSessionClusters(s.session_id);
return clusters.length === 0;
});
console.log(`[Clustering] ${sessions.length} unclustered sessions after filtering`);
// 3. Update metadata cache
for (const session of sessions) {
this.coreMemoryStore.upsertSessionMetadata(session);
}
// 4. Calculate relevance matrix
const n = sessions.length;
const relevanceMatrix: number[][] = Array(n).fill(0).map(() => Array(n).fill(0));
let maxScore = 0;
let avgScore = 0;
let pairCount = 0;
for (let i = 0; i < n; i++) {
for (let j = i + 1; j < n; j++) {
const score = this.calculateRelevance(sessions[i], sessions[j]);
relevanceMatrix[i][j] = score;
relevanceMatrix[j][i] = score;
if (score > maxScore) maxScore = score;
avgScore += score;
pairCount++;
}
}
if (pairCount > 0) {
avgScore = avgScore / pairCount;
console.log(`[Clustering] Relevance stats: max=${maxScore.toFixed(3)}, avg=${avgScore.toFixed(3)}, pairs=${pairCount}, threshold=${CLUSTER_THRESHOLD}`);
}
// 5. Agglomerative clustering
const minClusterSize = options?.minClusterSize || 2;
// Early return if not enough sessions
if (sessions.length < minClusterSize) {
console.log('[Clustering] Not enough unclustered sessions to form new clusters');
return { clustersCreated: 0, sessionsProcessed: allSessions.length, sessionsClustered: 0 };
}
const newPotentialClusters = this.agglomerativeClustering(sessions, relevanceMatrix, CLUSTER_THRESHOLD);
console.log(`[Clustering] Generated ${newPotentialClusters.length} potential clusters`);
// 6. Process clusters: create new or merge with existing
let clustersCreated = 0;
let clustersMerged = 0;
let sessionsClustered = 0;
for (const clusterSessions of newPotentialClusters) {
if (clusterSessions.length < minClusterSize) {
continue; // Skip small clusters
}
const sessionIds = clusterSessions.map(s => s.session_id);
const existingCluster = this.findExistingClusterForSessions(sessionIds);
// Check if we should merge with an existing cluster
if (existingCluster && this.shouldMergeWithExisting(clusterSessions, existingCluster)) {
const existingMembers = this.coreMemoryStore.getClusterMembers(existingCluster.id);
const existingSessionIds = new Set(existingMembers.map(m => m.session_id));
// Only add sessions not already in the cluster
const newSessions = clusterSessions.filter(s => !existingSessionIds.has(s.session_id));
if (newSessions.length > 0) {
newSessions.forEach((session, index) => {
this.coreMemoryStore.addClusterMember({
cluster_id: existingCluster.id,
session_id: session.session_id,
session_type: session.session_type as 'core_memory' | 'workflow' | 'cli_history' | 'native',
sequence_order: existingMembers.length + index + 1,
relevance_score: 1.0
});
});
// Update cluster description
this.coreMemoryStore.updateCluster(existingCluster.id, {
description: `Auto-generated cluster with ${existingMembers.length + newSessions.length} sessions`
});
clustersMerged++;
sessionsClustered += newSessions.length;
console.log(`[Clustering] Merged ${newSessions.length} sessions into existing cluster '${existingCluster.name}'`);
}
} else {
// Create new cluster
const clusterName = this.generateClusterName(clusterSessions);
const clusterIntent = this.generateClusterIntent(clusterSessions);
const clusterRecord = this.coreMemoryStore.createCluster({
name: clusterName,
description: `Auto-generated cluster with ${clusterSessions.length} sessions`,
intent: clusterIntent,
status: 'active'
});
// Add members
clusterSessions.forEach((session, index) => {
this.coreMemoryStore.addClusterMember({
cluster_id: clusterRecord.id,
session_id: session.session_id,
session_type: session.session_type as 'core_memory' | 'workflow' | 'cli_history' | 'native',
sequence_order: index + 1,
relevance_score: 1.0
});
});
clustersCreated++;
sessionsClustered += clusterSessions.length;
}
}
console.log(`[Clustering] Summary: ${clustersCreated} created, ${clustersMerged} merged, ${allSessions.length - sessions.length} already clustered`);
return {
clustersCreated,
sessionsProcessed: allSessions.length,
sessionsClustered
};
}
/**
* Deduplicate clusters by merging similar ones
* Clusters with the same name or at least 50% member overlap are merged
* @returns Statistics about deduplication
*/
async deduplicateClusters(): Promise<{ merged: number; deleted: number; remaining: number }> {
const clusters = this.coreMemoryStore.listClusters('active');
console.log(`[Dedup] Analyzing ${clusters.length} active clusters`);
if (clusters.length < 2) {
return { merged: 0, deleted: 0, remaining: clusters.length };
}
// Group clusters by name (case-insensitive)
const byName = new Map<string, typeof clusters>();
for (const cluster of clusters) {
const key = cluster.name.toLowerCase().trim();
if (!byName.has(key)) {
byName.set(key, []);
}
byName.get(key)!.push(cluster);
}
let merged = 0;
let deleted = 0;
// Merge clusters with same name
for (const [name, group] of byName) {
if (group.length < 2) continue;
// Sort by created_at (oldest first) to keep the original
group.sort((a, b) => a.created_at.localeCompare(b.created_at));
const target = group[0];
const sources = group.slice(1).map(c => c.id);
console.log(`[Dedup] Merging ${sources.length} duplicate clusters named '${name}' into ${target.id}`);
try {
const membersMoved = this.coreMemoryStore.mergeClusters(target.id, sources);
merged += sources.length;
console.log(`[Dedup] Moved ${membersMoved} members, deleted ${sources.length} clusters`);
} catch (error) {
console.warn(`[Dedup] Failed to merge: ${(error as Error).message}`);
}
}
// Check for clusters with high member overlap
const remainingClusters = this.coreMemoryStore.listClusters('active');
const clusterMembers = new Map<string, Set<string>>();
for (const cluster of remainingClusters) {
const members = this.coreMemoryStore.getClusterMembers(cluster.id);
clusterMembers.set(cluster.id, new Set(members.map(m => m.session_id)));
}
// Find and merge overlapping clusters
const processed = new Set<string>();
for (let i = 0; i < remainingClusters.length; i++) {
const clusterA = remainingClusters[i];
if (processed.has(clusterA.id)) continue;
const membersA = clusterMembers.get(clusterA.id)!;
const toMerge: string[] = [];
for (let j = i + 1; j < remainingClusters.length; j++) {
const clusterB = remainingClusters[j];
if (processed.has(clusterB.id)) continue;
const membersB = clusterMembers.get(clusterB.id)!;
const intersection = new Set([...membersA].filter(m => membersB.has(m)));
// Calculate overlap ratio (based on smaller cluster)
const minSize = Math.min(membersA.size, membersB.size);
if (minSize > 0 && intersection.size / minSize >= 0.5) {
toMerge.push(clusterB.id);
processed.add(clusterB.id);
}
}
if (toMerge.length > 0) {
console.log(`[Dedup] Merging ${toMerge.length} overlapping clusters into ${clusterA.id}`);
try {
this.coreMemoryStore.mergeClusters(clusterA.id, toMerge);
merged += toMerge.length;
} catch (error) {
console.warn(`[Dedup] Failed to merge overlapping: ${(error as Error).message}`);
}
}
}
// Delete empty clusters
const finalClusters = this.coreMemoryStore.listClusters('active');
for (const cluster of finalClusters) {
const members = this.coreMemoryStore.getClusterMembers(cluster.id);
if (members.length === 0) {
this.coreMemoryStore.deleteCluster(cluster.id);
deleted++;
console.log(`[Dedup] Deleted empty cluster: ${cluster.id}`);
}
}
const remaining = this.coreMemoryStore.listClusters('active').length;
console.log(`[Dedup] Complete: ${merged} merged, ${deleted} deleted, ${remaining} remaining`);
return { merged, deleted, remaining };
}
/**
* Agglomerative clustering algorithm
* Returns array of clusters (each cluster is array of sessions)
*/
private agglomerativeClustering(
sessions: SessionMetadataCache[],
relevanceMatrix: number[][],
threshold: number
): SessionMetadataCache[][] {
const n = sessions.length;
// Initialize: each session is its own cluster
const clusters: Set<number>[] = sessions.map((_, i) => new Set([i]));
while (true) {
let maxScore = -1;
let mergeI = -1;
let mergeJ = -1;
// Find pair of clusters with highest average linkage
for (let i = 0; i < clusters.length; i++) {
for (let j = i + 1; j < clusters.length; j++) {
const score = this.averageLinkage(clusters[i], clusters[j], relevanceMatrix);
if (score > maxScore) {
maxScore = score;
mergeI = i;
mergeJ = j;
}
}
}
// Stop if no pair exceeds threshold
if (maxScore < threshold) break;
// Merge clusters
const merged = new Set([...clusters[mergeI], ...clusters[mergeJ]]);
clusters.splice(mergeJ, 1); // Remove j first (higher index)
clusters.splice(mergeI, 1);
clusters.push(merged);
}
// Convert cluster indices to sessions
return clusters.map(cluster =>
Array.from(cluster).map(i => sessions[i])
);
}
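// Example (hypothetical 3-session matrix): with A-B = 0.6, A-C = 0.2, B-C = 0.3 and threshold 0.4,
// A and B merge first (0.6); the {A,B}-{C} average linkage is (0.2 + 0.3) / 2 = 0.25 < 0.4,
// so the result is two clusters: {A, B} and {C}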
/**
* Calculate average linkage between two clusters
*/
private averageLinkage(
cluster1: Set<number>,
cluster2: Set<number>,
relevanceMatrix: number[][]
): number {
let sum = 0;
let count = 0;
for (const i of cluster1) {
for (const j of cluster2) {
sum += relevanceMatrix[i][j];
count++;
}
}
return count > 0 ? sum / count : 0;
}
/**
* Generate cluster name from members
*/
private generateClusterName(members: SessionMetadataCache[]): string {
// Count keyword frequency
const keywordFreq = new Map<string, number>();
for (const member of members) {
for (const keyword of member.keywords || []) {
keywordFreq.set(keyword, (keywordFreq.get(keyword) || 0) + 1);
}
}
// Get top 2 keywords
const sorted = Array.from(keywordFreq.entries())
.sort((a, b) => b[1] - a[1])
.map(([kw]) => kw);
if (sorted.length >= 2) {
return `${sorted[0]}-${sorted[1]}`;
} else if (sorted.length === 1) {
return sorted[0];
} else {
return 'unnamed-cluster';
}
}
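// Example (hypothetical keyword counts): if "auth" appears in 3 members and "jwt" in 2,
// the cluster is named "auth-jwt"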
/**
* Generate cluster intent from members
*/
private generateClusterIntent(members: SessionMetadataCache[]): string {
// Extract common action words from titles
const actionWords = ['implement', 'refactor', 'fix', 'add', 'create', 'update', 'optimize'];
const titles = members.map(m => (m.title || '').toLowerCase());
for (const action of actionWords) {
const count = titles.filter(t => t.includes(action)).length;
if (count >= members.length / 2) {
const topic = this.generateClusterName(members);
return `${action.charAt(0).toUpperCase() + action.slice(1)} ${topic}`;
}
}
return `Work on ${this.generateClusterName(members)}`;
}
/**
* Get progressive disclosure index for hook
* @param options - Configuration options
* @param options.type - 'session-start' returns recent sessions, 'context' returns intent-matched sessions
* @param options.sessionId - Current session ID (optional)
* @param options.prompt - User prompt for intent matching (required for 'context' type)
*/
async getProgressiveIndex(options: {
type: 'session-start' | 'context';
sessionId?: string;
prompt?: string;
}): Promise<string> {
const { type, sessionId, prompt } = options;
// For session-start: return recent sessions by time
if (type === 'session-start') {
return this.getRecentSessionsIndex();
}
// For context: return intent-matched sessions based on prompt
if (type === 'context' && prompt) {
return this.getIntentMatchedIndex(prompt, sessionId);
}
// Fallback to recent sessions
return this.getRecentSessionsIndex();
}
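// Usage sketch (illustrative, assuming `service` is a SessionClusteringService instance):
//   await service.getProgressiveIndex({ type: 'session-start' })              // at session start
//   await service.getProgressiveIndex({ type: 'context', sessionId, prompt }) // on a new user prompt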
/**
* Get recent sessions index (for session-start)
* Shows sessions grouped by clusters with progressive disclosure
*/
private async getRecentSessionsIndex(): Promise<string> {
// 1. Get all active clusters
const allClusters = this.coreMemoryStore.listClusters('active');
// Sort clusters by most recent activity (based on member last_accessed)
const clustersWithActivity = allClusters.map(cluster => {
const members = this.coreMemoryStore.getClusterMembers(cluster.id);
const memberMetadata = members
.map(m => this.coreMemoryStore.getSessionMetadata(m.session_id))
.filter((m): m is SessionMetadataCache => m !== null);
const lastActivity = memberMetadata.reduce((latest, m) => {
const accessed = m.last_accessed || m.created_at || '';
return accessed > latest ? accessed : latest;
}, '');
return { cluster, members, memberMetadata, lastActivity };
}).sort((a, b) => b.lastActivity.localeCompare(a.lastActivity));
// 2. Get unclustered recent sessions
const allSessions = await this.collectSessions({ scope: 'recent' });
const clusteredSessionIds = new Set<string>();
clustersWithActivity.forEach(c => {
c.members.forEach(m => clusteredSessionIds.add(m.session_id));
});
const unclusteredSessions = allSessions
.filter(s => !clusteredSessionIds.has(s.session_id))
.sort((a, b) => (b.created_at || '').localeCompare(a.created_at || ''))
.slice(0, 3);
// 3. Build output
let output = `<ccw-session-context>\n## 📋 Session Context (Progressive Disclosure)\n\n`;
// Show top 2 active clusters
const topClusters = clustersWithActivity.slice(0, 2);
if (topClusters.length > 0) {
output += `### 🔗 Active Clusters\n\n`;
for (const { cluster, memberMetadata } of topClusters) {
output += `**${cluster.name}** (${memberMetadata.length} sessions)\n`;
if (cluster.intent) {
output += `> Intent: ${cluster.intent}\n`;
}
output += `\n| Session | Type | Title |\n|---------|------|-------|\n`;
// Show top 3 members per cluster
const displayMembers = memberMetadata.slice(0, 3);
for (const m of displayMembers) {
const type = m.session_type === 'core_memory' ? 'Core' :
m.session_type === 'workflow' ? 'Workflow' : 'CLI';
const title = (m.title || '').substring(0, 35);
output += `| ${m.session_id} | ${type} | ${title} |\n`;
}
if (memberMetadata.length > 3) {
output += `| ... | ... | +${memberMetadata.length - 3} more |\n`;
}
output += `\n`;
}
}
// Show unclustered recent sessions
if (unclusteredSessions.length > 0) {
output += `### 📝 Recent Sessions (Unclustered)\n\n`;
output += `| Session | Type | Title | Date |\n`;
output += `|---------|------|-------|------|\n`;
for (const s of unclusteredSessions) {
const type = s.session_type === 'core_memory' ? 'Core' :
s.session_type === 'workflow' ? 'Workflow' : 'CLI';
const title = (s.title || '').substring(0, 30);
const date = s.created_at ? new Date(s.created_at).toLocaleDateString() : '';
output += `| ${s.session_id} | ${type} | ${title} | ${date} |\n`;
}
output += `\n`;
}
// If nothing found
if (topClusters.length === 0 && unclusteredSessions.length === 0) {
output += `No recent sessions found. Start a new workflow to begin tracking.\n\n`;
}
// Add MCP tools reference
const topSession = topClusters[0]?.memberMetadata[0] || unclusteredSessions[0];
const topClusterId = topClusters[0]?.cluster.id;
output += `**MCP Tools**:\n\`\`\`\n`;
if (topSession) {
output += `# Resume session\nmcp__ccw-tools__core_memory({ "operation": "export", "id": "${topSession.session_id}" })\n\n`;
}
if (topClusterId) {
output += `# Load cluster context\nmcp__ccw-tools__core_memory({ "operation": "search", "query": "cluster:${topClusterId}" })\n`;
}
output += `\`\`\`\n</ccw-session-context>`;
return output;
}
/**
* Get intent-matched sessions index (for context with prompt)
* Shows sessions grouped by clusters and ranked by relevance
*/
private async getIntentMatchedIndex(prompt: string, sessionId?: string): Promise<string> {
const sessions = await this.collectSessions({ scope: 'all' });
if (sessions.length === 0) {
return `<ccw-session-context>
## 📋 Related Sessions
No sessions available for intent matching.
</ccw-session-context>`;
}
// Create a virtual session from the prompt for similarity calculation
const promptSession: SessionMetadataCache = {
session_id: 'prompt-virtual',
session_type: 'native',
title: prompt.substring(0, 100),
summary: prompt.substring(0, 200),
keywords: this.extractKeywords(prompt),
token_estimate: Math.ceil(prompt.length / 4),
file_patterns: this.extractFilePatterns(prompt),
created_at: new Date().toISOString(),
last_accessed: new Date().toISOString(),
access_count: 0
};
// Build session-to-cluster mapping
const sessionClusterMap = new Map<string, SessionCluster[]>();
const allClusters = this.coreMemoryStore.listClusters('active');
for (const cluster of allClusters) {
const members = this.coreMemoryStore.getClusterMembers(cluster.id);
for (const member of members) {
const existing = sessionClusterMap.get(member.session_id) || [];
existing.push(cluster);
sessionClusterMap.set(member.session_id, existing);
}
}
// Calculate relevance scores for all sessions
const scoredSessions = sessions
.filter(s => s.session_id !== sessionId) // Exclude current session
.map(s => ({
session: s,
score: this.calculateRelevance(promptSession, s),
clusters: sessionClusterMap.get(s.session_id) || []
}))
.filter(item => item.score >= 0.15) // Minimum relevance threshold (lowered for file-path-based keywords)
.sort((a, b) => b.score - a.score)
.slice(0, 8); // Top 8 relevant sessions
if (scoredSessions.length === 0) {
return `<ccw-session-context>
## 📋 Related Sessions
No sessions match current intent. Consider:
- Starting fresh with a new approach
- Using \`search\` to find sessions by keyword
**MCP Tools**:
\`\`\`
mcp__ccw-tools__core_memory({ "operation": "search", "query": "<keyword>" })
\`\`\`
</ccw-session-context>`;
}
// Group sessions by cluster
const clusterGroups = new Map<string, { cluster: SessionCluster; sessions: typeof scoredSessions }>();
const unclusteredSessions: typeof scoredSessions = [];
for (const item of scoredSessions) {
if (item.clusters.length > 0) {
// Add to the highest-priority cluster
const primaryCluster = item.clusters[0];
const existing = clusterGroups.get(primaryCluster.id) || { cluster: primaryCluster, sessions: [] };
existing.sessions.push(item);
clusterGroups.set(primaryCluster.id, existing);
} else {
unclusteredSessions.push(item);
}
}
// Sort cluster groups by best session score
const sortedGroups = Array.from(clusterGroups.values())
.sort((a, b) => Math.max(...b.sessions.map(s => s.score)) - Math.max(...a.sessions.map(s => s.score)));
// Generate output
let output = `<ccw-session-context>\n## 📋 Intent-Matched Sessions\n\n`;
output += `**Detected Intent**: ${(promptSession.keywords || []).slice(0, 5).join(', ') || 'General'}\n\n`;
// Show clustered sessions
if (sortedGroups.length > 0) {
output += `### 🔗 Matched Clusters\n\n`;
for (const { cluster, sessions: clusterSessions } of sortedGroups.slice(0, 2)) {
const avgScore = Math.round(clusterSessions.reduce((sum, s) => sum + s.score, 0) / clusterSessions.length * 100);
output += `**${cluster.name}** (${avgScore}% avg match)\n`;
if (cluster.intent) {
output += `> ${cluster.intent}\n`;
}
output += `\n| Session | Match | Title |\n|---------|-------|-------|\n`;
for (const item of clusterSessions.slice(0, 3)) {
const matchPct = Math.round(item.score * 100);
const title = (item.session.title || '').substring(0, 35);
output += `| ${item.session.session_id} | ${matchPct}% | ${title} |\n`;
}
output += `\n`;
}
}
// Show unclustered sessions
if (unclusteredSessions.length > 0) {
output += `### 📝 Individual Matches\n\n`;
output += `| Session | Type | Match | Title |\n`;
output += `|---------|------|-------|-------|\n`;
for (const item of unclusteredSessions.slice(0, 4)) {
const type = item.session.session_type === 'core_memory' ? 'Core' :
item.session.session_type === 'workflow' ? 'Workflow' : 'CLI';
const matchPct = Math.round(item.score * 100);
const title = (item.session.title || '').substring(0, 30);
output += `| ${item.session.session_id} | ${type} | ${matchPct}% | ${title} |\n`;
}
output += `\n`;
}
// Add MCP tools reference
const topSession = scoredSessions[0];
const topCluster = sortedGroups[0]?.cluster;
output += `**MCP Tools**:\n\`\`\`\n`;
output += `# Resume top match\nmcp__ccw-tools__core_memory({ "operation": "export", "id": "${topSession.session.session_id}" })\n`;
if (topCluster) {
output += `\n# Load cluster context\nmcp__ccw-tools__core_memory({ "operation": "search", "query": "cluster:${topCluster.id}" })\n`;
}
output += `\`\`\`\n</ccw-session-context>`;
return output;
}
/**
* Legacy method for backward compatibility
* @deprecated Use getProgressiveIndex({ type, sessionId, prompt }) instead
*/
async getProgressiveIndexLegacy(sessionId?: string): Promise<string> {
let activeCluster: SessionCluster | null = null;
let members: SessionMetadataCache[] = [];
if (sessionId) {
const clusters = this.coreMemoryStore.getSessionClusters(sessionId);
if (clusters.length > 0) {
activeCluster = clusters[0];
const clusterMembers = this.coreMemoryStore.getClusterMembers(activeCluster.id);
members = clusterMembers
.map(m => this.coreMemoryStore.getSessionMetadata(m.session_id))
.filter((m): m is SessionMetadataCache => m !== null)
.sort((a, b) => (a.created_at || '').localeCompare(b.created_at || ''));
}
}
if (!activeCluster || members.length === 0) {
return `<ccw-session-context>
## 📋 Related Sessions Index
No active cluster found. Start a new workflow or continue from recent sessions.
**MCP Tools**:
\`\`\`
# Search sessions
Use tool: mcp__ccw-tools__core_memory
Parameters: { "action": "search", "query": "<keyword>" }
# Trigger clustering
Parameters: { "action": "cluster", "scope": "auto" }
\`\`\`
</ccw-session-context>`;
}
// Generate table
let table = `| # | Session | Type | Summary | Tokens |\n`;
table += `|---|---------|------|---------|--------|\n`;
members.forEach((m, idx) => {
const type = m.session_type === 'core_memory' ? 'Core' :
m.session_type === 'workflow' ? 'Workflow' : 'CLI';
const summary = (m.summary || '').substring(0, 40);
const token = `~${m.token_estimate || 0}`;
table += `| ${idx + 1} | ${m.session_id} | ${type} | ${summary} | ${token} |\n`;
});
// Generate timeline - show multiple recent sessions
let timeline = '';
if (members.length > 0) {
const timelineEntries: string[] = [];
const displayCount = Math.min(members.length, 3); // Show last 3 sessions
for (let i = members.length - displayCount; i < members.length; i++) {
const member = members[i];
const date = member.created_at ? new Date(member.created_at).toLocaleDateString() : '';
const title = member.title?.substring(0, 30) || 'Untitled';
const isCurrent = i === members.length - 1;
const marker = isCurrent ? ' ← Current' : '';
timelineEntries.push(`${date} ─●─ ${member.session_id} (${title})${marker}`);
}
timeline = `\`\`\`\n${timelineEntries.join('\n │\n')}\n\`\`\``;
}
return `<ccw-session-context>
## 📋 Related Sessions Index
### 🔗 Active Cluster: ${activeCluster.name} (${members.length} sessions)
**Intent**: ${activeCluster.intent || 'No intent specified'}
${table}
**Resume via MCP**:
\`\`\`
Use tool: mcp__ccw-tools__core_memory
Parameters: { "action": "load", "id": "${members[members.length - 1].session_id}" }
Or load entire cluster:
{ "action": "load-cluster", "clusterId": "${activeCluster.id}" }
\`\`\`
### 📊 Timeline
${timeline}
---
**Tip**: Use \`mcp__ccw-tools__core_memory({ action: "search", query: "<keyword>" })\` to find more sessions
</ccw-session-context>`;
}
/**
* Parse workflow session files
*/
private async parseWorkflowSessions(): Promise<SessionMetadataCache[]> {
const sessions: SessionMetadataCache[] = [];
const workflowDir = join(this.projectPath, '.workflow', 'sessions');
if (!existsSync(workflowDir)) {
return sessions;
}
try {
const sessionDirs = readdirSync(workflowDir).filter(d => d.startsWith('WFS-'));
for (const sessionDir of sessionDirs) {
const sessionFile = join(workflowDir, sessionDir, 'session.json');
if (!existsSync(sessionFile)) continue;
try {
const content = readFileSync(sessionFile, 'utf8');
const sessionData = JSON.parse(content);
const metadata: SessionMetadataCache = {
session_id: sessionDir,
session_type: 'workflow',
title: sessionData.title || sessionDir,
summary: (sessionData.description || '').substring(0, 200),
keywords: this.extractKeywords(JSON.stringify(sessionData)),
token_estimate: Math.ceil(JSON.stringify(sessionData).length / 4),
file_patterns: this.extractFilePatterns(JSON.stringify(sessionData)),
created_at: sessionData.created_at || statSync(sessionFile).mtime.toISOString(),
last_accessed: new Date().toISOString(),
access_count: 0
};
sessions.push(metadata);
} catch (err) {
console.warn(`[Clustering] Failed to parse ${sessionFile}:`, err);
}
}
} catch (err) {
console.warn('[Clustering] Failed to read workflow sessions:', err);
}
return sessions;
}
/**
* Update metadata cache for all sessions
*/
async refreshMetadataCache(): Promise<number> {
const sessions = await this.collectSessions({ scope: 'all' });
for (const session of sessions) {
this.coreMemoryStore.upsertSessionMetadata(session);
}
return sessions.length;
}
}
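/*
* Usage sketch (illustrative; the project path is hypothetical):
*
*   const service = new SessionClusteringService('/path/to/project');
*   const result = await service.autocluster({ scope: 'recent', minClusterSize: 2 });
*   console.log(`${result.clustersCreated} clusters created from ${result.sessionsProcessed} sessions`);
*   await service.deduplicateClusters();
*   const index = await service.getProgressiveIndex({ type: 'session-start' });
*/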