feat: Enhance CLI output handling with structured Intermediate Representation (IR)

- Introduced `CliOutputUnit` and `IOutputParser` interfaces for unified output processing. - Implemented `PlainTextParser` and `JsonLinesParser` for parsing raw CLI output into structured units. - Updated `executeCliTool` to utilize output parsers and handle structured output. - Added `flattenOutputUnits` utility for extracting clean output from structured data. - Enhanced `ConversationTurn` and `ExecutionRecord` interfaces to include structured output. - Created comprehensive documentation for CLI Output Converter usage and integration. - Improved error handling and type mapping for various output formats.
2026-02-13 02:41:50 +08:00 · 2026-01-08 17:26:40 +08:00
parent b86cdd6644
commit d0523684e5
22 changed files with 1618 additions and 111 deletions
--- a/ccw/src/tools/cli-output-converter.ts
+++ b/ccw/src/tools/cli-output-converter.ts
@@ -0,0 +1,474 @@
+/**
+ * CLI Output Converter
+ * Converts raw CLI tool output into structured Intermediate Representation (IR)
+ *
+ * Purpose: Decouple output parsing from consumption scenarios (View, Storage, Resume)
+ * Supports: Plain text, JSON Lines, and other structured formats
+ */
+
+// ========== Type Definitions ==========
+
+/**
+ * Unified output unit types for the intermediate representation layer
+ */
+export type CliOutputUnitType =
+  | 'stdout'         // Standard output text
+  | 'stderr'         // Standard error text
+  | 'thought'        // AI reasoning/thinking
+  | 'code'           // Code block content
+  | 'file_diff'      // File modification diff
+  | 'progress'       // Progress updates
+  | 'metadata'       // Session/execution metadata
+  | 'system';        // System events/messages
+
+/**
+ * Intermediate Representation unit
+ * Common structure for all CLI output chunks
+ */
+export interface CliOutputUnit<T = any> {
+  type: CliOutputUnitType;
+  content: T;
+  timestamp: string;  // ISO 8601 format
+}
+
+// ========== Parser Interface ==========
+
+/**
+ * Parser interface for converting raw output into IR
+ */
+export interface IOutputParser {
+  /**
+   * Parse a chunk of data from stdout/stderr stream
+   * @param chunk - Raw buffer from stream
+   * @param streamType - Source stream (stdout or stderr)
+   * @returns Array of parsed output units
+   */
+  parse(chunk: Buffer, streamType: 'stdout' | 'stderr'): CliOutputUnit[];
+
+  /**
+   * Flush any remaining buffered data
+   * Called when stream ends to ensure no data is lost
+   * @returns Array of remaining output units
+   */
+  flush(): CliOutputUnit[];
+}
+
+// ========== Plain Text Parser ==========
+
+/**
+ * PlainTextParser - Converts plain text output to IR
+ * Simply wraps text in appropriate type envelope
+ */
+export class PlainTextParser implements IOutputParser {
+  parse(chunk: Buffer, streamType: 'stdout' | 'stderr'): CliOutputUnit[] {
+    const text = chunk.toString('utf8');
+
+    if (!text) {
+      return [];
+    }
+
+    return [{
+      type: streamType,
+      content: text,
+      timestamp: new Date().toISOString()
+    }];
+  }
+
+  /**
+   * Flush any remaining buffered data
+   * Called when stream ends to ensure no data is lost
+   *
+   * Note: PlainTextParser does not buffer data internally, so this method
+   * always returns an empty array. Other parsers (e.g., JsonLinesParser)
+   * may have buffered incomplete lines that need to be flushed.
+   *
+   * @returns Array of remaining output units (always empty for PlainTextParser)
+   */
+  flush(): CliOutputUnit[] {
+    // Plain text parser has no internal buffer
+    return [];
+  }
+}
+
+// ========== JSON Lines Parser ==========
+
+/**
+ * JsonLinesParser - Parses newline-delimited JSON output
+ *
+ * Features:
+ * - Handles incomplete lines across chunks
+ * - Maps JSON events to appropriate IR types
+ * - Falls back to stdout for unparseable lines
+ * - Robust error handling for malformed JSON
+ */
+export class JsonLinesParser implements IOutputParser {
+  private buffer: string = '';
+
+  parse(chunk: Buffer, streamType: 'stdout' | 'stderr'): CliOutputUnit[] {
+    const text = chunk.toString('utf8');
+    this.buffer += text;
+
+    const units: CliOutputUnit[] = [];
+    const lines = this.buffer.split('\n');
+
+    // Keep the last incomplete line in buffer
+    this.buffer = lines.pop() || '';
+
+    for (const line of lines) {
+      const trimmed = line.trim();
+      if (!trimmed) {
+        continue;
+      }
+
+      // Try to parse as JSON
+      let parsed: any;
+      try {
+        parsed = JSON.parse(trimmed);
+      } catch {
+        // Not valid JSON, treat as plain text
+        units.push({
+          type: streamType,
+          content: line,
+          timestamp: new Date().toISOString()
+        });
+        continue;
+      }
+
+      // Map JSON structure to IR type
+      const unit = this.mapJsonToIR(parsed, streamType);
+      if (unit) {
+        units.push(unit);
+      }
+    }
+
+    return units;
+  }
+
+  flush(): CliOutputUnit[] {
+    const units: CliOutputUnit[] = [];
+
+    if (this.buffer.trim()) {
+      // Try to parse remaining buffer
+      try {
+        const parsed = JSON.parse(this.buffer.trim());
+        const unit = this.mapJsonToIR(parsed, 'stdout');
+        if (unit) {
+          units.push(unit);
+        }
+      } catch {
+        // Not valid JSON, return as plain text
+        units.push({
+          type: 'stdout',
+          content: this.buffer,
+          timestamp: new Date().toISOString()
+        });
+      }
+    }
+
+    this.buffer = '';
+    return units;
+  }
+
+  /**
+   * Map parsed JSON object to appropriate IR type
+   * Handles various JSON event formats from different CLI tools
+   */
+  private mapJsonToIR(json: any, fallbackStreamType: 'stdout' | 'stderr'): CliOutputUnit | null {
+    const timestamp = json.timestamp || new Date().toISOString();
+
+    // Detect type from JSON structure
+    if (json.type) {
+      switch (json.type) {
+        case 'thought':
+        case 'thinking':
+        case 'reasoning':
+          return {
+            type: 'thought',
+            content: json.content || json.text || json.message,
+            timestamp
+          };
+
+        case 'code':
+        case 'code_block':
+          return {
+            type: 'code',
+            content: json.content || json.code,
+            timestamp
+          };
+
+        case 'diff':
+        case 'file_diff':
+        case 'file_change':
+          return {
+            type: 'file_diff',
+            content: {
+              path: json.path || json.file,
+              diff: json.diff || json.content,
+              action: json.action || 'modify'
+            },
+            timestamp
+          };
+
+        case 'progress':
+        case 'status':
+          return {
+            type: 'progress',
+            content: {
+              message: json.message || json.content,
+              progress: json.progress,
+              total: json.total
+            },
+            timestamp
+          };
+
+        case 'metadata':
+        case 'session_meta':
+          return {
+            type: 'metadata',
+            content: json.payload || json.data || json,
+            timestamp
+          };
+
+        case 'system':
+        case 'event':
+          return {
+            type: 'system',
+            content: json.message || json.content || json,
+            timestamp
+          };
+      }
+    }
+
+    // Check for Codex JSONL format
+    if (json.type === 'response_item' && json.payload) {
+      const payloadType = json.payload.type;
+
+      if (payloadType === 'message') {
+        // User or assistant message
+        const content = json.payload.content
+          ?.map((c: any) => c.text || '')
+          .filter((t: string) => t)
+          .join('\n') || '';
+
+        return {
+          type: 'stdout',
+          content,
+          timestamp
+        };
+      }
+
+      if (payloadType === 'reasoning') {
+        return {
+          type: 'thought',
+          content: json.payload.summary || json.payload.content,
+          timestamp
+        };
+      }
+
+      if (payloadType === 'function_call' || payloadType === 'function_call_output') {
+        return {
+          type: 'system',
+          content: json.payload,
+          timestamp
+        };
+      }
+    }
+
+    // Check for Gemini/Qwen message format
+    if (json.role === 'user' || json.role === 'assistant') {
+      return {
+        type: 'stdout',
+        content: json.content || json.text || '',
+        timestamp
+      };
+    }
+
+    if (json.thoughts && Array.isArray(json.thoughts)) {
+      return {
+        type: 'thought',
+        content: json.thoughts.map((t: any) =>
+          typeof t === 'string' ? t : `${t.subject}: ${t.description}`
+        ).join('\n'),
+        timestamp
+      };
+    }
+
+    // Default: treat as stdout/stderr based on fallback
+    if (json.content || json.message || json.text) {
+      return {
+        type: fallbackStreamType,
+        content: json.content || json.message || json.text,
+        timestamp
+      };
+    }
+
+    // Unrecognized structure, return as metadata
+    return {
+      type: 'metadata',
+      content: json,
+      timestamp
+    };
+  }
+}
+
+// ========== Factory Function ==========
+
+/**
+ * Create an output parser instance based on format
+ * @param format - Output format type
+ * @returns Parser instance
+ */
+export function createOutputParser(format: 'text' | 'json-lines'): IOutputParser {
+  switch (format) {
+    case 'json-lines':
+      return new JsonLinesParser();
+    case 'text':
+    default:
+      return new PlainTextParser();
+  }
+}
+
+// ========== Utility Functions ==========
+
+/**
+ * Flatten output units into plain text string
+ * Useful for Resume scenario where we need concatenated context
+ *
+ * @param units - Array of output units to flatten
+ * @param options - Filtering and formatting options
+ * @returns Concatenated text content
+ */
+export function flattenOutputUnits(
+  units: CliOutputUnit[],
+  options?: {
+    includeTypes?: CliOutputUnitType[];
+    excludeTypes?: CliOutputUnitType[];
+    includeTimestamps?: boolean;
+    separator?: string;
+  }
+): string {
+  const {
+    includeTypes,
+    excludeTypes,
+    includeTimestamps = false,
+    separator = '\n'
+  } = options || {};
+
+  // Filter units by type
+  let filtered = units;
+  if (includeTypes && includeTypes.length > 0) {
+    filtered = filtered.filter(u => includeTypes.includes(u.type));
+  }
+  if (excludeTypes && excludeTypes.length > 0) {
+    filtered = filtered.filter(u => !excludeTypes.includes(u.type));
+  }
+
+  // Convert to text
+  const lines = filtered.map(unit => {
+    let text = '';
+
+    if (includeTimestamps) {
+      text += `[${unit.timestamp}] `;
+    }
+
+    // Extract text content based on type
+    if (typeof unit.content === 'string') {
+      text += unit.content;
+    } else if (typeof unit.content === 'object' && unit.content !== null) {
+      // Handle structured content with type-specific formatting
+      switch (unit.type) {
+        case 'file_diff':
+          // Format file diff with path and diff content
+          text += `File: ${unit.content.path}\n\`\`\`diff\n${unit.content.diff}\n\`\`\``;
+          break;
+
+        case 'code':
+          // Format code block with language
+          const lang = unit.content.language || '';
+          const code = unit.content.code || unit.content;
+          text += `\`\`\`${lang}\n${typeof code === 'string' ? code : JSON.stringify(code)}\n\`\`\``;
+          break;
+
+        case 'thought':
+          // Format thought/reasoning content
+          text += `[Thought] ${typeof unit.content === 'string' ? unit.content : JSON.stringify(unit.content)}`;
+          break;
+
+        case 'progress':
+          // Format progress updates
+          if (unit.content.message) {
+            text += unit.content.message;
+            if (unit.content.progress !== undefined && unit.content.total !== undefined) {
+              text += ` (${unit.content.progress}/${unit.content.total})`;
+            }
+          } else {
+            text += JSON.stringify(unit.content);
+          }
+          break;
+
+        case 'metadata':
+        case 'system':
+          // Metadata and system events are typically excluded from prompt context
+          // Include minimal representation if they passed filtering
+          text += JSON.stringify(unit.content);
+          break;
+
+        default:
+          // Fallback for unknown structured types
+          text += JSON.stringify(unit.content);
+      }
+    } else {
+      text += String(unit.content);
+    }
+
+    return text;
+  });
+
+  return lines.join(separator);
+}
+
+/**
+ * Extract specific content type from units
+ * Convenience helper for common extraction patterns
+ */
+export function extractContent(
+  units: CliOutputUnit[],
+  type: CliOutputUnitType
+): string[] {
+  return units
+    .filter(u => u.type === type)
+    .map(u => typeof u.content === 'string' ? u.content : JSON.stringify(u.content));
+}
+
+/**
+ * Get statistics about output units
+ * Useful for debugging and analytics
+ */
+export function getOutputStats(units: CliOutputUnit[]): {
+  total: number;
+  byType: Record<CliOutputUnitType, number>;
+  firstTimestamp?: string;
+  lastTimestamp?: string;
+} {
+  const byType: Record<string, number> = {};
+  let firstTimestamp: string | undefined;
+  let lastTimestamp: string | undefined;
+
+  for (const unit of units) {
+    byType[unit.type] = (byType[unit.type] || 0) + 1;
+
+    if (!firstTimestamp || unit.timestamp < firstTimestamp) {
+      firstTimestamp = unit.timestamp;
+    }
+    if (!lastTimestamp || unit.timestamp > lastTimestamp) {
+      lastTimestamp = unit.timestamp;
+    }
+  }
+
+  return {
+    total: units.length,
+    byType: byType as Record<CliOutputUnitType, number>,
+    firstTimestamp,
+    lastTimestamp
+  };
+}