Files
myclaude/codeagent-wrapper/logger.go
swe-agent[bot] c6cd20d2fd fix(parallel): 修复并行执行启动横幅重复打印问题
修复 GitHub Actions 失败的测试 TestRunParallelStartupLogsPrinted。

问题根源:
- 在 main.go 中有重复的启动横幅和日志路径打印逻辑
- executeConcurrent 内部也添加了相同的打印逻辑
- 导致横幅和任务日志被打印两次

修复内容:
1. 删除 main.go 中 --parallel 处理中的重复打印代码(行 184-194)
2. 保留 executeConcurrent 中的 printTaskStart 函数,实现:
   - 在任务启动时立即打印日志路径
   - 使用 mutex 保护并发打印,确保横幅只打印一次
   - 按实际执行顺序打印任务信息

测试结果:
- TestRunParallelStartupLogsPrinted: PASS
- TestRunNonParallelOutputsIncludeLogPathsIntegration: PASS
- TestRunStartupCleanupRemovesOrphansEndToEnd: PASS

影响范围:
- 修复了 --parallel 模式下的日志输出格式
- 不影响非并行模式的执行

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2025-12-09 17:02:59 +08:00

481 lines
12 KiB
Go

package main
import (
"bufio"
"context"
"errors"
"fmt"
"os"
"path/filepath"
"strconv"
"strings"
"sync"
"sync/atomic"
"time"
)
// Logger writes log messages asynchronously to a temp file.
// It is intentionally minimal: a buffered channel + single worker goroutine
// to avoid contention while keeping ordering guarantees.
type Logger struct {
path string
file *os.File
writer *bufio.Writer
ch chan logEntry
flushReq chan chan struct{}
done chan struct{}
closed atomic.Bool
closeOnce sync.Once
workerWG sync.WaitGroup
pendingWG sync.WaitGroup
flushMu sync.Mutex
}
type logEntry struct {
level string
msg string
}
// CleanupStats captures the outcome of a cleanupOldLogs run.
type CleanupStats struct {
Scanned int
Deleted int
Kept int
Errors int
DeletedFiles []string
KeptFiles []string
}
var (
processRunningCheck = isProcessRunning
processStartTimeFn = getProcessStartTime
removeLogFileFn = os.Remove
globLogFiles = filepath.Glob
fileStatFn = os.Lstat // Use Lstat to detect symlinks
evalSymlinksFn = filepath.EvalSymlinks
)
// NewLogger creates the async logger and starts the worker goroutine.
// The log file is created under os.TempDir() using the required naming scheme.
func NewLogger() (*Logger, error) {
return NewLoggerWithSuffix("")
}
// NewLoggerWithSuffix creates a logger with an optional suffix in the filename.
// Useful for tests that need isolated log files within the same process.
func NewLoggerWithSuffix(suffix string) (*Logger, error) {
filename := fmt.Sprintf("%s-%d", primaryLogPrefix(), os.Getpid())
if suffix != "" {
filename += "-" + suffix
}
filename += ".log"
path := filepath.Clean(filepath.Join(os.TempDir(), filename))
f, err := os.OpenFile(path, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0o600)
if err != nil {
return nil, err
}
l := &Logger{
path: path,
file: f,
writer: bufio.NewWriterSize(f, 4096),
ch: make(chan logEntry, 1000),
flushReq: make(chan chan struct{}, 1),
done: make(chan struct{}),
}
l.workerWG.Add(1)
go l.run()
return l, nil
}
// Path returns the underlying log file path (useful for tests/inspection).
func (l *Logger) Path() string {
if l == nil {
return ""
}
return l.path
}
// Info logs at INFO level.
func (l *Logger) Info(msg string) { l.log("INFO", msg) }
// Warn logs at WARN level.
func (l *Logger) Warn(msg string) { l.log("WARN", msg) }
// Debug logs at DEBUG level.
func (l *Logger) Debug(msg string) { l.log("DEBUG", msg) }
// Error logs at ERROR level.
func (l *Logger) Error(msg string) { l.log("ERROR", msg) }
// Close stops the worker and syncs the log file.
// The log file is NOT removed, allowing inspection after program exit.
// It is safe to call multiple times.
// Returns after a 5-second timeout if worker doesn't stop gracefully.
func (l *Logger) Close() error {
if l == nil {
return nil
}
var closeErr error
l.closeOnce.Do(func() {
l.closed.Store(true)
close(l.done)
close(l.ch)
// Wait for worker with timeout
workerDone := make(chan struct{})
go func() {
l.workerWG.Wait()
close(workerDone)
}()
select {
case <-workerDone:
// Worker stopped gracefully
case <-time.After(5 * time.Second):
// Worker timeout - proceed with cleanup anyway
closeErr = fmt.Errorf("logger worker timeout during close")
}
if err := l.writer.Flush(); err != nil && closeErr == nil {
closeErr = err
}
if err := l.file.Sync(); err != nil && closeErr == nil {
closeErr = err
}
if err := l.file.Close(); err != nil && closeErr == nil {
closeErr = err
}
// Log file is kept for debugging - NOT removed
// Users can manually clean up /tmp/<wrapper>-*.log files
})
return closeErr
}
// RemoveLogFile removes the log file. Should only be called after Close().
func (l *Logger) RemoveLogFile() error {
if l == nil {
return nil
}
return os.Remove(l.path)
}
// Flush waits for all pending log entries to be written. Primarily for tests.
// Returns after a 5-second timeout to prevent indefinite blocking.
func (l *Logger) Flush() {
if l == nil {
return
}
l.flushMu.Lock()
defer l.flushMu.Unlock()
// Wait for pending entries with timeout
done := make(chan struct{})
go func() {
l.pendingWG.Wait()
close(done)
}()
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
select {
case <-done:
// All pending entries processed
case <-ctx.Done():
// Timeout - return without full flush
return
}
// Trigger writer flush
flushDone := make(chan struct{})
select {
case l.flushReq <- flushDone:
// Wait for flush to complete
select {
case <-flushDone:
// Flush completed
case <-time.After(1 * time.Second):
// Flush timeout
}
case <-l.done:
// Logger is closing
case <-time.After(1 * time.Second):
// Timeout sending flush request
}
}
func (l *Logger) log(level, msg string) {
if l == nil {
return
}
if l.closed.Load() {
return
}
entry := logEntry{level: level, msg: msg}
l.flushMu.Lock()
l.pendingWG.Add(1)
l.flushMu.Unlock()
select {
case l.ch <- entry:
// Successfully sent to channel
case <-l.done:
// Logger is closing, drop this entry
l.pendingWG.Done()
return
}
}
func (l *Logger) run() {
defer l.workerWG.Done()
ticker := time.NewTicker(500 * time.Millisecond)
defer ticker.Stop()
for {
select {
case entry, ok := <-l.ch:
if !ok {
// Channel closed, final flush
_ = l.writer.Flush()
return
}
timestamp := time.Now().Format("2006-01-02 15:04:05.000")
pid := os.Getpid()
fmt.Fprintf(l.writer, "[%s] [PID:%d] %s: %s\n", timestamp, pid, entry.level, entry.msg)
l.pendingWG.Done()
case <-ticker.C:
_ = l.writer.Flush()
case flushDone := <-l.flushReq:
// Explicit flush request - flush writer and sync to disk
_ = l.writer.Flush()
_ = l.file.Sync()
close(flushDone)
}
}
}
// cleanupOldLogs scans os.TempDir() for wrapper log files and removes those
// whose owning process is no longer running (i.e., orphaned logs).
// It includes safety checks for:
// - PID reuse: Compares file modification time with process start time
// - Symlink attacks: Ensures files are within TempDir and not symlinks
func cleanupOldLogs() (CleanupStats, error) {
var stats CleanupStats
tempDir := os.TempDir()
prefixes := logPrefixes()
if len(prefixes) == 0 {
prefixes = []string{defaultWrapperName}
}
seen := make(map[string]struct{})
var matches []string
for _, prefix := range prefixes {
pattern := filepath.Join(tempDir, fmt.Sprintf("%s-*.log", prefix))
found, err := globLogFiles(pattern)
if err != nil {
logWarn(fmt.Sprintf("cleanupOldLogs: failed to list logs: %v", err))
return stats, fmt.Errorf("cleanupOldLogs: %w", err)
}
for _, path := range found {
if _, ok := seen[path]; ok {
continue
}
seen[path] = struct{}{}
matches = append(matches, path)
}
}
var removeErr error
for _, path := range matches {
stats.Scanned++
filename := filepath.Base(path)
// Security check: Verify file is not a symlink and is within tempDir
if shouldSkipFile, reason := isUnsafeFile(path, tempDir); shouldSkipFile {
stats.Kept++
stats.KeptFiles = append(stats.KeptFiles, filename)
if reason != "" {
logWarn(fmt.Sprintf("cleanupOldLogs: skipping %s: %s", filename, reason))
}
continue
}
pid, ok := parsePIDFromLog(path)
if !ok {
stats.Kept++
stats.KeptFiles = append(stats.KeptFiles, filename)
continue
}
// Check if process is running
if !processRunningCheck(pid) {
// Process not running, safe to delete
if err := removeLogFileFn(path); err != nil {
if errors.Is(err, os.ErrNotExist) {
// File already deleted by another process, don't count as success
stats.Kept++
stats.KeptFiles = append(stats.KeptFiles, filename+" (already deleted)")
continue
}
stats.Errors++
logWarn(fmt.Sprintf("cleanupOldLogs: failed to remove %s: %v", filename, err))
removeErr = errors.Join(removeErr, fmt.Errorf("failed to remove %s: %w", filename, err))
continue
}
stats.Deleted++
stats.DeletedFiles = append(stats.DeletedFiles, filename)
continue
}
// Process is running, check for PID reuse
if isPIDReused(path, pid) {
// PID was reused, the log file is orphaned
if err := removeLogFileFn(path); err != nil {
if errors.Is(err, os.ErrNotExist) {
stats.Kept++
stats.KeptFiles = append(stats.KeptFiles, filename+" (already deleted)")
continue
}
stats.Errors++
logWarn(fmt.Sprintf("cleanupOldLogs: failed to remove %s (PID reused): %v", filename, err))
removeErr = errors.Join(removeErr, fmt.Errorf("failed to remove %s: %w", filename, err))
continue
}
stats.Deleted++
stats.DeletedFiles = append(stats.DeletedFiles, filename)
continue
}
// Process is running and owns this log file
stats.Kept++
stats.KeptFiles = append(stats.KeptFiles, filename)
}
if removeErr != nil {
return stats, fmt.Errorf("cleanupOldLogs: %w", removeErr)
}
return stats, nil
}
// isUnsafeFile checks if a file is unsafe to delete (symlink or outside tempDir).
// Returns (true, reason) if the file should be skipped.
func isUnsafeFile(path string, tempDir string) (bool, string) {
// Check if file is a symlink
info, err := fileStatFn(path)
if err != nil {
if errors.Is(err, os.ErrNotExist) {
return true, "" // File disappeared, skip silently
}
return true, fmt.Sprintf("stat failed: %v", err)
}
// Check if it's a symlink
if info.Mode()&os.ModeSymlink != 0 {
return true, "refusing to delete symlink"
}
// Resolve any path traversal and verify it's within tempDir
resolvedPath, err := evalSymlinksFn(path)
if err != nil {
return true, fmt.Sprintf("path resolution failed: %v", err)
}
// Get absolute path of tempDir
absTempDir, err := filepath.Abs(tempDir)
if err != nil {
return true, fmt.Sprintf("tempDir resolution failed: %v", err)
}
// Ensure resolved path is within tempDir
relPath, err := filepath.Rel(absTempDir, resolvedPath)
if err != nil || strings.HasPrefix(relPath, "..") {
return true, "file is outside tempDir"
}
return false, ""
}
// isPIDReused checks if a PID has been reused by comparing file modification time
// with process start time. Returns true if the log file was created by a different
// process that previously had the same PID.
func isPIDReused(logPath string, pid int) bool {
// Get file modification time (when log was last written)
info, err := fileStatFn(logPath)
if err != nil {
// If we can't stat the file, be conservative and keep it
return false
}
fileModTime := info.ModTime()
// Get process start time
procStartTime := processStartTimeFn(pid)
if procStartTime.IsZero() {
// Can't determine process start time
// Check if file is very old (>7 days), likely from a dead process
if time.Since(fileModTime) > 7*24*time.Hour {
return true // File is old enough to be from a different process
}
return false // Be conservative for recent files
}
// If the log file was modified before the process started, PID was reused
// Add a small buffer (1 second) to account for clock skew and file system timing
return fileModTime.Add(1 * time.Second).Before(procStartTime)
}
func parsePIDFromLog(path string) (int, bool) {
name := filepath.Base(path)
prefixes := logPrefixes()
if len(prefixes) == 0 {
prefixes = []string{defaultWrapperName}
}
for _, prefix := range prefixes {
prefixWithDash := fmt.Sprintf("%s-", prefix)
if !strings.HasPrefix(name, prefixWithDash) || !strings.HasSuffix(name, ".log") {
continue
}
core := strings.TrimSuffix(strings.TrimPrefix(name, prefixWithDash), ".log")
if core == "" {
continue
}
pidPart := core
if idx := strings.IndexRune(core, '-'); idx != -1 {
pidPart = core[:idx]
}
if pidPart == "" {
continue
}
pid, err := strconv.Atoi(pidPart)
if err != nil || pid <= 0 {
continue
}
return pid, true
}
return 0, false
}