fix(executor): isolate log files per task in parallel mode

Previously, all parallel tasks shared the same log file path, making it difficult to debug individual task execution. This change creates a separate log file for each task using the naming convention: codeagent-wrapper-{pid}-{taskName}.log Changes: - Add withTaskLogger/taskLoggerFromContext for per-task logger injection - Modify executeConcurrentWithContext to create independent Logger per task - Update printTaskStart to display task-specific log paths - Extract defaultRunCodexTaskFn for proper test hook reset - Add runCodexTaskFn reset to resetTestHooks() Test coverage: 93.7% Generated with swe-agent-bot Co-Authored-By: swe-agent-bot <agent@swe-agent.ai>
2026-02-05 02:30:26 +08:00 · 2025-12-16 10:05:54 +08:00
parent 806bb04a35
commit d215c33549
7 changed files with 994 additions and 60 deletions
--- a/codeagent-wrapper/executor_concurrent_test.go
+++ b/codeagent-wrapper/executor_concurrent_test.go
@@ -1,12 +1,15 @@
 package main

 import (
+	"bufio"
 	"bytes"
 	"context"
 	"errors"
+	"fmt"
 	"io"
 	"os"
 	"os/exec"
+	"path/filepath"
 	"strings"
 	"sync"
 	"sync/atomic"
@@ -15,6 +18,12 @@ import (
 	"time"
 )

+var executorTestTaskCounter atomic.Int64
+
+func nextExecutorTestTaskID(prefix string) string {
+	return fmt.Sprintf("%s-%d", prefix, executorTestTaskCounter.Add(1))
+}
+
 type execFakeProcess struct {
 	pid     int
 	signals []os.Signal
@@ -76,6 +85,7 @@ type execFakeRunner struct {
 	stdout          io.ReadCloser
 	process         processHandle
 	stdin           io.WriteCloser
+	dir             string
 	waitErr         error
 	waitDelay       time.Duration
 	startErr        error
@@ -117,7 +127,7 @@ func (f *execFakeRunner) StdinPipe() (io.WriteCloser, error) {
 	return &writeCloserStub{}, nil
 }
 func (f *execFakeRunner) SetStderr(io.Writer) {}
-func (f *execFakeRunner) SetDir(string)      {}
+func (f *execFakeRunner) SetDir(dir string)   { f.dir = dir }
 func (f *execFakeRunner) Process() processHandle {
 	if f.process != nil {
 		return f.process
@@ -149,6 +159,10 @@ func TestExecutorHelperCoverage(t *testing.T) {
 		}
 		rcWithCmd := &realCmd{cmd: &exec.Cmd{}}
 		rcWithCmd.SetStderr(io.Discard)
+		rcWithCmd.SetDir("/tmp")
+		if rcWithCmd.cmd.Dir != "/tmp" {
+			t.Fatalf("expected SetDir to set cmd.Dir, got %q", rcWithCmd.cmd.Dir)
+		}
 		echoCmd := exec.Command("echo", "ok")
 		rcProc := &realCmd{cmd: echoCmd}
 		stdoutPipe, err := rcProc.StdoutPipe()
@@ -421,6 +435,63 @@ func TestExecutorRunCodexTaskWithContext(t *testing.T) {
 		_ = closeLogger()
 	})

+	t.Run("injectedLogger", func(t *testing.T) {
+		newCommandRunner = func(ctx context.Context, name string, args ...string) commandRunner {
+			return &execFakeRunner{
+				stdout:  newReasonReadCloser(`{"type":"item.completed","item":{"type":"agent_message","text":"injected"}}`),
+				process: &execFakeProcess{pid: 12},
+			}
+		}
+		_ = closeLogger()
+
+		injected, err := NewLoggerWithSuffix("executor-injected")
+		if err != nil {
+			t.Fatalf("NewLoggerWithSuffix() error = %v", err)
+		}
+		defer func() {
+			_ = injected.Close()
+			_ = os.Remove(injected.Path())
+		}()
+
+		ctx := withTaskLogger(context.Background(), injected)
+		res := runCodexTaskWithContext(ctx, TaskSpec{ID: "task-injected", Task: "payload", WorkDir: "."}, nil, nil, false, true, 1)
+		if res.ExitCode != 0 || res.LogPath != injected.Path() {
+			t.Fatalf("expected injected logger path, got %+v", res)
+		}
+		if activeLogger() != nil {
+			t.Fatalf("expected no global logger to be created when injected")
+		}
+
+		injected.Flush()
+		data, err := os.ReadFile(injected.Path())
+		if err != nil {
+			t.Fatalf("failed to read injected log file: %v", err)
+		}
+		if !strings.Contains(string(data), "task-injected") {
+			t.Fatalf("injected log missing task prefix, content: %s", string(data))
+		}
+	})
+
+	t.Run("backendSetsDirAndNilContext", func(t *testing.T) {
+		var rc *execFakeRunner
+		newCommandRunner = func(ctx context.Context, name string, args ...string) commandRunner {
+			rc = &execFakeRunner{
+				stdout:  newReasonReadCloser(`{"type":"item.completed","item":{"type":"agent_message","text":"backend"}}`),
+				process: &execFakeProcess{pid: 13},
+			}
+			return rc
+		}
+
+		_ = closeLogger()
+		res := runCodexTaskWithContext(nil, TaskSpec{ID: "task-backend", Task: "payload", WorkDir: "/tmp"}, ClaudeBackend{}, nil, false, false, 1)
+		if res.ExitCode != 0 || res.Message != "backend" {
+			t.Fatalf("unexpected result: %+v", res)
+		}
+		if rc == nil || rc.dir != "/tmp" {
+			t.Fatalf("expected backend to set cmd.Dir, got runner=%v dir=%q", rc, rc.dir)
+		}
+	})
+
 	t.Run("missingMessage", func(t *testing.T) {
 		newCommandRunner = func(ctx context.Context, name string, args ...string) commandRunner {
 			return &execFakeRunner{
@@ -435,6 +506,476 @@ func TestExecutorRunCodexTaskWithContext(t *testing.T) {
 	})
 }

+func TestExecutorParallelLogIsolation(t *testing.T) {
+	mainLogger, err := NewLoggerWithSuffix("executor-main")
+	if err != nil {
+		t.Fatalf("NewLoggerWithSuffix() error = %v", err)
+	}
+	setLogger(mainLogger)
+	t.Cleanup(func() {
+		_ = closeLogger()
+		_ = os.Remove(mainLogger.Path())
+	})
+
+	taskA := nextExecutorTestTaskID("iso-a")
+	taskB := nextExecutorTestTaskID("iso-b")
+	markerA := "ISOLATION_MARKER:" + taskA
+	markerB := "ISOLATION_MARKER:" + taskB
+
+	origRun := runCodexTaskFn
+	runCodexTaskFn = func(task TaskSpec, timeout int) TaskResult {
+		logger := taskLoggerFromContext(task.Context)
+		if logger == nil {
+			return TaskResult{TaskID: task.ID, ExitCode: 1, Error: "missing task logger"}
+		}
+		switch task.ID {
+		case taskA:
+			logger.Info(markerA)
+		case taskB:
+			logger.Info(markerB)
+		default:
+			logger.Info("unexpected task: " + task.ID)
+		}
+		return TaskResult{TaskID: task.ID, ExitCode: 0}
+	}
+	t.Cleanup(func() { runCodexTaskFn = origRun })
+
+	stderrR, stderrW, err := os.Pipe()
+	if err != nil {
+		t.Fatalf("os.Pipe() error = %v", err)
+	}
+	oldStderr := os.Stderr
+	os.Stderr = stderrW
+	defer func() { os.Stderr = oldStderr }()
+
+	results := executeConcurrentWithContext(nil, [][]TaskSpec{{{ID: taskA}, {ID: taskB}}}, 1, -1)
+
+	_ = stderrW.Close()
+	os.Stderr = oldStderr
+	stderrData, _ := io.ReadAll(stderrR)
+	_ = stderrR.Close()
+	stderrOut := string(stderrData)
+
+	if len(results) != 2 {
+		t.Fatalf("expected 2 results, got %d", len(results))
+	}
+
+	paths := map[string]string{}
+	for _, res := range results {
+		if res.ExitCode != 0 {
+			t.Fatalf("unexpected failure: %+v", res)
+		}
+		if res.LogPath == "" {
+			t.Fatalf("missing LogPath for task %q", res.TaskID)
+		}
+		paths[res.TaskID] = res.LogPath
+	}
+	if paths[taskA] == paths[taskB] {
+		t.Fatalf("expected distinct task log paths, got %q", paths[taskA])
+	}
+
+	if strings.Contains(stderrOut, mainLogger.Path()) {
+		t.Fatalf("stderr should not print main log path: %s", stderrOut)
+	}
+	if !strings.Contains(stderrOut, paths[taskA]) || !strings.Contains(stderrOut, paths[taskB]) {
+		t.Fatalf("stderr should include task log paths, got: %s", stderrOut)
+	}
+
+	mainLogger.Flush()
+	mainData, err := os.ReadFile(mainLogger.Path())
+	if err != nil {
+		t.Fatalf("failed to read main log: %v", err)
+	}
+	if strings.Contains(string(mainData), markerA) || strings.Contains(string(mainData), markerB) {
+		t.Fatalf("main log should not contain task markers, content: %s", string(mainData))
+	}
+
+	taskAData, err := os.ReadFile(paths[taskA])
+	if err != nil {
+		t.Fatalf("failed to read task A log: %v", err)
+	}
+	taskBData, err := os.ReadFile(paths[taskB])
+	if err != nil {
+		t.Fatalf("failed to read task B log: %v", err)
+	}
+	if !strings.Contains(string(taskAData), markerA) || strings.Contains(string(taskAData), markerB) {
+		t.Fatalf("task A log isolation failed, content: %s", string(taskAData))
+	}
+	if !strings.Contains(string(taskBData), markerB) || strings.Contains(string(taskBData), markerA) {
+		t.Fatalf("task B log isolation failed, content: %s", string(taskBData))
+	}
+
+	_ = os.Remove(paths[taskA])
+	_ = os.Remove(paths[taskB])
+}
+
+func TestConcurrentExecutorParallelLogIsolationAndClosure(t *testing.T) {
+	tempDir := t.TempDir()
+	t.Setenv("TMPDIR", tempDir)
+
+	oldArgs := os.Args
+	os.Args = []string{defaultWrapperName}
+	t.Cleanup(func() { os.Args = oldArgs })
+
+	mainLogger, err := NewLoggerWithSuffix("concurrent-main")
+	if err != nil {
+		t.Fatalf("NewLoggerWithSuffix() error = %v", err)
+	}
+	setLogger(mainLogger)
+	t.Cleanup(func() {
+		mainLogger.Flush()
+		_ = closeLogger()
+		_ = os.Remove(mainLogger.Path())
+	})
+
+	const taskCount = 16
+	const writersPerTask = 4
+	const logsPerWriter = 50
+	const expectedTaskLines = writersPerTask * logsPerWriter
+
+	taskIDs := make([]string, 0, taskCount)
+	tasks := make([]TaskSpec, 0, taskCount)
+	for i := 0; i < taskCount; i++ {
+		id := nextExecutorTestTaskID("iso")
+		taskIDs = append(taskIDs, id)
+		tasks = append(tasks, TaskSpec{ID: id})
+	}
+
+	type taskLoggerInfo struct {
+		taskID string
+		logger *Logger
+	}
+	loggerCh := make(chan taskLoggerInfo, taskCount)
+	readyCh := make(chan struct{}, taskCount)
+	startCh := make(chan struct{})
+
+	go func() {
+		for i := 0; i < taskCount; i++ {
+			<-readyCh
+		}
+		close(startCh)
+	}()
+
+	origRun := runCodexTaskFn
+	runCodexTaskFn = func(task TaskSpec, timeout int) TaskResult {
+		readyCh <- struct{}{}
+
+		logger := taskLoggerFromContext(task.Context)
+		loggerCh <- taskLoggerInfo{taskID: task.ID, logger: logger}
+		if logger == nil {
+			return TaskResult{TaskID: task.ID, ExitCode: 1, Error: "missing task logger"}
+		}
+
+		<-startCh
+
+		var wg sync.WaitGroup
+		wg.Add(writersPerTask)
+		for g := 0; g < writersPerTask; g++ {
+			go func(g int) {
+				defer wg.Done()
+				for i := 0; i < logsPerWriter; i++ {
+					logger.Info(fmt.Sprintf("TASK=%s g=%d i=%d", task.ID, g, i))
+				}
+			}(g)
+		}
+		wg.Wait()
+
+		return TaskResult{TaskID: task.ID, ExitCode: 0}
+	}
+	t.Cleanup(func() { runCodexTaskFn = origRun })
+
+	results := executeConcurrentWithContext(context.Background(), [][]TaskSpec{tasks}, 1, 0)
+
+	if len(results) != taskCount {
+		t.Fatalf("expected %d results, got %d", taskCount, len(results))
+	}
+
+	taskLogPaths := make(map[string]string, taskCount)
+	seenPaths := make(map[string]struct{}, taskCount)
+	for _, res := range results {
+		if res.ExitCode != 0 || res.Error != "" {
+			t.Fatalf("unexpected task failure: %+v", res)
+		}
+		if res.LogPath == "" {
+			t.Fatalf("missing LogPath for task %q", res.TaskID)
+		}
+		if _, ok := taskLogPaths[res.TaskID]; ok {
+			t.Fatalf("duplicate TaskID in results: %q", res.TaskID)
+		}
+		taskLogPaths[res.TaskID] = res.LogPath
+		if _, ok := seenPaths[res.LogPath]; ok {
+			t.Fatalf("expected unique log path per task; duplicate path %q", res.LogPath)
+		}
+		seenPaths[res.LogPath] = struct{}{}
+	}
+	if len(taskLogPaths) != taskCount {
+		t.Fatalf("expected %d unique task IDs, got %d", taskCount, len(taskLogPaths))
+	}
+
+	prefix := primaryLogPrefix()
+	pid := os.Getpid()
+	for _, id := range taskIDs {
+		path := taskLogPaths[id]
+		if path == "" {
+			t.Fatalf("missing log path for task %q", id)
+		}
+		if _, err := os.Stat(path); err != nil {
+			t.Fatalf("task log file not created for %q: %v", id, err)
+		}
+		wantBase := fmt.Sprintf("%s-%d-%s.log", prefix, pid, id)
+		if got := filepath.Base(path); got != wantBase {
+			t.Fatalf("unexpected log filename for %q: got %q, want %q", id, got, wantBase)
+		}
+	}
+
+	loggers := make(map[string]*Logger, taskCount)
+	for i := 0; i < taskCount; i++ {
+		info := <-loggerCh
+		if info.taskID == "" {
+			t.Fatalf("missing taskID in logger info")
+		}
+		if info.logger == nil {
+			t.Fatalf("missing logger in context for task %q", info.taskID)
+		}
+		if prev, ok := loggers[info.taskID]; ok && prev != info.logger {
+			t.Fatalf("task %q received multiple logger instances", info.taskID)
+		}
+		loggers[info.taskID] = info.logger
+	}
+	if len(loggers) != taskCount {
+		t.Fatalf("expected %d task loggers, got %d", taskCount, len(loggers))
+	}
+
+	for taskID, logger := range loggers {
+		if !logger.closed.Load() {
+			t.Fatalf("expected task logger to be closed for %q", taskID)
+		}
+		if logger.file == nil {
+			t.Fatalf("expected task logger file to be non-nil for %q", taskID)
+		}
+		if _, err := logger.file.Write([]byte("x")); err == nil {
+			t.Fatalf("expected task logger file to be closed for %q", taskID)
+		}
+	}
+
+	mainLogger.Flush()
+	mainData, err := os.ReadFile(mainLogger.Path())
+	if err != nil {
+		t.Fatalf("failed to read main log: %v", err)
+	}
+	mainText := string(mainData)
+	if !strings.Contains(mainText, "parallel: worker_limit=") {
+		t.Fatalf("expected main log to include concurrency planning, content: %s", mainText)
+	}
+	if strings.Contains(mainText, "TASK=") {
+		t.Fatalf("main log should not contain task output, content: %s", mainText)
+	}
+
+	for taskID, path := range taskLogPaths {
+		f, err := os.Open(path)
+		if err != nil {
+			t.Fatalf("failed to open task log for %q: %v", taskID, err)
+		}
+
+		scanner := bufio.NewScanner(f)
+		lines := 0
+		for scanner.Scan() {
+			line := scanner.Text()
+			if strings.Contains(line, "parallel:") {
+				t.Fatalf("task log should not contain main log entries for %q: %s", taskID, line)
+			}
+			gotID, ok := parseTaskIDFromLogLine(line)
+			if !ok {
+				t.Fatalf("task log entry missing task marker for %q: %s", taskID, line)
+			}
+			if gotID != taskID {
+				t.Fatalf("task log isolation failed: file=%q got TASK=%q want TASK=%q", path, gotID, taskID)
+			}
+			lines++
+		}
+		if err := scanner.Err(); err != nil {
+			_ = f.Close()
+			t.Fatalf("scanner error for %q: %v", taskID, err)
+		}
+		if err := f.Close(); err != nil {
+			t.Fatalf("failed to close task log for %q: %v", taskID, err)
+		}
+		if lines != expectedTaskLines {
+			t.Fatalf("unexpected task log line count for %q: got %d, want %d", taskID, lines, expectedTaskLines)
+		}
+	}
+
+	for _, path := range taskLogPaths {
+		_ = os.Remove(path)
+	}
+}
+
+func parseTaskIDFromLogLine(line string) (string, bool) {
+	const marker = "TASK="
+	idx := strings.Index(line, marker)
+	if idx == -1 {
+		return "", false
+	}
+	rest := line[idx+len(marker):]
+	end := strings.IndexByte(rest, ' ')
+	if end == -1 {
+		return rest, rest != ""
+	}
+	return rest[:end], rest[:end] != ""
+}
+
+func TestExecutorTaskLoggerContext(t *testing.T) {
+	if taskLoggerFromContext(nil) != nil {
+		t.Fatalf("expected nil logger from nil context")
+	}
+	if taskLoggerFromContext(context.Background()) != nil {
+		t.Fatalf("expected nil logger when context has no logger")
+	}
+
+	logger, err := NewLoggerWithSuffix("executor-taskctx")
+	if err != nil {
+		t.Fatalf("NewLoggerWithSuffix() error = %v", err)
+	}
+	defer func() {
+		_ = logger.Close()
+		_ = os.Remove(logger.Path())
+	}()
+
+	ctx := withTaskLogger(context.Background(), logger)
+	if got := taskLoggerFromContext(ctx); got != logger {
+		t.Fatalf("expected logger roundtrip, got %v", got)
+	}
+
+	if taskLoggerFromContext(withTaskLogger(context.Background(), nil)) != nil {
+		t.Fatalf("expected nil logger when injected logger is nil")
+	}
+}
+
+func TestExecutorExecuteConcurrentWithContextBranches(t *testing.T) {
+	devNull, err := os.OpenFile(os.DevNull, os.O_WRONLY, 0)
+	if err != nil {
+		t.Fatalf("failed to open %s: %v", os.DevNull, err)
+	}
+	oldStderr := os.Stderr
+	os.Stderr = devNull
+	t.Cleanup(func() {
+		os.Stderr = oldStderr
+		_ = devNull.Close()
+	})
+
+	t.Run("skipOnFailedDependencies", func(t *testing.T) {
+		root := nextExecutorTestTaskID("root")
+		child := nextExecutorTestTaskID("child")
+
+		orig := runCodexTaskFn
+		runCodexTaskFn = func(task TaskSpec, timeout int) TaskResult {
+			if task.ID == root {
+				return TaskResult{TaskID: task.ID, ExitCode: 1, Error: "boom"}
+			}
+			return TaskResult{TaskID: task.ID, ExitCode: 0}
+		}
+		t.Cleanup(func() { runCodexTaskFn = orig })
+
+		results := executeConcurrentWithContext(context.Background(), [][]TaskSpec{
+			{{ID: root}},
+			{{ID: child, Dependencies: []string{root}}},
+		}, 1, 0)
+
+		foundChild := false
+		for _, res := range results {
+			if res.LogPath != "" {
+				_ = os.Remove(res.LogPath)
+			}
+			if res.TaskID != child {
+				continue
+			}
+			foundChild = true
+			if res.ExitCode == 0 || !strings.Contains(res.Error, "skipped") {
+				t.Fatalf("expected skipped child task result, got %+v", res)
+			}
+		}
+		if !foundChild {
+			t.Fatalf("expected child task to be present in results")
+		}
+	})
+
+	t.Run("panicRecovered", func(t *testing.T) {
+		taskID := nextExecutorTestTaskID("panic")
+
+		orig := runCodexTaskFn
+		runCodexTaskFn = func(task TaskSpec, timeout int) TaskResult {
+			panic("boom")
+		}
+		t.Cleanup(func() { runCodexTaskFn = orig })
+
+		results := executeConcurrentWithContext(context.Background(), [][]TaskSpec{{{ID: taskID}}}, 1, 0)
+		if len(results) != 1 {
+			t.Fatalf("expected 1 result, got %d", len(results))
+		}
+		if results[0].ExitCode == 0 || !strings.Contains(results[0].Error, "panic") {
+			t.Fatalf("expected panic result, got %+v", results[0])
+		}
+		if results[0].LogPath == "" {
+			t.Fatalf("expected LogPath on panic result")
+		}
+		_ = os.Remove(results[0].LogPath)
+	})
+
+	t.Run("cancelWhileWaitingForWorker", func(t *testing.T) {
+		task1 := nextExecutorTestTaskID("slot")
+		task2 := nextExecutorTestTaskID("slot")
+
+		parentCtx, cancel := context.WithCancel(context.Background())
+		started := make(chan struct{})
+		unblock := make(chan struct{})
+		var startedOnce sync.Once
+
+		orig := runCodexTaskFn
+		runCodexTaskFn = func(task TaskSpec, timeout int) TaskResult {
+			startedOnce.Do(func() { close(started) })
+			<-unblock
+			return TaskResult{TaskID: task.ID, ExitCode: 0}
+		}
+		t.Cleanup(func() { runCodexTaskFn = orig })
+
+		go func() {
+			<-started
+			cancel()
+			time.Sleep(50 * time.Millisecond)
+			close(unblock)
+		}()
+
+		results := executeConcurrentWithContext(parentCtx, [][]TaskSpec{{{ID: task1}, {ID: task2}}}, 1, 1)
+		foundCancelled := false
+		for _, res := range results {
+			if res.LogPath != "" {
+				_ = os.Remove(res.LogPath)
+			}
+			if res.ExitCode == 130 {
+				foundCancelled = true
+			}
+		}
+		if !foundCancelled {
+			t.Fatalf("expected a task to be cancelled")
+		}
+	})
+
+	t.Run("loggerCreateFails", func(t *testing.T) {
+		taskID := nextExecutorTestTaskID("bad") + "/id"
+
+		orig := runCodexTaskFn
+		runCodexTaskFn = func(task TaskSpec, timeout int) TaskResult {
+			return TaskResult{TaskID: task.ID, ExitCode: 0}
+		}
+		t.Cleanup(func() { runCodexTaskFn = orig })
+
+		results := executeConcurrentWithContext(context.Background(), [][]TaskSpec{{{ID: taskID}}}, 1, 0)
+		if len(results) != 1 || results[0].ExitCode != 0 {
+			t.Fatalf("unexpected results: %+v", results)
+		}
+	})
+}
+
 func TestExecutorSignalAndTermination(t *testing.T) {
 	forceKillDelay.Store(0)
 	defer forceKillDelay.Store(5)