Files
myclaude/codeagent-wrapper/concurrent_stress_test.go
ben fe5508228f fix: 修复多 backend 并行日志 PID 混乱并移除包装格式 (#74) (#76)
* fix(logger): 修复多 backend 并行日志 PID 混乱并移除包装格式

**问题:**
- logger.go:288 使用 os.Getpid() 导致并行任务日志 PID 混乱
- 日志文件添加时间戳/PID/级别前缀包装,应输出 backend 原始内容

**修复:**
1. Logger 结构体添加 pid 字段,创建时捕获 PID
2. 日志写入使用固定 l.pid 替代 os.Getpid()
3. 移除日志输出格式包装,直接写入原始消息
4. 添加内存缓存 ERROR/WARN 条目,ExtractRecentErrors 从缓存读取
5. 优化 executor.go context 初始化顺序,避免重复创建 logger

**测试:**
- 所有测试通过(23.7s)
- 更新相关测试用例匹配新格式

Closes #74

* fix(logger): 增强并发日志隔离和 task ID 清理

## 核心修复

### 1. Task ID Sanitization (logger.go)
- 新增 sanitizeLogSuffix(): 清理非法字符 (/, \, :, 等)
- 新增 fallbackLogSuffix(): 为空/非法 ID 生成唯一后备名
- 新增 isSafeLogRune(): 仅允许 [A-Za-z0-9._-]
- 路径穿越防护: ../../../etc/passwd → etc-passwd-{hash}.log
- 超长 ID 处理: 截断到 64 字符 + hash 确保唯一性
- 自动创建 TMPDIR (MkdirAll)

### 2. 共享日志标识 (executor.go)
- 新增 taskLoggerHandle 结构: 封装 logger、路径、共享标志
- 新增 newTaskLoggerHandle(): 统一处理 logger 创建和回退
- printTaskStart(): 显示 "Log (shared)" 标识
- generateFinalOutput(): 在 summary 中标记共享日志
- 并发失败时明确标识所有任务使用共享主日志

### 3. 内部标志 (config.go)
- TaskResult.sharedLog: 非导出字段,标识共享日志状态

### 4. Race Detector 修复 (logger.go:209-219)
- Close() 在关闭 channel 前先等待 pendingWG
- 消除 Logger.Close() 与 Logger.log() 之间的竞态条件

## 测试覆盖

### 新增测试 (logger_suffix_test.go)
- TestLoggerWithSuffixSanitizesUnsafeSuffix: 非法字符清理
- TestLoggerWithSuffixReturnsErrorWhenTempDirNotWritable: 只读目录处理

### 新增测试 (executor_concurrent_test.go)
- TestConcurrentTaskLoggerFailure: 多任务失败时共享日志标识
- TestSanitizeTaskID: 并发场景下 task ID 清理验证

## 验证结果

- 所有单元测试通过
- Race detector 无竞态 (65.4s)
- 路径穿越攻击防护
- 并发日志完全隔离
- 边界情况正确处理

Resolves: PR #76 review feedback
Co-Authored-By: Codex Review <codex@anthropic.ai>

Generated with swe-agent-bot

Co-Authored-By: swe-agent-bot <agent@swe-agent.ai>

* fix(logger): 修复关键 bug 并优化日志系统 (v5.2.5)

修复 P0 级别问题:
- sanitizeLogSuffix 的 trim 碰撞(防止多 task 日志文件名冲突)
- ExtractRecentErrors 边界检查(防止 slice 越界)
- Logger.Close 阻塞风险(新增可配置超时机制)

代码质量改进:
- 删除无用字段 Logger.pid 和 logEntry.level
- 优化 sharedLog 标记绑定到最终 LogPath
- 移除日志前缀,直接输出 backend 原始内容

测试覆盖增强:
- 新增 4 个测试用例(碰撞防护、边界检查、缓存上限、shared 判定)
- 优化测试注释和逻辑

版本更新:5.2.4 → 5.2.5

Generated with swe-agent-bot

Co-Authored-By: swe-agent-bot <agent@swe-agent.ai>

---------

Co-authored-by: swe-agent-bot <agent@swe-agent.ai>
2025-12-17 10:33:38 +08:00

424 lines
11 KiB
Go

package main
import (
"bufio"
"context"
"fmt"
"os"
"regexp"
"strings"
"sync"
"sync/atomic"
"testing"
"time"
)
// TestConcurrentStressLogger drives a single logger from many goroutines at
// once and verifies that enough messages reach the log file in raw,
// prefix-free form. Loss is tolerated: the test only requires that at least
// 10% of the messages were persisted.
func TestConcurrentStressLogger(t *testing.T) {
	if testing.Short() {
		t.Skip("skipping stress test in short mode")
	}

	const (
		numGoroutines  = 100  // number of concurrent writers
		logsPerRoutine = 1000 // messages emitted by each writer
		totalExpected  = numGoroutines * logsPerRoutine
		minPersisted   = totalExpected / 10
		sampleSize     = 10 // leading lines checked against the expected format
	)

	logger, err := NewLoggerWithSuffix("stress")
	if err != nil {
		t.Fatal(err)
	}
	defer logger.Close()
	t.Logf("Log file: %s", logger.Path())

	// Fan out the writers and time the whole run, flush included.
	var writers sync.WaitGroup
	writers.Add(numGoroutines)
	began := time.Now()
	for w := 0; w < numGoroutines; w++ {
		go func(id int) {
			defer writers.Done()
			for seq := 0; seq < logsPerRoutine; seq++ {
				logger.Info(fmt.Sprintf("goroutine-%d-msg-%d", id, seq))
			}
		}(w)
	}
	writers.Wait()
	logger.Flush()
	took := time.Since(began)

	// Read the log file back and count what was persisted.
	raw, err := os.ReadFile(logger.Path())
	if err != nil {
		t.Fatalf("failed to read log file: %v", err)
	}
	lines := strings.Split(strings.TrimSpace(string(raw)), "\n")
	persisted := len(lines)

	t.Logf("Concurrent stress test results:")
	t.Logf(" Goroutines: %d", numGoroutines)
	t.Logf(" Logs per goroutine: %d", logsPerRoutine)
	t.Logf(" Total expected: %d", totalExpected)
	t.Logf(" Total actual: %d", persisted)
	t.Logf(" Duration: %v", took)
	t.Logf(" Throughput: %.2f logs/sec", float64(totalExpected)/took.Seconds())

	// Check the persisted message count against the loss tolerance.
	if persisted < minPersisted {
		t.Errorf("too many logs lost: got %d, want at least %d (10%% of %d)",
			persisted, minPersisted, totalExpected)
	}
	t.Logf("Successfully wrote %d/%d logs (%.1f%%)",
		persisted, totalExpected, float64(persisted)/float64(totalExpected)*100)

	// Check the line format (plain text, no prefix) on the leading lines only.
	linePattern := regexp.MustCompile(`^goroutine-\d+-msg-\d+$`)
	sample := lines
	if len(sample) > sampleSize {
		sample = sample[:sampleSize]
	}
	for idx, line := range sample {
		if linePattern.MatchString(line) {
			continue
		}
		t.Errorf("line %d has invalid format: %s", idx, line)
	}
}
// TestConcurrentBurstLogger simulates bursty traffic: several waves of
// goroutines write to one logger, separated by a short pause. It then checks
// that at least 10% of the messages reached the log file.
func TestConcurrentBurstLogger(t *testing.T) {
	if testing.Short() {
		t.Skip("skipping burst test in short mode")
	}

	const (
		numBursts          = 10
		goroutinesPerBurst = 50
		logsPerGoroutine   = 100
	)
	// Every burst starts the same number of writers, so the expected total
	// is known up front.
	totalLogs := numBursts * goroutinesPerBurst * logsPerGoroutine

	logger, err := NewLoggerWithSuffix("burst")
	if err != nil {
		t.Fatal(err)
	}
	defer logger.Close()
	t.Logf("Log file: %s", logger.Path())

	// runBurst starts one wave of writers and blocks until all of them finish.
	runBurst := func(b int) {
		var wave sync.WaitGroup
		wave.Add(goroutinesPerBurst)
		for g := 0; g < goroutinesPerBurst; g++ {
			go func(b, g int) {
				defer wave.Done()
				for seq := 0; seq < logsPerGoroutine; seq++ {
					logger.Info(fmt.Sprintf("burst-%d-goroutine-%d-msg-%d", b, g, seq))
				}
			}(b, g)
		}
		wave.Wait()
	}

	began := time.Now()
	for b := 0; b < numBursts; b++ {
		runBurst(b)
		time.Sleep(10 * time.Millisecond) // gap between bursts
	}
	logger.Flush()
	took := time.Since(began)

	// Read the log file back and count what was persisted.
	raw, err := os.ReadFile(logger.Path())
	if err != nil {
		t.Fatalf("failed to read log file: %v", err)
	}
	persisted := len(strings.Split(strings.TrimSpace(string(raw)), "\n"))

	t.Logf("Burst test results:")
	t.Logf(" Total bursts: %d", numBursts)
	t.Logf(" Goroutines per burst: %d", goroutinesPerBurst)
	t.Logf(" Expected logs: %d", totalLogs)
	t.Logf(" Actual logs: %d", persisted)
	t.Logf(" Duration: %v", took)
	t.Logf(" Throughput: %.2f logs/sec", float64(totalLogs)/took.Seconds())

	if minPersisted := totalLogs / 10; persisted < minPersisted {
		t.Errorf("too many logs lost: got %d, want at least %d (10%% of %d)", persisted, minPersisted, totalLogs)
	}
	t.Logf("Successfully wrote %d/%d logs (%.1f%%)",
		persisted, totalLogs, float64(persisted)/float64(totalLogs)*100)
}
// TestLoggerChannelCapacity writes messages from a single goroutine as fast
// as possible, in a quantity meant to exceed the logger's channel capacity,
// and checks that at least 10% of them are persisted.
func TestLoggerChannelCapacity(t *testing.T) {
	// The original note puts the channel capacity at 1000, so 2000 rapid
	// writes are intended to overshoot it.
	const (
		rapidLogs    = 2000
		minPersisted = rapidLogs / 10
	)

	logger, err := NewLoggerWithSuffix("capacity")
	if err != nil {
		t.Fatal(err)
	}
	defer logger.Close()

	began := time.Now()
	for n := 0; n < rapidLogs; n++ {
		logger.Info(fmt.Sprintf("rapid-log-%d", n))
	}
	sendDuration := time.Since(began)
	logger.Flush()
	// Flush time is whatever elapsed beyond the send phase.
	flushDuration := time.Since(began) - sendDuration

	t.Logf("Channel capacity test:")
	t.Logf(" Logs sent: %d", rapidLogs)
	t.Logf(" Send duration: %v", sendDuration)
	t.Logf(" Flush duration: %v", flushDuration)

	// A reasonable share of the messages must still be on disk; per the
	// original note, the logger's non-blocking mode may drop some.
	raw, err := os.ReadFile(logger.Path())
	if err != nil {
		t.Fatal(err)
	}
	persisted := len(strings.Split(strings.TrimSpace(string(raw)), "\n"))
	if persisted < minPersisted {
		t.Errorf("too many logs lost: got %d, want at least %d (10%% of %d)", persisted, minPersisted, rapidLogs)
	}
	t.Logf("Logs persisted: %d/%d (%.1f%%)", persisted, rapidLogs, float64(persisted)/float64(rapidLogs)*100)
}
// TestLoggerMemoryUsage writes a large number of long messages and checks
// that the resulting log file holds at least 10% of the theoretical payload
// size.
func TestLoggerMemoryUsage(t *testing.T) {
	const (
		numLogs     = 20000
		messageSize = 500 // bytes of filler in every message
	)

	logger, err := NewLoggerWithSuffix("memory")
	if err != nil {
		t.Fatal(err)
	}
	defer logger.Close()

	filler := strings.Repeat("x", messageSize)

	began := time.Now()
	for n := 0; n < numLogs; n++ {
		logger.Info(fmt.Sprintf("log-%d-%s", n, filler))
	}
	logger.Flush()
	took := time.Since(began)

	// Inspect the size of the file on disk.
	stat, err := os.Stat(logger.Path())
	if err != nil {
		t.Fatal(err)
	}

	var (
		expectedTotalSize = int64(numLogs * messageSize) // theoretical minimum payload in bytes
		expectedMinSize   = expectedTotalSize / 10       // accept up to 90% loss
		actualSize        = stat.Size()
		sizeMB            = float64(actualSize) / 1024 / 1024
	)

	t.Logf("Memory/disk usage test:")
	t.Logf(" Logs written: %d", numLogs)
	t.Logf(" Message size: 500 bytes")
	t.Logf(" File size: %.2f MB", sizeMB)
	t.Logf(" Duration: %v", took)
	t.Logf(" Write speed: %.2f MB/s", sizeMB/took.Seconds())
	t.Logf(" Persistence ratio: %.1f%%", float64(actualSize)/float64(expectedTotalSize)*100)

	if actualSize < expectedMinSize {
		t.Errorf("file size too small: got %d bytes, expected at least %d", actualSize, expectedMinSize)
	}
}
// TestLoggerFlushTimeout checks that Flush returns within a bounded time
// after a small batch of messages has been queued.
func TestLoggerFlushTimeout(t *testing.T) {
	const (
		queued   = 100             // messages written before flushing
		maxFlush = 6 * time.Second // upper bound accepted for one Flush call
	)

	logger, err := NewLoggerWithSuffix("flush")
	if err != nil {
		t.Fatal(err)
	}
	defer logger.Close()

	// Queue a handful of messages so Flush has something to drain.
	for n := 0; n < queued; n++ {
		logger.Info(fmt.Sprintf("test-log-%d", n))
	}

	// Flush should complete in a reasonable amount of time.
	began := time.Now()
	logger.Flush()
	took := time.Since(began)

	t.Logf("Flush duration: %v", took)
	if took > maxFlush {
		t.Errorf("Flush took too long: %v (expected < 6s)", took)
	}
}
// TestLoggerOrderPreservation verifies that messages written by one goroutine
// appear in the log file in the order that goroutine wrote them, and that
// none of them is missing.
//
// Each of numGoroutines writers emits logsPerRoutine lines of the form
// "G<id>-SEQ<nnnn>". After flushing, the file is parsed back and every
// expected writer ID is checked for a complete, strictly increasing sequence.
func TestLoggerOrderPreservation(t *testing.T) {
	logger, err := NewLoggerWithSuffix("order")
	if err != nil {
		t.Fatal(err)
	}
	defer logger.Close()

	const (
		numGoroutines  = 10
		logsPerRoutine = 100
	)

	var wg sync.WaitGroup
	for i := 0; i < numGoroutines; i++ {
		wg.Add(1)
		go func(id int) {
			defer wg.Done()
			for j := 0; j < logsPerRoutine; j++ {
				logger.Info(fmt.Sprintf("G%d-SEQ%04d", id, j))
			}
		}(i)
	}
	wg.Wait()
	logger.Flush()

	// Read the file back and group the sequence numbers by writer ID.
	data, err := os.ReadFile(logger.Path())
	if err != nil {
		t.Fatal(err)
	}
	sequences := make(map[int][]int, numGoroutines) // goroutine ID -> sequence numbers in file order
	scanner := bufio.NewScanner(strings.NewReader(string(data)))
	for scanner.Scan() {
		line := scanner.Text()
		var gid, seq int
		// Expected format: G0-SEQ0001 (no "INFO:" prefix).
		if _, err := fmt.Sscanf(line, "G%d-SEQ%04d", &gid, &seq); err != nil {
			t.Errorf("invalid log format: %s (error: %v)", line, err)
			continue
		}
		if gid < 0 || gid >= numGoroutines {
			t.Errorf("unexpected goroutine ID %d in line: %s", gid, line)
			continue
		}
		sequences[gid] = append(sequences[gid], seq)
	}
	// A scan error would silently truncate the input, so surface it.
	if err := scanner.Err(); err != nil {
		t.Fatalf("failed to scan log file: %v", err)
	}

	// Walk the expected IDs rather than the parsed map so that a writer
	// whose lines are entirely absent is reported instead of skipped.
	for gid := 0; gid < numGoroutines; gid++ {
		seqs := sequences[gid]
		for i := 0; i+1 < len(seqs); i++ {
			if seqs[i] >= seqs[i+1] {
				t.Errorf("Goroutine %d: out of order at index %d: %d >= %d",
					gid, i, seqs[i], seqs[i+1])
			}
		}
		if len(seqs) != logsPerRoutine {
			t.Errorf("Goroutine %d: missing logs, got %d, want %d",
				gid, len(seqs), logsPerRoutine)
		}
	}

	// Only claim success when nothing above reported a failure.
	if !t.Failed() {
		t.Logf("Order preservation test: all %d goroutines maintained sequence order", numGoroutines)
	}
}
// TestConcurrentWorkerPoolLimit runs five tasks through
// executeConcurrentWithContext with a worker limit of 2 and checks that the
// stubbed task function never observes more than two concurrent invocations.
// It also checks that the logger captured the expected planning and start
// messages.
func TestConcurrentWorkerPoolLimit(t *testing.T) {
	orig := runCodexTaskFn
	defer func() { runCodexTaskFn = orig }()

	logger, err := NewLoggerWithSuffix("pool-limit")
	if err != nil {
		t.Fatal(err)
	}
	setLogger(logger)
	t.Cleanup(func() {
		// Best-effort teardown: a failure to close or remove the log file
		// must not mask the actual test outcome.
		_ = closeLogger()
		_ = logger.RemoveLogFile()
	})

	var (
		active  int64 // invocations currently inside the stub
		maxSeen int64 // peak value ever observed for active
	)

	runCodexTaskFn = func(task TaskSpec, timeout int) TaskResult {
		if task.Context == nil {
			// The stub is presumably invoked from worker goroutines, where
			// t.Fatalf is not allowed (FailNow must run on the test
			// goroutine). Record the failure and return an error result.
			t.Errorf("context not propagated for task %s", task.ID)
			return TaskResult{TaskID: task.ID, ExitCode: 1, Error: "context not propagated"}
		}

		cur := atomic.AddInt64(&active, 1)
		defer atomic.AddInt64(&active, -1)

		// Raise maxSeen to cur unless another invocation already recorded
		// an equal or higher peak; retry when the CAS loses a race.
		for {
			prev := atomic.LoadInt64(&maxSeen)
			if cur <= prev || atomic.CompareAndSwapInt64(&maxSeen, prev, cur) {
				break
			}
		}

		// Hold the slot briefly so concurrent invocations overlap.
		select {
		case <-task.Context.Done():
			return TaskResult{TaskID: task.ID, ExitCode: 130, Error: "context cancelled"}
		case <-time.After(30 * time.Millisecond):
			return TaskResult{TaskID: task.ID}
		}
	}

	layers := [][]TaskSpec{{{ID: "t1"}, {ID: "t2"}, {ID: "t3"}, {ID: "t4"}, {ID: "t5"}}}
	results := executeConcurrentWithContext(context.Background(), layers, 5, 2)

	if len(results) != 5 {
		t.Fatalf("unexpected result count: got %d", len(results))
	}
	// maxSeen is written atomically by the stub, so read it the same way.
	if peak := atomic.LoadInt64(&maxSeen); peak > 2 {
		t.Fatalf("worker pool exceeded limit: saw %d active workers", peak)
	}

	logger.Flush()
	data, err := os.ReadFile(logger.Path())
	if err != nil {
		t.Fatalf("failed to read log file: %v", err)
	}
	content := string(data)
	if !strings.Contains(content, "worker_limit=2") {
		t.Fatalf("concurrency planning log missing, content: %s", content)
	}
	if !strings.Contains(content, "parallel: start") {
		t.Fatalf("concurrency start logs missing, content: %s", content)
	}
}
// TestConcurrentCancellationPropagation cancels the parent context shortly
// after executeConcurrentWithContext starts and checks that at least one of
// the three stubbed tasks reports a non-zero exit code, i.e. that the
// cancellation reached the task contexts.
func TestConcurrentCancellationPropagation(t *testing.T) {
	orig := runCodexTaskFn
	defer func() { runCodexTaskFn = orig }()

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	runCodexTaskFn = func(task TaskSpec, timeout int) TaskResult {
		if task.Context == nil {
			// The stub is presumably invoked from worker goroutines, where
			// t.Fatalf is not allowed (FailNow must run on the test
			// goroutine). Record the failure and return an error result.
			t.Errorf("context not propagated for task %s", task.ID)
			return TaskResult{TaskID: task.ID, ExitCode: 1, Error: "context not propagated"}
		}
		// Each task takes 200ms unless its context is cancelled first.
		select {
		case <-task.Context.Done():
			return TaskResult{TaskID: task.ID, ExitCode: 130, Error: "context cancelled"}
		case <-time.After(200 * time.Millisecond):
			return TaskResult{TaskID: task.ID}
		}
	}

	layers := [][]TaskSpec{{{ID: "a"}, {ID: "b"}, {ID: "c"}}}

	// Cancel while the first tasks are still running. The timer is stopped
	// on return so nothing outlives the test.
	canceller := time.AfterFunc(50*time.Millisecond, cancel)
	defer canceller.Stop()

	results := executeConcurrentWithContext(ctx, layers, 1, 2)
	if len(results) != 3 {
		t.Fatalf("unexpected result count: got %d", len(results))
	}

	cancelled := 0
	for _, res := range results {
		if res.ExitCode != 0 {
			cancelled++
		}
	}
	if cancelled == 0 {
		t.Fatalf("expected cancellation to propagate, got results: %+v", results)
	}
}