feat(auth): Optimize retry logic and add circuit breaker for team processing

- Reduce authorization retry attempts from 3 to 2 and replace the linearly increasing retry delay (3s + attempt*2s, i.e. 5s, 7s) with a fixed 3s delay
- Implement a team-level circuit breaker: skip member processing once 3 or more consecutive 500 errors have been detected in the same team
- Add consecutive 500 error tracking with an atomic counter (incremented when the authorization error text contains "500" or "重试已耗尽") that is reset on successful authorization
- Reduce ObtainAuthorizationCode timeout from 3 minutes to 90 seconds with updated error messages
- Optimize Codex API workspace selection retry: reduce max attempts from 5 to 3 with shorter backoff (2s, 4s instead of 3s, 5s, 8s, 12s)
- Reduce S2A OAuth submission retry delays from (4s, 6s) to (2s, 3s) for faster failure detection
- Optimize member stagger timing: reduce from 3s + idx*2s to 1s + idx*1s with reduced jitter (0-1s instead of 0-2s)
- Add early exit for exhausted retries in CompleteWithCodexAPI to prevent unnecessary outer retry attempts
- These changes improve responsiveness and reduce cascading failures during bulk team processing
This commit is contained in:
2026-02-07 23:19:06 +08:00
parent d6cd7660eb
commit 847574e89e
3 changed files with 36 additions and 14 deletions

View File

@@ -730,6 +730,9 @@ func processSingleTeam(idx int, req TeamProcessRequest) (result TeamProcessResul
var s2aSuccessCount int32
var s2aFailCount int32
// Team 级别 500 错误熔断器:当同 Team 多个成员连续遇到 500 时快速失败
var consecutive500Fails int32
// 入库并发控制信号量
s2aSem := make(chan struct{}, req.ConcurrentS2A)
@@ -779,14 +782,23 @@ func processSingleTeam(idx int, req TeamProcessRequest) (result TeamProcessResul
var s2aSuccess bool
var lastError string
for attempt := 0; attempt < 3; attempt++ { // 最多重试2
for attempt := 0; attempt < 2; attempt++ { // 最多重试1
// 检查停止信号
if isStopped() {
return false
}
// 熔断检查:同 Team 已有 3+ 个成员连续 500 失败,快速跳过
if atomic.LoadInt32(&consecutive500Fails) >= 3 {
logger.Warning(fmt.Sprintf("%s 同 Team 已有多个成员连续 500 失败,跳过入库", memberLogPrefix), memberEmail, "team")
atomic.AddInt32(&s2aFailCount, 1)
memberMu.Lock()
result.Errors = append(result.Errors, fmt.Sprintf("成员 %d 入库跳过: 服务器持续 500 (熔断)", memberIdx+1))
memberMu.Unlock()
return false
}
if attempt > 0 {
// 重试前等待一段时间,避免密集请求(可被停止信号中断)
retryDelay := time.Duration(3+attempt*2) * time.Second
// 重试前短暂等待(可被停止信号中断)
retryDelay := time.Duration(3) * time.Second
logger.Warning(fmt.Sprintf("%s 入库重试 (第%d次, 等待 %ds)", memberLogPrefix, attempt+1, int(retryDelay.Seconds())), memberEmail, "team")
select {
case <-time.After(retryDelay):
@@ -847,8 +859,14 @@ func processSingleTeam(idx int, req TeamProcessRequest) (result TeamProcessResul
if err != nil {
lastError = fmt.Sprintf("浏览器授权失败: %v", err)
logger.Error(fmt.Sprintf("%s %s", memberLogPrefix, lastError), memberEmail, "team")
// 跟踪 500 错误用于熔断
if strings.Contains(err.Error(), "500") || strings.Contains(err.Error(), "重试已耗尽") {
atomic.AddInt32(&consecutive500Fails, 1)
}
continue
}
// 授权成功,重置 500 计数
atomic.StoreInt32(&consecutive500Fails, 0)
// 提交到 S2A
_, err = auth.SubmitS2AOAuth(
@@ -964,8 +982,8 @@ func processSingleTeam(idx int, req TeamProcessRequest) (result TeamProcessResul
s2aWg.Add(1)
go func(idx int, e, p string) {
defer s2aWg.Done()
// 基础 3 秒 + 成员索引 * 2 秒错开 + 0~2 秒随机抖动,避免同 Team 多成员同时选工作区
stagger := 3*time.Second + time.Duration(idx*2)*time.Second + time.Duration(rand.Intn(2000))*time.Millisecond
// 基础 1 秒 + 成员索引 * 1 秒错开 + 0~1 秒随机抖动
stagger := 1*time.Second + time.Duration(idx)*time.Second + time.Duration(rand.Intn(1000))*time.Millisecond
select {
case <-time.After(stagger):
case <-teamProcessState.stopCh: