feat(auth): Optimize retry logic and add circuit breaker for team processing

- Reduce authorization retry attempts from 3 to 2 and optimize retry delays from exponential (3s, 5s, 8s, 12s) to fixed 3s backoff
- Implement team-level circuit breaker: skip member processing when 3+ consecutive 500 errors detected in same team
- Add consecutive 500 error tracking with atomic counter and reset on successful authorization
- Reduce ObtainAuthorizationCode timeout from 3 minutes to 90 seconds with updated error messages
- Optimize Codex API workspace selection retry: reduce max attempts from 5 to 3 with shorter backoff (base delays of 4s, 6s instead of 5s, 7s, 9s, 11s; jitter reduced from 0-3s to 0-2s)
- Reduce S2A OAuth submission retry delays from (4s, 6s) to (2s, 3s) for faster failure detection
- Optimize member stagger timing: reduce from 3s + idx*2s to 1s + idx*1s with reduced jitter (0-1s instead of 0-2s)
- Add early exit in CompleteWithCodexAPI when the inner retries are exhausted or the authorization times out, to prevent unnecessary outer retry attempts
- These changes improve responsiveness and reduce cascading failures during bulk team processing
This commit is contained in:
2026-02-07 23:19:06 +08:00
parent d6cd7660eb
commit 847574e89e
3 changed files with 36 additions and 14 deletions

View File

@@ -296,7 +296,7 @@ func (c *CodexAPIAuth) GetSessionID() string {
return c.sessionID
}
// ObtainAuthorizationCode 获取授权码(全局 3 分钟超时)
// ObtainAuthorizationCode 获取授权码(全局 90 秒超时)
func (c *CodexAPIAuth) ObtainAuthorizationCode() (string, error) {
type authResult struct {
code string
@@ -310,8 +310,8 @@ func (c *CodexAPIAuth) ObtainAuthorizationCode() (string, error) {
select {
case r := <-resultCh:
return r.code, r.err
case <-time.After(3 * time.Minute):
return "", fmt.Errorf("授权超时 (3分钟)")
case <-time.After(90 * time.Second):
return "", fmt.Errorf("授权超时 (90秒)")
}
}
@@ -519,13 +519,13 @@ func (c *CodexAPIAuth) obtainAuthorizationCodeInternal() (string, error) {
"workspace_id": c.workspaceID,
}
// 添加 500 错误重试机制 - 最多重试 5 次,指数退避 + 随机抖动
// 添加 500 错误重试机制 - 最多重试 3 次,退避 + 随机抖动
var lastErr error
for retry := 0; retry < 5; retry++ {
for retry := 0; retry < 3; retry++ {
if retry > 0 {
// 指数退避: 3s, 5s, 8s, 12s 基础延迟 + 0~3s 随机抖动
baseDelay := time.Duration(3+retry*2) * time.Second
jitter := time.Duration(rand.Intn(3000)) * time.Millisecond
// 退避: 4s, 6s 基础延迟 + 0~2s 随机抖动
baseDelay := time.Duration(2+retry*2) * time.Second
jitter := time.Duration(rand.Intn(2000)) * time.Millisecond
delay := baseDelay + jitter
c.logStep(StepSelectWorkspace, "第 %d 次重试选择工作区 (等待 %.1fs)...", retry+1, delay.Seconds())
time.Sleep(delay)
@@ -705,6 +705,10 @@ func CompleteWithCodexAPI(email, password, workspaceID, authURL, sessionID, prox
code, err := auth.ObtainAuthorizationCode()
if err != nil {
auth.tlsClient.Close()
// 如果内层重试已耗尽(持续 500)或授权超时,直接返回,不再外层重试
if strings.Contains(err.Error(), "重试已耗尽") || strings.Contains(err.Error(), "授权超时") {
return "", err
}
// 检查是否为 403 错误
if strings.Contains(err.Error(), "403") {
lastErr = err

View File

@@ -133,7 +133,7 @@ func SubmitS2AOAuth(s2aAPIBase, s2aAdminKey, sessionID, code, name string, concu
var lastErr error
for attempt := 0; attempt < 3; attempt++ {
if attempt > 0 {
time.Sleep(time.Duration(2+attempt*2) * time.Second) // 4s, 6s
time.Sleep(time.Duration(1+attempt) * time.Second) // 2s, 3s
}
req, _ := http.NewRequest("POST", apiURL, bytes.NewReader(body))