Skip to content

Commit 4e02c1b

Browse files
Prashanth DuddebandaPrashanth Duddebanda
authored andcommitted
Add protection against transient Fleet API issues
During Redis restarts or maintenance, Fleet API occasionally returns incomplete responses, causing ArgoCD to delete applications. Solution adds three-layer protection: - Detection: Identifies transient issues by pattern analysis - Retry: 3 attempts with exponential backoff - Cache: Falls back to last known good response Changes: - Add protection/ package for detection and caching - Update fleetclient.go with retry logic - Add configuration via environment variables - Add comprehensive test coverage (29 tests passing) Fixes application deletions during Redis HA failover
1 parent 462ddd8 commit 4e02c1b

10 files changed

Lines changed: 1215 additions & 32 deletions

File tree

README-PROTECTION.md

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
# Fleet API Protection
2+
3+
Adds protection against transient Fleet API issues that cause application deletions.
4+
5+
## Problem
6+
7+
During Redis restarts or maintenance, Fleet API occasionally returns incomplete responses.
8+
The plugin didn't validate or retry, causing ArgoCD to delete applications.
9+
10+
**Incident 2026-01-30:**
11+
- API returned 6 bindings instead of 12
12+
- 3 applications deleted
13+
- 2-12 minutes downtime each
14+
15+
## Solution
16+
17+
Three-layer protection:
18+
19+
1. **Detection** - Identifies transient issues by pattern:
20+
- Oscillation: count changes 2+ times in 10 minutes (12→6→12)
21+
- Sudden drop: >30% decrease
22+
23+
2. **Retry** - 3 attempts with exponential backoff (2s, 4s, 8s)
24+
25+
3. **Cache** - Falls back to last known good response (60 min TTL)
26+
27+
## Configuration
28+
29+
Environment variables with defaults:
30+
31+
```bash
32+
MAX_API_RETRIES=3
33+
RETRY_BASE_DELAY_SECONDS=2
34+
CACHE_MAX_AGE_MINUTES=60
35+
DETECTION_WINDOW_MINUTES=10
36+
OSCILLATION_THRESHOLD=2
37+
DROP_THRESHOLD_PERCENT=30
38+
```
39+
40+
## Testing
41+
42+
```bash
43+
cd fleet-argocd-plugin
44+
go test ./protection/...
45+
go test ./fleetclient/...
46+
```
47+
48+
All tests pass. Incident replay confirms 2026-01-30 pattern is detected.
49+
50+
## Changes
51+
52+
**Modified files (2):**
53+
- `fleetclient/fleetclient.go` - Added protection wrapper
54+
- `main.go` - Added config loading
55+
56+
**New files (4):**
57+
- `protection/detector.go` - Transient issue detection
58+
- `protection/cache.go` - Response caching
59+
- `protection/*_test.go` - Test coverage
60+
61+
Zero breaking changes. Protection is transparent in normal operation.

fleet-argocd-plugin/.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
fleet-sync

fleet-argocd-plugin/fleetclient/fleetclient.go

Lines changed: 94 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,13 @@ import (
1616
"bytes"
1717
"context"
1818
"fmt"
19+
"log"
20+
"math"
1921
"strings"
2022
"text/template"
2123
"time"
2224

25+
"fleet-management-tools/argocd-sync/protection"
2326
fleet "google.golang.org/api/gkehub/v1"
2427
corev1 "k8s.io/api/core/v1"
2528
"k8s.io/apimachinery/pkg/api/errors"
@@ -65,6 +68,16 @@ stringData:
6568
`
6669
)
6770

71+
// ProtectionConfig holds configuration for transient issue protection
72+
type ProtectionConfig struct {
73+
MaxRetries int
74+
RetryBaseDelay time.Duration
75+
CacheMaxAge time.Duration
76+
DetectionWindow time.Duration
77+
OscillationThreshold int
78+
DropThreshold float64
79+
}
80+
6881
// FleetSync is a client that periodically polls the GKE Fleet API and caches fleet information.
6982
type FleetSync struct {
7083
svc *fleet.Service
@@ -74,17 +87,38 @@ type FleetSync struct {
7487
MembershipTenancyMapCache map[string][]string
7588
// A cached map from Scope IDs to a list of Membership full resource names.
7689
ScopeTenancyMapCache map[string][]string
90+
91+
// Protection logic
92+
cache *protection.Cache
93+
detector *protection.Detector
94+
config *ProtectionConfig
7795
}
7896

79-
// NewFleetSync creates a new FleetSync and starts its periodical reconciliation.
80-
func NewFleetSync(ctx context.Context, projectNum string) (*FleetSync, error) {
97+
// NewFleetSync creates a new FleetSync with protection logic and starts its periodical reconciliation.
98+
func NewFleetSync(ctx context.Context, projectNum string, config *ProtectionConfig) (*FleetSync, error) {
8199
service, err := fleet.NewService(ctx)
82100
if err != nil {
83101
return nil, err
84102
}
103+
104+
// Default configuration if not provided
105+
if config == nil {
106+
config = &ProtectionConfig{
107+
MaxRetries: 3,
108+
RetryBaseDelay: 2 * time.Second,
109+
CacheMaxAge: 60 * time.Minute,
110+
DetectionWindow: 10 * time.Minute,
111+
OscillationThreshold: 2,
112+
DropThreshold: 0.3,
113+
}
114+
}
115+
85116
c := &FleetSync{
86117
svc: service,
87118
ProjectNum: projectNum,
119+
cache: protection.NewCache(config.CacheMaxAge),
120+
detector: protection.NewDetector(config.DetectionWindow, config.OscillationThreshold, config.DropThreshold),
121+
config: config,
88122
}
89123

90124
// Build the initial fleet topology before handling RPCs.
@@ -371,7 +405,65 @@ func (c *FleetSync) listScopes(ctx context.Context, project string) ([]*fleet.Sc
371405
}
372406

373407
// listMembershipBindings fetches the membership bindings under a given parent.
408+
// listMembershipBindings fetches the membership bindings with transient issue protection.
374409
func (c *FleetSync) listMembershipBindings(ctx context.Context, project string) ([]*fleet.MembershipBinding, error) {
410+
for attempt := 0; attempt < c.config.MaxRetries; attempt++ {
411+
// Call Fleet API
412+
bindings, err := c.listMembershipBindingsInternal(ctx, project)
413+
if err != nil {
414+
if attempt < c.config.MaxRetries-1 {
415+
delay := c.config.RetryBaseDelay * time.Duration(math.Pow(2, float64(attempt)))
416+
log.Printf("Fleet API error (attempt %d/%d), retrying in %v: %v",
417+
attempt+1, c.config.MaxRetries, delay, err)
418+
time.Sleep(delay)
419+
continue
420+
}
421+
422+
// All retries failed, try cache
423+
if cached, ok := c.cache.Get(); ok {
424+
log.Printf("Fleet API failed after %d attempts, using cached response (age: %v)",
425+
c.config.MaxRetries, c.cache.Age())
426+
return cached, nil
427+
}
428+
429+
return nil, fmt.Errorf("fleet API failed after %d attempts and no valid cache: %w",
430+
c.config.MaxRetries, err)
431+
}
432+
433+
// Check for transient issue
434+
itemCount := len(bindings)
435+
if isTransient, reason := c.detector.IsTransientIssue(itemCount); isTransient {
436+
log.Printf("Transient issue detected: %s", reason)
437+
438+
if attempt < c.config.MaxRetries-1 {
439+
delay := c.config.RetryBaseDelay * time.Duration(math.Pow(2, float64(attempt)))
440+
log.Printf("Retrying Fleet API (attempt %d/%d) in %v", attempt+1, c.config.MaxRetries, delay)
441+
time.Sleep(delay)
442+
continue
443+
}
444+
445+
// Retries exhausted, use cache
446+
if cached, ok := c.cache.Get(); ok {
447+
log.Printf("Transient issue persists after %d attempts, using cached response (age: %v)",
448+
c.config.MaxRetries, c.cache.Age())
449+
return cached, nil
450+
}
451+
452+
log.Printf("WARNING: Transient issue detected but no valid cache available")
453+
// You could return error here to block reconciliation entirely
454+
}
455+
456+
// Response looks good, cache it
457+
c.cache.Set(bindings)
458+
log.Printf("Fleet API success: %d membership bindings", itemCount)
459+
return bindings, nil
460+
}
461+
462+
return nil, fmt.Errorf("unexpected retry loop exit")
463+
}
464+
465+
// listMembershipBindingsInternal is the original implementation (renamed from listMembershipBindings)
466+
func (c *FleetSync) listMembershipBindingsInternal(ctx context.Context, project string) ([]*fleet.MembershipBinding, error) {
375467
var ret []*fleet.MembershipBinding
376468
parent := fmt.Sprintf("projects/%s/locations/-/memberships/-", project)
377469
call := c.svc.Projects.Locations.Memberships.Bindings.List(parent)

0 commit comments

Comments
 (0)