@@ -16,10 +16,13 @@ import (
1616 "bytes"
1717 "context"
1818 "fmt"
19+ "log"
20+ "math"
1921 "strings"
2022 "text/template"
2123 "time"
2224
25+ "fleet-management-tools/argocd-sync/protection"
2326 fleet "google.golang.org/api/gkehub/v1"
2427 corev1 "k8s.io/api/core/v1"
2528 "k8s.io/apimachinery/pkg/api/errors"
@@ -65,6 +68,16 @@ stringData:
6568`
6669)
6770
71+ // ProtectionConfig holds configuration for transient issue protection
72+ type ProtectionConfig struct {
73+ MaxRetries int
74+ RetryBaseDelay time.Duration
75+ CacheMaxAge time.Duration
76+ DetectionWindow time.Duration
77+ OscillationThreshold int
78+ DropThreshold float64
79+ }
80+
6881// FleetSync is a client that periodically polls the GKE Fleet API and caches fleet information.
6982type FleetSync struct {
7083 svc * fleet.Service
@@ -74,17 +87,38 @@ type FleetSync struct {
7487 MembershipTenancyMapCache map [string ][]string
7588 // A cached map from Scope IDs to a list of Membership full resource names.
7689 ScopeTenancyMapCache map [string ][]string
90+
91+ // Protection logic
92+ cache * protection.Cache
93+ detector * protection.Detector
94+ config * ProtectionConfig
7795}
7896
79- // NewFleetSync creates a new FleetSync and starts its periodical reconciliation.
80- func NewFleetSync (ctx context.Context , projectNum string ) (* FleetSync , error ) {
97+ // NewFleetSync creates a new FleetSync with protection logic and starts its periodical reconciliation.
98+ func NewFleetSync (ctx context.Context , projectNum string , config * ProtectionConfig ) (* FleetSync , error ) {
8199 service , err := fleet .NewService (ctx )
82100 if err != nil {
83101 return nil , err
84102 }
103+
104+ // Default configuration if not provided
105+ if config == nil {
106+ config = & ProtectionConfig {
107+ MaxRetries : 3 ,
108+ RetryBaseDelay : 2 * time .Second ,
109+ CacheMaxAge : 60 * time .Minute ,
110+ DetectionWindow : 10 * time .Minute ,
111+ OscillationThreshold : 2 ,
112+ DropThreshold : 0.3 ,
113+ }
114+ }
115+
85116 c := & FleetSync {
86117 svc : service ,
87118 ProjectNum : projectNum ,
119+ cache : protection .NewCache (config .CacheMaxAge ),
120+ detector : protection .NewDetector (config .DetectionWindow , config .OscillationThreshold , config .DropThreshold ),
121+ config : config ,
88122 }
89123
90124 // Build the initial fleet topology before handling RPCs.
@@ -371,7 +405,65 @@ func (c *FleetSync) listScopes(ctx context.Context, project string) ([]*fleet.Sc
371405}
372406
373407// listMembershipBindings fetches the membership bindings under a given parent.
408+ // listMembershipBindings fetches the membership bindings with transient issue protection.
374409func (c * FleetSync ) listMembershipBindings (ctx context.Context , project string ) ([]* fleet.MembershipBinding , error ) {
410+ for attempt := 0 ; attempt < c .config .MaxRetries ; attempt ++ {
411+ // Call Fleet API
412+ bindings , err := c .listMembershipBindingsInternal (ctx , project )
413+ if err != nil {
414+ if attempt < c .config .MaxRetries - 1 {
415+ delay := c .config .RetryBaseDelay * time .Duration (math .Pow (2 , float64 (attempt )))
416+ log .Printf ("Fleet API error (attempt %d/%d), retrying in %v: %v" ,
417+ attempt + 1 , c .config .MaxRetries , delay , err )
418+ time .Sleep (delay )
419+ continue
420+ }
421+
422+ // All retries failed, try cache
423+ if cached , ok := c .cache .Get (); ok {
424+ log .Printf ("Fleet API failed after %d attempts, using cached response (age: %v)" ,
425+ c .config .MaxRetries , c .cache .Age ())
426+ return cached , nil
427+ }
428+
429+ return nil , fmt .Errorf ("fleet API failed after %d attempts and no valid cache: %w" ,
430+ c .config .MaxRetries , err )
431+ }
432+
433+ // Check for transient issue
434+ itemCount := len (bindings )
435+ if isTransient , reason := c .detector .IsTransientIssue (itemCount ); isTransient {
436+ log .Printf ("Transient issue detected: %s" , reason )
437+
438+ if attempt < c .config .MaxRetries - 1 {
439+ delay := c .config .RetryBaseDelay * time .Duration (math .Pow (2 , float64 (attempt )))
440+ log .Printf ("Retrying Fleet API (attempt %d/%d) in %v" , attempt + 1 , c .config .MaxRetries , delay )
441+ time .Sleep (delay )
442+ continue
443+ }
444+
445+ // Retries exhausted, use cache
446+ if cached , ok := c .cache .Get (); ok {
447+ log .Printf ("Transient issue persists after %d attempts, using cached response (age: %v)" ,
448+ c .config .MaxRetries , c .cache .Age ())
449+ return cached , nil
450+ }
451+
452+ log .Printf ("WARNING: Transient issue detected but no valid cache available" )
453+ // You could return error here to block reconciliation entirely
454+ }
455+
456+ // Response looks good, cache it
457+ c .cache .Set (bindings )
458+ log .Printf ("Fleet API success: %d membership bindings" , itemCount )
459+ return bindings , nil
460+ }
461+
462+ return nil , fmt .Errorf ("unexpected retry loop exit" )
463+ }
464+
465+ // listMembershipBindingsInternal is the original implementation (renamed from listMembershipBindings)
466+ func (c * FleetSync ) listMembershipBindingsInternal (ctx context.Context , project string ) ([]* fleet.MembershipBinding , error ) {
375467 var ret []* fleet.MembershipBinding
376468 parent := fmt .Sprintf ("projects/%s/locations/-/memberships/-" , project )
377469 call := c .svc .Projects .Locations .Memberships .Bindings .List (parent )
0 commit comments