From 19ac6a0285dfbde9a7befdb965c19bd4609ecd59 Mon Sep 17 00:00:00 2001 From: l1b0k Date: Fri, 30 Aug 2024 12:24:25 +0800 Subject: [PATCH 1/3] daemon: add a wait check for mac in ecs metadata metadata sync is slow, have to wait it found Signed-off-by: l1b0k --- pkg/factory/aliyun/aliyun.go | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/pkg/factory/aliyun/aliyun.go b/pkg/factory/aliyun/aliyun.go index 06acf3ed..c1cd0cd1 100644 --- a/pkg/factory/aliyun/aliyun.go +++ b/pkg/factory/aliyun/aliyun.go @@ -191,6 +191,19 @@ func (a *Aliyun) CreateNetworkInterface(ipv4, ipv6 int, eniType string) (*daemon return r, nil, nil, err } + // wait mac + err = wait.PollUntilContextTimeout(ctx, metadataPollInterval, metadataWaitTimeout, true, func(ctx context.Context) (bool, error) { + macs, err := metadata.GetENIsMAC() + if err != nil { + klog.Errorf("metadata: error get mac: %v", err) + return false, nil + } + return sets.NewString(macs...).Has(r.MAC), nil + }) + if err != nil { + return r, nil, nil, err + } + prefix, err := metadata.GetVSwitchCIDR(eni.MacAddress) if err != nil { return r, nil, nil, err From b8b91be489c1d9b61b7890b5a0fd9ec0bc320b52 Mon Sep 17 00:00:00 2001 From: l1b0k Date: Thu, 29 Aug 2024 22:29:27 +0800 Subject: [PATCH 2/3] daemon add the orphan ip warning default 0s Warning ResourceInvalid node/cn-hangzhou.172.16.1.196 orphan ip found on ecs metadata, ip: 172.16.64.4, restart terway to resync data Signed-off-by: l1b0k --- pkg/eni/local.go | 38 +++++++++++++++++++++++++++++++++++++- pkg/eni/local_test.go | 29 +++++++++++++++++++++++++++++ pkg/eni/types.go | 2 +- 3 files changed, 67 insertions(+), 2 deletions(-) diff --git a/pkg/eni/local.go b/pkg/eni/local.go index 052d8cec..75bf724a 100644 --- a/pkg/eni/local.go +++ b/pkg/eni/local.go @@ -13,6 +13,7 @@ import ( "golang.org/x/time/rate" corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/util/cache" "k8s.io/apimachinery/pkg/util/sets" "k8s.io/apimachinery/pkg/util/wait" logf "sigs.k8s.io/controller-runtime/pkg/log" @@ -27,6 +28,8 @@ import ( "github.com/AliyunContainerService/terway/pkg/metric" ) +const defaultSyncPeriod = 1 * time.Minute + var _ NetworkInterface = &Local{} var _ Usage = &Local{} var _ ReportStatus = &Trunk{} @@ -177,7 +180,7 @@ func (l *Local) Run(ctx context.Context, podResources []daemon.PodResources, wg go l.notify(ctx) - go wait.JitterUntil(l.sync, 1*time.Minute, 1.0, true, ctx.Done()) + go wait.JitterUntil(l.sync, defaultSyncPeriod, 1.0, true, ctx.Done()) return nil } @@ -370,6 +373,7 @@ func (l *Local) sync() { syncIPLocked(l.ipv4, ipv4) syncIPLocked(l.ipv6, ipv6) + report() l.cond.Broadcast() } @@ -1037,8 +1041,40 @@ func syncIPLocked(lo Set, remote []netip.Addr) { } } } + orphanIP(lo, s) +} + +func orphanIP(lo Set, remote sets.Set[netip.Addr]) { + for key := range remote { + if _, ok := lo[key]; !ok { + + prev, ok := invalidIPCache.Get(key) + if !ok { + invalidIPCache.Add(key, 1, 5*defaultSyncPeriod) + } else { + invalidIPCache.Add(key, prev.(int)+1, 5*defaultSyncPeriod) + } + } else { + invalidIPCache.Remove(key) + } + } +} + +func report() { + for _, key := range invalidIPCache.Keys() { + count, ok := invalidIPCache.Get(key) + if !ok { + continue + } + if count.(int) > 1 { + _ = tracing.RecordNodeEvent(corev1.EventTypeWarning, string(types.ErrResourceInvalid), fmt.Sprintf("orphan ip found on ecs metadata, ip: %s", key)) + logf.Log.Info("orphan ip found on ecs metadata", "ip", key) + } + } } +var invalidIPCache = cache.NewLRUExpireCache(100) + func parseResourceID(id string) (string, string, error) { parts := strings.SplitN(id, ".", 2) if len(parts) < 2 { diff --git a/pkg/eni/local_test.go b/pkg/eni/local_test.go index e3a47202..c0d52a3d 100644 --- a/pkg/eni/local_test.go +++ b/pkg/eni/local_test.go @@ -10,6 +10,8 @@ import ( "github.com/stretchr/testify/assert" "golang.org/x/time/rate" + "k8s.io/apimachinery/pkg/util/cache" + "k8s.io/apimachinery/pkg/util/sets" "github.com/AliyunContainerService/terway/pkg/factory" "github.com/AliyunContainerService/terway/types" @@ -309,3 +311,30 @@ func Test_parseResourceID(t *testing.T) { }) } } + +func Test_orphanIP(t *testing.T) { + invalidIPCache = cache.NewLRUExpireCache(100) + + lo1 := map[netip.Addr]*IP{ + netip.MustParseAddr("127.0.0.1"): { + ip: netip.MustParseAddr("127.0.0.1"), + }, + } + + remote1 := sets.Set[netip.Addr]{ + netip.MustParseAddr("127.0.0.1"): {}, + netip.MustParseAddr("127.0.0.2"): {}, + } + + orphanIP(lo1, remote1) + + v, _ := invalidIPCache.Get(netip.MustParseAddr("127.0.0.1")) + assert.Equal(t, nil, v) + + v, _ = invalidIPCache.Get(netip.MustParseAddr("127.0.0.2")) + assert.Equal(t, 1, v) + + orphanIP(lo1, remote1) + v, _ = invalidIPCache.Get(netip.MustParseAddr("127.0.0.2")) + assert.Equal(t, 2, v) +} diff --git a/pkg/eni/types.go b/pkg/eni/types.go index 82c762a7..c9142c63 100644 --- a/pkg/eni/types.go +++ b/pkg/eni/types.go @@ -94,7 +94,7 @@ func (ip *IP) Allocatable() bool { return ip.Valid() && !ip.InUse() } -type Set map[any]*IP +type Set map[netip.Addr]*IP func (s Set) Idles() []*IP { var result []*IP From 00abe40ede0ea365af07a8d5cc367da4d25976ba Mon Sep 17 00:00:00 2001 From: l1b0k Date: Thu, 12 Sep 2024 17:16:30 +0800 Subject: [PATCH 3/3] cni: retry get mac on new eni Signed-off-by: l1b0k --- plugin/terway/cni.go | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/plugin/terway/cni.go b/plugin/terway/cni.go index ff9edd10..8433a8b3 100644 --- a/plugin/terway/cni.go +++ b/plugin/terway/cni.go @@ -2,6 +2,7 @@ package main import ( "context" + "errors" "fmt" "net" "runtime" @@ -9,6 +10,8 @@ import ( "google.golang.org/grpc/backoff" "google.golang.org/grpc/credentials/insecure" + "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/client-go/util/retry" "github.com/AliyunContainerService/terway/pkg/link" "github.com/AliyunContainerService/terway/plugin/datapath" @@ -298,7 +301,17 @@ func parseSetupConf(args *skel.CmdArgs, alloc *rpc.NetConf, conf *types.CNIConf, if alloc.GetENIInfo() != nil { mac := alloc.GetENIInfo().GetMAC() if mac != "" { - deviceID, err = link.GetDeviceNumber(mac) + err = retry.OnError(wait.Backoff{ + Steps: 10, + Duration: 1 * time.Second, + Factor: 1.0, + Jitter: 0, + }, func(err error) bool { + return errors.Is(err, link.ErrNotFound) + }, func() error { + deviceID, err = link.GetDeviceNumber(mac) + return err + }) if err != nil { return nil, err }