// +build !providerless /* Copyright 2017 The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ package gce import ( "context" "fmt" "net/http" "strconv" "strings" "github.com/GoogleCloudPlatform/k8s-cloud-provider/pkg/cloud" "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/types" utilerrors "k8s.io/apimachinery/pkg/util/errors" "k8s.io/apimachinery/pkg/util/sets" servicehelpers "k8s.io/cloud-provider/service/helpers" utilnet "k8s.io/utils/net" compute "google.golang.org/api/compute/v1" "k8s.io/klog/v2" ) const ( errStrLbNoHosts = "cannot EnsureLoadBalancer() with no hosts" ) // ensureExternalLoadBalancer is the external implementation of LoadBalancer.EnsureLoadBalancer. // Our load balancers in GCE consist of four separate GCE resources - a static // IP address, a firewall rule, a target pool, and a forwarding rule. This // function has to manage all of them. // // Due to an interesting series of design decisions, this handles both creating // new load balancers and updating existing load balancers, recognizing when // each is needed. func (g *Cloud) ensureExternalLoadBalancer(clusterName string, clusterID string, apiService *v1.Service, existingFwdRule *compute.ForwardingRule, nodes []*v1.Node) (*v1.LoadBalancerStatus, error) { if len(nodes) == 0 { return nil, fmt.Errorf(errStrLbNoHosts) } hostNames := nodeNames(nodes) supportsNodesHealthCheck := supportsNodesHealthCheck(nodes) hosts, err := g.getInstancesByNames(hostNames) if err != nil { return nil, err } loadBalancerName := g.GetLoadBalancerName(context.TODO(), clusterName, apiService) requestedIP := apiService.Spec.LoadBalancerIP ports := apiService.Spec.Ports portStr := []string{} for _, p := range apiService.Spec.Ports { portStr = append(portStr, fmt.Sprintf("%s/%d", p.Protocol, p.Port)) } serviceName := types.NamespacedName{Namespace: apiService.Namespace, Name: apiService.Name} lbRefStr := fmt.Sprintf("%v(%v)", loadBalancerName, serviceName) klog.V(2).Infof("ensureExternalLoadBalancer(%s, %v, %v, %v, %v, %v)", lbRefStr, g.region, requestedIP, portStr, hostNames, apiService.Annotations) // Check the current and the desired network tiers. If they do not match, // tear down the existing resources with the wrong tier. netTier, err := g.getServiceNetworkTier(apiService) if err != nil { klog.Errorf("ensureExternalLoadBalancer(%s): Failed to get the desired network tier: %v.", lbRefStr, err) return nil, err } klog.V(4).Infof("ensureExternalLoadBalancer(%s): Desired network tier %q.", lbRefStr, netTier) // TODO: distinguish between unspecified and specified network tiers annotation properly in forwardingrule creation // Only delete ForwardingRule when network tier annotation is specified, otherwise leave it only to avoid wrongful // deletion against user intention when network tier annotation is not specified. if _, ok := apiService.Annotations[NetworkTierAnnotationKey]; ok { g.deleteWrongNetworkTieredResources(loadBalancerName, lbRefStr, netTier) } // Check if the forwarding rule exists, and if so, what its IP is. fwdRuleExists, fwdRuleNeedsUpdate, fwdRuleIP, err := g.forwardingRuleNeedsUpdate(loadBalancerName, g.region, requestedIP, ports) if err != nil { return nil, err } if !fwdRuleExists { klog.V(2).Infof("ensureExternalLoadBalancer(%s): Forwarding rule %v doesn't exist.", lbRefStr, loadBalancerName) } // Make sure we know which IP address will be used and have properly reserved // it as static before moving forward with the rest of our operations. // // We use static IP addresses when updating a load balancer to ensure that we // can replace the load balancer's other components without changing the // address its service is reachable on. We do it this way rather than always // keeping the static IP around even though this is more complicated because // it makes it less likely that we'll run into quota issues. Only 7 static // IP addresses are allowed per region by default. // // We could let an IP be allocated for us when the forwarding rule is created, // but we need the IP to set up the firewall rule, and we want to keep the // forwarding rule creation as the last thing that needs to be done in this // function in order to maintain the invariant that "if the forwarding rule // exists, the LB has been fully created". ipAddressToUse := "" // Through this process we try to keep track of whether it is safe to // release the IP that was allocated. If the user specifically asked for // an IP, we assume they are managing it themselves. Otherwise, we will // release the IP in case of early-terminating failure or upon successful // creating of the LB. // TODO(#36535): boil this logic down into a set of component functions // and key the flag values off of errors returned. isUserOwnedIP := false // if this is set, we never release the IP isSafeToReleaseIP := false defer func() { if isUserOwnedIP { return } if isSafeToReleaseIP { if err := g.DeleteRegionAddress(loadBalancerName, g.region); err != nil && !isNotFound(err) { klog.Errorf("ensureExternalLoadBalancer(%s): Failed to release static IP %s in region %v: %v.", lbRefStr, ipAddressToUse, g.region, err) } else if isNotFound(err) { klog.V(2).Infof("ensureExternalLoadBalancer(%s): IP address %s is not reserved.", lbRefStr, ipAddressToUse) } else { klog.Infof("ensureExternalLoadBalancer(%s): Released static IP %s.", lbRefStr, ipAddressToUse) } } else { klog.Warningf("ensureExternalLoadBalancer(%s): Orphaning static IP %s in region %v: %v.", lbRefStr, ipAddressToUse, g.region, err) } }() if requestedIP != "" { // If user requests a specific IP address, verify first. No mutation to // the GCE resources will be performed in the verification process. isUserOwnedIP, err = verifyUserRequestedIP(g, g.region, requestedIP, fwdRuleIP, lbRefStr, netTier) if err != nil { return nil, err } ipAddressToUse = requestedIP } if !isUserOwnedIP { // If we are not using the user-owned IP, either promote the // emphemeral IP used by the fwd rule, or create a new static IP. ipAddr, existed, err := ensureStaticIP(g, loadBalancerName, serviceName.String(), g.region, fwdRuleIP, netTier) if err != nil { return nil, fmt.Errorf("failed to ensure a static IP for load balancer (%s): %v", lbRefStr, err) } klog.Infof("ensureExternalLoadBalancer(%s): Ensured IP address %s (tier: %s).", lbRefStr, ipAddr, netTier) // If the IP was not owned by the user, but it already existed, it // could indicate that the previous update cycle failed. We can use // this IP and try to run through the process again, but we should // not release the IP unless it is explicitly flagged as OK. isSafeToReleaseIP = !existed ipAddressToUse = ipAddr } // Deal with the firewall next. The reason we do this here rather than last // is because the forwarding rule is used as the indicator that the load // balancer is fully created - it's what getLoadBalancer checks for. // Check if user specified the allow source range sourceRanges, err := servicehelpers.GetLoadBalancerSourceRanges(apiService) if err != nil { return nil, err } firewallExists, firewallNeedsUpdate, err := g.firewallNeedsUpdate(loadBalancerName, serviceName.String(), ipAddressToUse, ports, sourceRanges) if err != nil { return nil, err } if firewallNeedsUpdate { desc := makeFirewallDescription(serviceName.String(), ipAddressToUse) // Unlike forwarding rules and target pools, firewalls can be updated // without needing to be deleted and recreated. if firewallExists { klog.Infof("ensureExternalLoadBalancer(%s): Updating firewall.", lbRefStr) if err := g.updateFirewall(apiService, MakeFirewallName(loadBalancerName), desc, sourceRanges, ports, hosts); err != nil { return nil, err } klog.Infof("ensureExternalLoadBalancer(%s): Updated firewall.", lbRefStr) } else { klog.Infof("ensureExternalLoadBalancer(%s): Creating firewall.", lbRefStr) if err := g.createFirewall(apiService, MakeFirewallName(loadBalancerName), desc, sourceRanges, ports, hosts); err != nil { return nil, err } klog.Infof("ensureExternalLoadBalancer(%s): Created firewall.", lbRefStr) } } tpExists, tpNeedsRecreation, err := g.targetPoolNeedsRecreation(loadBalancerName, g.region, apiService.Spec.SessionAffinity) if err != nil { return nil, err } if !tpExists { klog.Infof("ensureExternalLoadBalancer(%s): Target pool for service doesn't exist.", lbRefStr) } // Check which health check needs to create and which health check needs to delete. // Health check management is coupled with target pool operation to prevent leaking. var hcToCreate, hcToDelete *compute.HttpHealthCheck hcLocalTrafficExisting, err := g.GetHTTPHealthCheck(loadBalancerName) if err != nil && !isHTTPErrorCode(err, http.StatusNotFound) { return nil, fmt.Errorf("error checking HTTP health check for load balancer (%s): %v", lbRefStr, err) } if path, healthCheckNodePort := servicehelpers.GetServiceHealthCheckPathPort(apiService); path != "" { klog.V(4).Infof("ensureExternalLoadBalancer(%s): Service needs local traffic health checks on: %d%s.", lbRefStr, healthCheckNodePort, path) if hcLocalTrafficExisting == nil { // This logic exists to detect a transition for non-OnlyLocal to OnlyLocal service // turn on the tpNeedsRecreation flag to delete/recreate fwdrule/tpool updating the // target pool to use local traffic health check. klog.V(2).Infof("ensureExternalLoadBalancer(%s): Updating from nodes health checks to local traffic health checks.", lbRefStr) if supportsNodesHealthCheck { hcToDelete = makeHTTPHealthCheck(MakeNodesHealthCheckName(clusterID), GetNodesHealthCheckPath(), GetNodesHealthCheckPort()) } tpNeedsRecreation = true } hcToCreate = makeHTTPHealthCheck(loadBalancerName, path, healthCheckNodePort) } else { klog.V(4).Infof("ensureExternalLoadBalancer(%s): Service needs nodes health checks.", lbRefStr) if hcLocalTrafficExisting != nil { // This logic exists to detect a transition from OnlyLocal to non-OnlyLocal service // and turn on the tpNeedsRecreation flag to delete/recreate fwdrule/tpool updating the // target pool to use nodes health check. klog.V(2).Infof("ensureExternalLoadBalancer(%s): Updating from local traffic health checks to nodes health checks.", lbRefStr) hcToDelete = hcLocalTrafficExisting tpNeedsRecreation = true } if supportsNodesHealthCheck { hcToCreate = makeHTTPHealthCheck(MakeNodesHealthCheckName(clusterID), GetNodesHealthCheckPath(), GetNodesHealthCheckPort()) } } // Now we get to some slightly more interesting logic. // First, neither target pools nor forwarding rules can be updated in place - // they have to be deleted and recreated. // Second, forwarding rules are layered on top of target pools in that you // can't delete a target pool that's currently in use by a forwarding rule. // Thus, we have to tear down the forwarding rule if either it or the target // pool needs to be updated. if fwdRuleExists && (fwdRuleNeedsUpdate || tpNeedsRecreation) { // Begin critical section. If we have to delete the forwarding rule, // and something should fail before we recreate it, don't release the // IP. That way we can come back to it later. isSafeToReleaseIP = false if err := g.DeleteRegionForwardingRule(loadBalancerName, g.region); err != nil && !isNotFound(err) { return nil, fmt.Errorf("failed to delete existing forwarding rule for load balancer (%s) update: %v", lbRefStr, err) } klog.Infof("ensureExternalLoadBalancer(%s): Deleted forwarding rule.", lbRefStr) } if err := g.ensureTargetPoolAndHealthCheck(tpExists, tpNeedsRecreation, apiService, loadBalancerName, clusterID, ipAddressToUse, hosts, hcToCreate, hcToDelete); err != nil { return nil, err } if tpNeedsRecreation || fwdRuleNeedsUpdate { klog.Infof("ensureExternalLoadBalancer(%s): Creating forwarding rule, IP %s (tier: %s).", lbRefStr, ipAddressToUse, netTier) if err := createForwardingRule(g, loadBalancerName, serviceName.String(), g.region, ipAddressToUse, g.targetPoolURL(loadBalancerName), ports, netTier); err != nil { return nil, fmt.Errorf("failed to create forwarding rule for load balancer (%s): %v", lbRefStr, err) } // End critical section. It is safe to release the static IP (which // just demotes it to ephemeral) now that it is attached. In the case // of a user-requested IP, the "is user-owned" flag will be set, // preventing it from actually being released. isSafeToReleaseIP = true klog.Infof("ensureExternalLoadBalancer(%s): Created forwarding rule, IP %s.", lbRefStr, ipAddressToUse) } status := &v1.LoadBalancerStatus{} status.Ingress = []v1.LoadBalancerIngress{{IP: ipAddressToUse}} return status, nil } // updateExternalLoadBalancer is the external implementation of LoadBalancer.UpdateLoadBalancer. func (g *Cloud) updateExternalLoadBalancer(clusterName string, service *v1.Service, nodes []*v1.Node) error { hosts, err := g.getInstancesByNames(nodeNames(nodes)) if err != nil { return err } loadBalancerName := g.GetLoadBalancerName(context.TODO(), clusterName, service) return g.updateTargetPool(loadBalancerName, hosts) } // ensureExternalLoadBalancerDeleted is the external implementation of LoadBalancer.EnsureLoadBalancerDeleted func (g *Cloud) ensureExternalLoadBalancerDeleted(clusterName, clusterID string, service *v1.Service) error { loadBalancerName := g.GetLoadBalancerName(context.TODO(), clusterName, service) serviceName := types.NamespacedName{Namespace: service.Namespace, Name: service.Name} lbRefStr := fmt.Sprintf("%v(%v)", loadBalancerName, serviceName) var hcNames []string if path, _ := servicehelpers.GetServiceHealthCheckPathPort(service); path != "" { hcToDelete, err := g.GetHTTPHealthCheck(loadBalancerName) if err != nil && !isHTTPErrorCode(err, http.StatusNotFound) { klog.Infof("ensureExternalLoadBalancerDeleted(%s): Failed to retrieve health check:%v.", lbRefStr, err) return err } // If we got 'StatusNotFound' LB was already deleted and it's safe to ignore. if err == nil { hcNames = append(hcNames, hcToDelete.Name) } } else { // EnsureLoadBalancerDeleted() could be triggered by changing service from // LoadBalancer type to others. In this case we have no idea whether it was // using local traffic health check or nodes health check. Attempt to delete // both to prevent leaking. hcNames = append(hcNames, loadBalancerName) hcNames = append(hcNames, MakeNodesHealthCheckName(clusterID)) } errs := utilerrors.AggregateGoroutines( func() error { klog.Infof("ensureExternalLoadBalancerDeleted(%s): Deleting firewall rule.", lbRefStr) fwName := MakeFirewallName(loadBalancerName) err := ignoreNotFound(g.DeleteFirewall(fwName)) if isForbidden(err) && g.OnXPN() { klog.V(4).Infof("ensureExternalLoadBalancerDeleted(%s): Do not have permission to delete firewall rule %v (on XPN). Raising event.", lbRefStr, fwName) g.raiseFirewallChangeNeededEvent(service, FirewallToGCloudDeleteCmd(fwName, g.NetworkProjectID())) return nil } return err }, // Even though we don't hold on to static IPs for load balancers, it's // possible that EnsureLoadBalancer left one around in a failed // creation/update attempt, so make sure we clean it up here just in case. func() error { klog.Infof("ensureExternalLoadBalancerDeleted(%s): Deleting IP address.", lbRefStr) return ignoreNotFound(g.DeleteRegionAddress(loadBalancerName, g.region)) }, func() error { klog.Infof("ensureExternalLoadBalancerDeleted(%s): Deleting forwarding rule.", lbRefStr) // The forwarding rule must be deleted before either the target pool can, // unfortunately, so we have to do these two serially. if err := ignoreNotFound(g.DeleteRegionForwardingRule(loadBalancerName, g.region)); err != nil { return err } klog.Infof("ensureExternalLoadBalancerDeleted(%s): Deleting target pool.", lbRefStr) if err := g.DeleteExternalTargetPoolAndChecks(service, loadBalancerName, g.region, clusterID, hcNames...); err != nil { return err } return nil }, ) if errs != nil { return utilerrors.Flatten(errs) } return nil } // DeleteExternalTargetPoolAndChecks Deletes an external load balancer pool and verifies the operation func (g *Cloud) DeleteExternalTargetPoolAndChecks(service *v1.Service, name, region, clusterID string, hcNames ...string) error { serviceName := types.NamespacedName{Namespace: service.Namespace, Name: service.Name} lbRefStr := fmt.Sprintf("%v(%v)", name, serviceName) if err := g.DeleteTargetPool(name, region); err != nil && isHTTPErrorCode(err, http.StatusNotFound) { klog.Infof("DeleteExternalTargetPoolAndChecks(%v): Target pool already deleted. Continuing to delete other resources.", lbRefStr) } else if err != nil { klog.Warningf("DeleteExternalTargetPoolAndChecks(%v): Failed to delete target pool, got error %s.", lbRefStr, err.Error()) return err } // Deletion of health checks is allowed only after the TargetPool reference is deleted for _, hcName := range hcNames { if err := func() error { // Check whether it is nodes health check, which has different name from the load-balancer. isNodesHealthCheck := hcName != name if isNodesHealthCheck { // Lock to prevent deleting necessary nodes health check before it gets attached // to target pool. g.sharedResourceLock.Lock() defer g.sharedResourceLock.Unlock() } klog.Infof("DeleteExternalTargetPoolAndChecks(%v): Deleting health check %v.", lbRefStr, hcName) if err := g.DeleteHTTPHealthCheck(hcName); err != nil { // Delete nodes health checks will fail if any other target pool is using it. if isInUsedByError(err) { klog.V(4).Infof("DeleteExternalTargetPoolAndChecks(%v): Health check %v is in used: %v.", lbRefStr, hcName, err) return nil } else if !isHTTPErrorCode(err, http.StatusNotFound) { klog.Warningf("DeleteExternalTargetPoolAndChecks(%v): Failed to delete health check %v: %v.", lbRefStr, hcName, err) return err } // StatusNotFound could happen when: // - This is the first attempt but we pass in a healthcheck that is already deleted // to prevent leaking. // - This is the first attempt but user manually deleted the heathcheck. // - This is a retry and in previous round we failed to delete the healthcheck firewall // after deleted the healthcheck. // We continue to delete the healthcheck firewall to prevent leaking. klog.V(4).Infof("DeleteExternalTargetPoolAndChecks(%v): Health check %v is already deleted.", lbRefStr, hcName) } // If health check is deleted without error, it means no load-balancer is using it. // So we should delete the health check firewall as well. fwName := MakeHealthCheckFirewallName(clusterID, hcName, isNodesHealthCheck) klog.Infof("DeleteExternalTargetPoolAndChecks(%v): Deleting health check firewall %v.", lbRefStr, fwName) if err := ignoreNotFound(g.DeleteFirewall(fwName)); err != nil { if isForbidden(err) && g.OnXPN() { klog.V(4).Infof("DeleteExternalTargetPoolAndChecks(%v): Do not have permission to delete firewall rule %v (on XPN). Raising event.", lbRefStr, fwName) g.raiseFirewallChangeNeededEvent(service, FirewallToGCloudDeleteCmd(fwName, g.NetworkProjectID())) return nil } return err } return nil }(); err != nil { return err } } return nil } // verifyUserRequestedIP checks the user-provided IP to see whether it meets // all the expected attributes for the load balancer, and returns an error if // the verification failed. It also returns a boolean to indicate whether the // IP address is considered owned by the user (i.e., not managed by the // controller. func verifyUserRequestedIP(s CloudAddressService, region, requestedIP, fwdRuleIP, lbRef string, desiredNetTier cloud.NetworkTier) (isUserOwnedIP bool, err error) { if requestedIP == "" { return false, nil } // If a specific IP address has been requested, we have to respect the // user's request and use that IP. If the forwarding rule was already using // a different IP, it will be harmlessly abandoned because it was only an // ephemeral IP (or it was a different static IP owned by the user, in which // case we shouldn't delete it anyway). existingAddress, err := s.GetRegionAddressByIP(region, requestedIP) if err != nil && !isNotFound(err) { klog.Errorf("verifyUserRequestedIP: failed to check whether the requested IP %q for LB %s exists: %v", requestedIP, lbRef, err) return false, err } if err == nil { // The requested IP is a static IP, owned and managed by the user. // Check if the network tier of the static IP matches the desired // network tier. netTierStr, err := s.getNetworkTierFromAddress(existingAddress.Name, region) if err != nil { return false, fmt.Errorf("failed to check the network tier of the IP %q: %v", requestedIP, err) } netTier := cloud.NetworkTierGCEValueToType(netTierStr) if netTier != desiredNetTier { klog.Errorf("verifyUserRequestedIP: requested static IP %q (name: %s) for LB %s has network tier %s, need %s.", requestedIP, existingAddress.Name, lbRef, netTier, desiredNetTier) return false, fmt.Errorf("requested IP %q belongs to the %s network tier; expected %s", requestedIP, netTier, desiredNetTier) } klog.V(4).Infof("verifyUserRequestedIP: the requested static IP %q (name: %s, tier: %s) for LB %s exists.", requestedIP, existingAddress.Name, netTier, lbRef) return true, nil } if requestedIP == fwdRuleIP { // The requested IP is not a static IP, but is currently assigned // to this forwarding rule, so we can just use it. klog.V(4).Infof("verifyUserRequestedIP: the requested IP %q is not static, but is currently in use by for LB %s", requestedIP, lbRef) return false, nil } // The requested IP is not static and it is not assigned to the // current forwarding rule. It might be attached to a different // rule or it might not be part of this project at all. Either // way, we can't use it. klog.Errorf("verifyUserRequestedIP: requested IP %q for LB %s is neither static nor assigned to the LB", requestedIP, lbRef) return false, fmt.Errorf("requested ip %q is neither static nor assigned to the LB", requestedIP) } func (g *Cloud) ensureTargetPoolAndHealthCheck(tpExists, tpNeedsRecreation bool, svc *v1.Service, loadBalancerName, clusterID, ipAddressToUse string, hosts []*gceInstance, hcToCreate, hcToDelete *compute.HttpHealthCheck) error { serviceName := types.NamespacedName{Namespace: svc.Namespace, Name: svc.Name} lbRefStr := fmt.Sprintf("%v(%v)", loadBalancerName, serviceName) if tpExists && tpNeedsRecreation { // Pass healthchecks to DeleteExternalTargetPoolAndChecks to cleanup health checks after cleaning up the target pool itself. var hcNames []string if hcToDelete != nil { hcNames = append(hcNames, hcToDelete.Name) } if err := g.DeleteExternalTargetPoolAndChecks(svc, loadBalancerName, g.region, clusterID, hcNames...); err != nil { return fmt.Errorf("failed to delete existing target pool for load balancer (%s) update: %v", lbRefStr, err) } klog.Infof("ensureTargetPoolAndHealthCheck(%s): Deleted target pool.", lbRefStr) } // Once we've deleted the resources (if necessary), build them back up (or for // the first time if they're new). if tpNeedsRecreation { createInstances := hosts if len(hosts) > maxTargetPoolCreateInstances { createInstances = createInstances[:maxTargetPoolCreateInstances] } if err := g.createTargetPoolAndHealthCheck(svc, loadBalancerName, serviceName.String(), ipAddressToUse, g.region, clusterID, createInstances, hcToCreate); err != nil { return fmt.Errorf("failed to create target pool for load balancer (%s): %v", lbRefStr, err) } if hcToCreate != nil { klog.Infof("ensureTargetPoolAndHealthCheck(%s): Created health checks %v.", lbRefStr, hcToCreate.Name) } if len(hosts) <= maxTargetPoolCreateInstances { klog.Infof("ensureTargetPoolAndHealthCheck(%s): Created target pool.", lbRefStr) } else { klog.Infof("ensureTargetPoolAndHealthCheck(%s): Created initial target pool (now updating the remaining %d hosts).", lbRefStr, len(hosts)-maxTargetPoolCreateInstances) if err := g.updateTargetPool(loadBalancerName, hosts); err != nil { return fmt.Errorf("failed to update target pool for load balancer (%s): %v", lbRefStr, err) } klog.Infof("ensureTargetPoolAndHealthCheck(%s): Updated target pool (with %d hosts).", lbRefStr, len(hosts)-maxTargetPoolCreateInstances) } } else if tpExists { // Ensure hosts are updated even if there is no other changes required on target pool. if err := g.updateTargetPool(loadBalancerName, hosts); err != nil { return fmt.Errorf("failed to update target pool for load balancer (%s): %v", lbRefStr, err) } klog.Infof("ensureTargetPoolAndHealthCheck(%s): Updated target pool (with %d hosts).", lbRefStr, len(hosts)) if hcToCreate != nil { if hc, err := g.ensureHTTPHealthCheck(hcToCreate.Name, hcToCreate.RequestPath, int32(hcToCreate.Port)); err != nil || hc == nil { return fmt.Errorf("failed to ensure health check for %v port %d path %v: %v", loadBalancerName, hcToCreate.Port, hcToCreate.RequestPath, err) } } } else { // Panic worthy. klog.Errorf("ensureTargetPoolAndHealthCheck(%s): target pool not exists and doesn't need to be created.", lbRefStr) } return nil } func (g *Cloud) createTargetPoolAndHealthCheck(svc *v1.Service, name, serviceName, ipAddress, region, clusterID string, hosts []*gceInstance, hc *compute.HttpHealthCheck) error { // health check management is coupled with targetPools to prevent leaks. A // target pool is the only thing that requires a health check, so we delete // associated checks on teardown, and ensure checks on setup. hcLinks := []string{} if hc != nil { // Check whether it is nodes health check, which has different name from the load-balancer. isNodesHealthCheck := hc.Name != name if isNodesHealthCheck { // Lock to prevent necessary nodes health check / firewall gets deleted. g.sharedResourceLock.Lock() defer g.sharedResourceLock.Unlock() } if err := g.ensureHTTPHealthCheckFirewall(svc, serviceName, ipAddress, region, clusterID, hosts, hc.Name, int32(hc.Port), isNodesHealthCheck); err != nil { return err } var err error hcRequestPath, hcPort := hc.RequestPath, hc.Port if hc, err = g.ensureHTTPHealthCheck(hc.Name, hc.RequestPath, int32(hc.Port)); err != nil || hc == nil { return fmt.Errorf("failed to ensure health check for %v port %d path %v: %v", name, hcPort, hcRequestPath, err) } hcLinks = append(hcLinks, hc.SelfLink) } var instances []string for _, host := range hosts { instances = append(instances, host.makeComparableHostPath()) } klog.Infof("Creating targetpool %v with %d healthchecks", name, len(hcLinks)) pool := &compute.TargetPool{ Name: name, Description: fmt.Sprintf(`{"kubernetes.io/service-name":"%s"}`, serviceName), Instances: instances, SessionAffinity: translateAffinityType(svc.Spec.SessionAffinity), HealthChecks: hcLinks, } if err := g.CreateTargetPool(pool, region); err != nil && !isHTTPErrorCode(err, http.StatusConflict) { return err } return nil } func (g *Cloud) updateTargetPool(loadBalancerName string, hosts []*gceInstance) error { pool, err := g.GetTargetPool(loadBalancerName, g.region) if err != nil { return err } existing := sets.NewString() for _, instance := range pool.Instances { existing.Insert(hostURLToComparablePath(instance)) } var toAdd []*compute.InstanceReference var toRemove []*compute.InstanceReference for _, host := range hosts { link := host.makeComparableHostPath() if !existing.Has(link) { toAdd = append(toAdd, &compute.InstanceReference{Instance: link}) } existing.Delete(link) } for link := range existing { toRemove = append(toRemove, &compute.InstanceReference{Instance: link}) } for len(toAdd) > 0 { // Do not remove more than maxInstancesPerTargetPoolUpdate in a single call. instancesCount := len(toAdd) if instancesCount > maxInstancesPerTargetPoolUpdate { instancesCount = maxInstancesPerTargetPoolUpdate } // The operation to add 1000 instances is fairly long (may take minutes), so // we don't need to worry about saturating QPS limits. if err := g.AddInstancesToTargetPool(loadBalancerName, g.region, toAdd[:instancesCount]); err != nil { return err } toAdd = toAdd[instancesCount:] } for len(toRemove) > 0 { // Do not remove more than maxInstancesPerTargetPoolUpdate in a single call. instancesCount := len(toRemove) if instancesCount > maxInstancesPerTargetPoolUpdate { instancesCount = maxInstancesPerTargetPoolUpdate } // The operation to remove 1000 instances is fairly long (may take minutes), so // we don't need to worry about saturating QPS limits. if err := g.RemoveInstancesFromTargetPool(loadBalancerName, g.region, toRemove[:instancesCount]); err != nil { return err } toRemove = toRemove[instancesCount:] } // Try to verify that the correct number of nodes are now in the target pool. // We've been bitten by a bug here before (#11327) where all nodes were // accidentally removed and want to make similar problems easier to notice. updatedPool, err := g.GetTargetPool(loadBalancerName, g.region) if err != nil { return err } if len(updatedPool.Instances) != len(hosts) { klog.Errorf("Unexpected number of instances (%d) in target pool %s after updating (expected %d). Instances in updated pool: %s", len(updatedPool.Instances), loadBalancerName, len(hosts), strings.Join(updatedPool.Instances, ",")) return fmt.Errorf("unexpected number of instances (%d) in target pool %s after update (expected %d)", len(updatedPool.Instances), loadBalancerName, len(hosts)) } return nil } func (g *Cloud) targetPoolURL(name string) string { return g.service.BasePath + strings.Join([]string{g.projectID, "regions", g.region, "targetPools", name}, "/") } func makeHTTPHealthCheck(name, path string, port int32) *compute.HttpHealthCheck { return &compute.HttpHealthCheck{ Name: name, Port: int64(port), RequestPath: path, Host: "", Description: makeHealthCheckDescription(name), CheckIntervalSec: gceHcCheckIntervalSeconds, TimeoutSec: gceHcTimeoutSeconds, HealthyThreshold: gceHcHealthyThreshold, UnhealthyThreshold: gceHcUnhealthyThreshold, } } // mergeHTTPHealthChecks reconciles HttpHealthCheck configures to be no smaller // than the default values. // E.g. old health check interval is 2s, new default is 8. // The HC interval will be reconciled to 8 seconds. // If the existing health check is larger than the default interval, // the configuration will be kept. func mergeHTTPHealthChecks(hc, newHC *compute.HttpHealthCheck) { if hc.CheckIntervalSec > newHC.CheckIntervalSec { newHC.CheckIntervalSec = hc.CheckIntervalSec } if hc.TimeoutSec > newHC.TimeoutSec { newHC.TimeoutSec = hc.TimeoutSec } if hc.UnhealthyThreshold > newHC.UnhealthyThreshold { newHC.UnhealthyThreshold = hc.UnhealthyThreshold } if hc.HealthyThreshold > newHC.HealthyThreshold { newHC.HealthyThreshold = hc.HealthyThreshold } } // needToUpdateHTTPHealthChecks checks whether the http healthcheck needs to be // updated. func needToUpdateHTTPHealthChecks(hc, newHC *compute.HttpHealthCheck) bool { switch { case hc.Port != newHC.Port, hc.RequestPath != newHC.RequestPath, hc.Description != newHC.Description, hc.CheckIntervalSec < newHC.CheckIntervalSec, hc.TimeoutSec < newHC.TimeoutSec, hc.UnhealthyThreshold < newHC.UnhealthyThreshold, hc.HealthyThreshold < newHC.HealthyThreshold: return true } return false } func (g *Cloud) ensureHTTPHealthCheck(name, path string, port int32) (hc *compute.HttpHealthCheck, err error) { newHC := makeHTTPHealthCheck(name, path, port) hc, err = g.GetHTTPHealthCheck(name) if hc == nil || err != nil && isHTTPErrorCode(err, http.StatusNotFound) { klog.Infof("Did not find health check %v, creating port %v path %v", name, port, path) if err = g.CreateHTTPHealthCheck(newHC); err != nil { return nil, err } hc, err = g.GetHTTPHealthCheck(name) if err != nil { klog.Errorf("Failed to get http health check %v", err) return nil, err } klog.Infof("Created HTTP health check %v healthCheckNodePort: %d", name, port) return hc, nil } // Validate health check fields klog.V(4).Infof("Checking http health check params %s", name) if needToUpdateHTTPHealthChecks(hc, newHC) { klog.Warningf("Health check %v exists but parameters have drifted - updating...", name) mergeHTTPHealthChecks(hc, newHC) if err := g.UpdateHTTPHealthCheck(newHC); err != nil { klog.Warningf("Failed to reconcile http health check %v parameters", name) return nil, err } klog.V(4).Infof("Corrected health check %v parameters successful", name) hc, err = g.GetHTTPHealthCheck(name) if err != nil { return nil, err } } return hc, nil } // Passing nil for requested IP is perfectly fine - it just means that no specific // IP is being requested. // Returns whether the forwarding rule exists, whether it needs to be updated, // what its IP address is (if it exists), and any error we encountered. func (g *Cloud) forwardingRuleNeedsUpdate(name, region string, loadBalancerIP string, ports []v1.ServicePort) (exists bool, needsUpdate bool, ipAddress string, err error) { fwd, err := g.GetRegionForwardingRule(name, region) if err != nil { if isHTTPErrorCode(err, http.StatusNotFound) { return false, true, "", nil } // Err on the side of caution in case of errors. Caller should notice the error and retry. // We never want to end up recreating resources because g api flaked. return true, false, "", fmt.Errorf("error getting load balancer's forwarding rule: %v", err) } // If the user asks for a specific static ip through the Service spec, // check that we're actually using it. // TODO: we report loadbalancer IP through status, so we want to verify if // that matches the forwarding rule as well. if loadBalancerIP != "" && loadBalancerIP != fwd.IPAddress { klog.Infof("LoadBalancer ip for forwarding rule %v was expected to be %v, but was actually %v", fwd.Name, fwd.IPAddress, loadBalancerIP) return true, true, fwd.IPAddress, nil } portRange, err := loadBalancerPortRange(ports) if err != nil { // Err on the side of caution in case of errors. Caller should notice the error and retry. // We never want to end up recreating resources because g api flaked. return true, false, "", err } if portRange != fwd.PortRange { klog.Infof("LoadBalancer port range for forwarding rule %v was expected to be %v, but was actually %v", fwd.Name, fwd.PortRange, portRange) return true, true, fwd.IPAddress, nil } // The service controller verified all the protocols match on the ports, just check the first one if string(ports[0].Protocol) != fwd.IPProtocol { klog.Infof("LoadBalancer protocol for forwarding rule %v was expected to be %v, but was actually %v", fwd.Name, fwd.IPProtocol, string(ports[0].Protocol)) return true, true, fwd.IPAddress, nil } return true, false, fwd.IPAddress, nil } // Doesn't check whether the hosts have changed, since host updating is handled // separately. func (g *Cloud) targetPoolNeedsRecreation(name, region string, affinityType v1.ServiceAffinity) (exists bool, needsRecreation bool, err error) { tp, err := g.GetTargetPool(name, region) if err != nil { if isHTTPErrorCode(err, http.StatusNotFound) { return false, true, nil } // Err on the side of caution in case of errors. Caller should notice the error and retry. // We never want to end up recreating resources because g api flaked. return true, false, fmt.Errorf("error getting load balancer's target pool: %v", err) } // TODO: If the user modifies their Service's session affinity, it *should* // reflect in the associated target pool. However, currently not setting the // session affinity on a target pool defaults it to the empty string while // not setting in on a Service defaults it to None. There is a lack of // documentation around the default setting for the target pool, so if we // find it's the undocumented empty string, don't blindly recreate the // target pool (which results in downtime). Fix this when we have formally // defined the defaults on either side. if tp.SessionAffinity != "" && translateAffinityType(affinityType) != tp.SessionAffinity { klog.Infof("LoadBalancer target pool %v changed affinity from %v to %v", name, tp.SessionAffinity, affinityType) return true, true, nil } return true, false, nil } func (h *gceInstance) makeComparableHostPath() string { return fmt.Sprintf("/zones/%s/instances/%s", h.Zone, h.Name) } func nodeNames(nodes []*v1.Node) []string { ret := make([]string, len(nodes)) for i, node := range nodes { ret[i] = node.Name } return ret } func hostURLToComparablePath(hostURL string) string { idx := strings.Index(hostURL, "/zones/") if idx < 0 { return "" } return hostURL[idx:] } func loadBalancerPortRange(ports []v1.ServicePort) (string, error) { if len(ports) == 0 { return "", fmt.Errorf("no ports specified for GCE load balancer") } // The service controller verified all the protocols match on the ports, just check and use the first one if ports[0].Protocol != v1.ProtocolTCP && ports[0].Protocol != v1.ProtocolUDP { return "", fmt.Errorf("invalid protocol %s, only TCP and UDP are supported", string(ports[0].Protocol)) } minPort := int32(65536) maxPort := int32(0) for i := range ports { if ports[i].Port < minPort { minPort = ports[i].Port } if ports[i].Port > maxPort { maxPort = ports[i].Port } } return fmt.Sprintf("%d-%d", minPort, maxPort), nil } // translate from what K8s supports to what the cloud provider supports for session affinity. func translateAffinityType(affinityType v1.ServiceAffinity) string { switch affinityType { case v1.ServiceAffinityClientIP: return gceAffinityTypeClientIP case v1.ServiceAffinityNone: return gceAffinityTypeNone default: klog.Errorf("Unexpected affinity type: %v", affinityType) return gceAffinityTypeNone } } func (g *Cloud) firewallNeedsUpdate(name, serviceName, ipAddress string, ports []v1.ServicePort, sourceRanges utilnet.IPNetSet) (exists bool, needsUpdate bool, err error) { fw, err := g.GetFirewall(MakeFirewallName(name)) if err != nil { if isHTTPErrorCode(err, http.StatusNotFound) { return false, true, nil } return false, false, fmt.Errorf("error getting load balancer's firewall: %v", err) } if fw.Description != makeFirewallDescription(serviceName, ipAddress) { return true, true, nil } if len(fw.Allowed) != 1 || (fw.Allowed[0].IPProtocol != "tcp" && fw.Allowed[0].IPProtocol != "udp") { return true, true, nil } // Make sure the allowed ports match. portNums, portRanges, _ := getPortsAndProtocol(ports) // This logic checks if the existing firewall rules contains either enumerated service ports or port ranges. // This is to prevent unnecessary noop updates to the firewall rule when the existing firewall rule is // set up via the previous pattern using enumerated ports instead of port ranges. if !equalStringSets(portNums, fw.Allowed[0].Ports) && !equalStringSets(portRanges, fw.Allowed[0].Ports) { return true, true, nil } // The service controller already verified that the protocol matches on all ports, no need to check. actualSourceRanges, err := utilnet.ParseIPNets(fw.SourceRanges...) if err != nil { // This really shouldn't happen... GCE has returned something unexpected klog.Warningf("Error parsing firewall SourceRanges: %v", fw.SourceRanges) // We don't return the error, because we can hopefully recover from this by reconfiguring the firewall return true, true, nil } if !sourceRanges.Equal(actualSourceRanges) { return true, true, nil } return true, false, nil } func (g *Cloud) ensureHTTPHealthCheckFirewall(svc *v1.Service, serviceName, ipAddress, region, clusterID string, hosts []*gceInstance, hcName string, hcPort int32, isNodesHealthCheck bool) error { // Prepare the firewall params for creating / checking. desc := fmt.Sprintf(`{"kubernetes.io/cluster-id":"%s"}`, clusterID) if !isNodesHealthCheck { desc = makeFirewallDescription(serviceName, ipAddress) } sourceRanges := l4LbSrcRngsFlag.ipn ports := []v1.ServicePort{{Protocol: "tcp", Port: hcPort}} fwName := MakeHealthCheckFirewallName(clusterID, hcName, isNodesHealthCheck) fw, err := g.GetFirewall(fwName) if err != nil { if !isHTTPErrorCode(err, http.StatusNotFound) { return fmt.Errorf("error getting firewall for health checks: %v", err) } klog.Infof("Creating firewall %v for health checks.", fwName) if err := g.createFirewall(svc, fwName, desc, sourceRanges, ports, hosts); err != nil { return err } klog.Infof("Created firewall %v for health checks.", fwName) return nil } // Validate firewall fields. if fw.Description != desc || len(fw.Allowed) != 1 || fw.Allowed[0].IPProtocol != string(ports[0].Protocol) || !equalStringSets(fw.Allowed[0].Ports, []string{strconv.Itoa(int(ports[0].Port))}) || !equalStringSets(fw.SourceRanges, sourceRanges.StringSlice()) { klog.Warningf("Firewall %v exists but parameters have drifted - updating...", fwName) if err := g.updateFirewall(svc, fwName, desc, sourceRanges, ports, hosts); err != nil { klog.Warningf("Failed to reconcile firewall %v parameters.", fwName) return err } klog.V(4).Infof("Corrected firewall %v parameters successful", fwName) } return nil } func createForwardingRule(s CloudForwardingRuleService, name, serviceName, region, ipAddress, target string, ports []v1.ServicePort, netTier cloud.NetworkTier) error { portRange, err := loadBalancerPortRange(ports) if err != nil { return err } desc := makeServiceDescription(serviceName) ipProtocol := string(ports[0].Protocol) rule := &compute.ForwardingRule{ Name: name, Description: desc, IPAddress: ipAddress, IPProtocol: ipProtocol, PortRange: portRange, Target: target, NetworkTier: netTier.ToGCEValue(), } err = s.CreateRegionForwardingRule(rule, region) if err != nil && !isHTTPErrorCode(err, http.StatusConflict) { return err } return nil } func (g *Cloud) createFirewall(svc *v1.Service, name, desc string, sourceRanges utilnet.IPNetSet, ports []v1.ServicePort, hosts []*gceInstance) error { firewall, err := g.firewallObject(name, desc, sourceRanges, ports, hosts) if err != nil { return err } if err = g.CreateFirewall(firewall); err != nil { if isHTTPErrorCode(err, http.StatusConflict) { return nil } else if isForbidden(err) && g.OnXPN() { klog.V(4).Infof("createFirewall(%v): do not have permission to create firewall rule (on XPN). Raising event.", firewall.Name) g.raiseFirewallChangeNeededEvent(svc, FirewallToGCloudCreateCmd(firewall, g.NetworkProjectID())) return nil } return err } return nil } func (g *Cloud) updateFirewall(svc *v1.Service, name, desc string, sourceRanges utilnet.IPNetSet, ports []v1.ServicePort, hosts []*gceInstance) error { firewall, err := g.firewallObject(name, desc, sourceRanges, ports, hosts) if err != nil { return err } if err = g.UpdateFirewall(firewall); err != nil { if isHTTPErrorCode(err, http.StatusConflict) { return nil } else if isForbidden(err) && g.OnXPN() { klog.V(4).Infof("updateFirewall(%v): do not have permission to update firewall rule (on XPN). Raising event.", firewall.Name) g.raiseFirewallChangeNeededEvent(svc, FirewallToGCloudUpdateCmd(firewall, g.NetworkProjectID())) return nil } return err } return nil } func (g *Cloud) firewallObject(name, desc string, sourceRanges utilnet.IPNetSet, ports []v1.ServicePort, hosts []*gceInstance) (*compute.Firewall, error) { // Concatenate service ports into port ranges. This help to workaround the gce firewall limitation where only // 100 ports or port ranges can be used in a firewall rule. _, portRanges, _ := getPortsAndProtocol(ports) // If the node tags to be used for this cluster have been predefined in the // provider config, just use them. Otherwise, invoke computeHostTags method to get the tags. hostTags := g.nodeTags if len(hostTags) == 0 { var err error if hostTags, err = g.computeHostTags(hosts); err != nil { return nil, fmt.Errorf("no node tags supplied and also failed to parse the given lists of hosts for tags. Abort creating firewall rule") } } firewall := &compute.Firewall{ Name: name, Description: desc, Network: g.networkURL, SourceRanges: sourceRanges.StringSlice(), TargetTags: hostTags, Allowed: []*compute.FirewallAllowed{ { // TODO: Make this more generic. Currently this method is only // used to create firewall rules for loadbalancers, which have // exactly one protocol, so we can never end up with a list of // mixed TCP and UDP ports. It should be possible to use a // single firewall rule for both a TCP and UDP lb. IPProtocol: strings.ToLower(string(ports[0].Protocol)), Ports: portRanges, }, }, } return firewall, nil } func ensureStaticIP(s CloudAddressService, name, serviceName, region, existingIP string, netTier cloud.NetworkTier) (ipAddress string, existing bool, err error) { // If the address doesn't exist, this will create it. // If the existingIP exists but is ephemeral, this will promote it to static. // If the address already exists, this will harmlessly return a StatusConflict // and we'll grab the IP before returning. existed := false desc := makeServiceDescription(serviceName) var creationErr error addressObj := &compute.Address{ Name: name, Description: desc, NetworkTier: netTier.ToGCEValue(), } if existingIP != "" { addressObj.Address = existingIP } creationErr = s.ReserveRegionAddress(addressObj, region) if creationErr != nil { // GCE returns StatusConflict if the name conflicts; it returns // StatusBadRequest if the IP conflicts. if !isHTTPErrorCode(creationErr, http.StatusConflict) && !isHTTPErrorCode(creationErr, http.StatusBadRequest) { return "", false, fmt.Errorf("error creating gce static IP address: %v", creationErr) } existed = true } // If address exists, get it by IP, because name might be different. // This can specifically happen if the IP was changed from ephemeral to static, // which results in a new name for the IP. if existingIP != "" { addr, err := s.GetRegionAddressByIP(region, existingIP) if err != nil { return "", false, fmt.Errorf("error getting static IP address: %v", err) } return addr.Address, existed, nil } // Otherwise, get address by name addr, err := s.GetRegionAddress(name, region) if err != nil { return "", false, fmt.Errorf("error getting static IP address: %v", err) } return addr.Address, existed, nil } func (g *Cloud) getServiceNetworkTier(svc *v1.Service) (cloud.NetworkTier, error) { tier, err := GetServiceNetworkTier(svc) if err != nil { // Returns an error if the annotation is invalid. return cloud.NetworkTier(""), err } return tier, nil } func (g *Cloud) deleteWrongNetworkTieredResources(lbName, lbRef string, desiredNetTier cloud.NetworkTier) error { logPrefix := fmt.Sprintf("deleteWrongNetworkTieredResources:(%s)", lbRef) if err := deleteFWDRuleWithWrongTier(g, g.region, lbName, logPrefix, desiredNetTier); err != nil { return err } if err := deleteAddressWithWrongTier(g, g.region, lbName, logPrefix, desiredNetTier); err != nil { return err } return nil } // deleteFWDRuleWithWrongTier checks the network tier of existing forwarding // rule and delete the rule if the tier does not matched the desired tier. func deleteFWDRuleWithWrongTier(s CloudForwardingRuleService, region, name, logPrefix string, desiredNetTier cloud.NetworkTier) error { tierStr, err := s.getNetworkTierFromForwardingRule(name, region) if isNotFound(err) { return nil } else if err != nil { return err } existingTier := cloud.NetworkTierGCEValueToType(tierStr) if existingTier == desiredNetTier { return nil } klog.V(2).Infof("%s: Network tiers do not match; existing forwarding rule: %q, desired: %q. Deleting the forwarding rule", logPrefix, existingTier, desiredNetTier) err = s.DeleteRegionForwardingRule(name, region) return ignoreNotFound(err) } // deleteAddressWithWrongTier checks the network tier of existing address // and delete the address if the tier does not matched the desired tier. func deleteAddressWithWrongTier(s CloudAddressService, region, name, logPrefix string, desiredNetTier cloud.NetworkTier) error { // We only check the IP address matching the reserved name that the // controller assigned to the LB. We make the assumption that an address of // such name is owned by the controller and is safe to release. Whether an // IP is owned by the user is not clearly defined in the current code, and // this assumption may not match some of the existing logic in the code. // However, this is okay since network tiering is still Alpha and will be // properly gated. // TODO(#51665): Re-evaluate the "ownership" of the IP address to ensure // we don't release IP unintentionally. tierStr, err := s.getNetworkTierFromAddress(name, region) if isNotFound(err) { return nil } else if err != nil { return err } existingTier := cloud.NetworkTierGCEValueToType(tierStr) if existingTier == desiredNetTier { return nil } klog.V(2).Infof("%s: Network tiers do not match; existing address: %q, desired: %q. Deleting the address", logPrefix, existingTier, desiredNetTier) err = s.DeleteRegionAddress(name, region) return ignoreNotFound(err) }