AWS: Fix race in security-group read/create

We need to find the ID for a named security group, or create a new one.

We do this by listing the security groups, and then doing a create if we
cannot find one.  This is a race though; against another thread if the
AWS API were consistent, but generally because the AWS API is actually
eventually consistent.

We wrap it in a retry loop.
pull/6/head
Justin Santa Barbara 2015-07-09 13:28:42 -04:00
parent d7bace23ff
commit 092d407a48
1 changed files with 52 additions and 29 deletions

View File

@ -50,6 +50,12 @@ const ProviderName = "aws"
// The tag name we use to differentiate multiple logically independent clusters running in the same AZ
const TagNameKubernetesCluster = "KubernetesCluster"
// We sometimes read to see if something exists; then try to create it if we didn't find it
// This can fail once in a consistent system if done in parallel
// In an eventually consistent system, it could fail unboundedly
// MaxReadThenCreateRetries sets the maxiumum number of attempts we will make
const MaxReadThenCreateRetries = 30
// Abstraction over AWS, to allow mocking/other implementations
type AWSServices interface {
Compute(region string) (EC2, error)
@ -1656,37 +1662,54 @@ func (s *AWSCloud) removeSecurityGroupIngress(securityGroupId string, removePerm
// Makes sure the security group exists
// Returns the security group id or error
func (s *AWSCloud) ensureSecurityGroup(name string, description string, vpcID string) (string, error) {
request := &ec2.DescribeSecurityGroupsInput{}
filters := []*ec2.Filter{
newEc2Filter("group-name", name),
newEc2Filter("vpc-id", vpcID),
}
request.Filters = s.addFilters(filters)
groupID := ""
attempt := 0
for {
attempt++
securityGroups, err := s.ec2.DescribeSecurityGroups(request)
if err != nil {
return "", err
}
if len(securityGroups) >= 1 {
if len(securityGroups) > 1 {
glog.Warning("Found multiple security groups with name:", name)
request := &ec2.DescribeSecurityGroupsInput{}
filters := []*ec2.Filter{
newEc2Filter("group-name", name),
newEc2Filter("vpc-id", vpcID),
}
request.Filters = s.addFilters(filters)
securityGroups, err := s.ec2.DescribeSecurityGroups(request)
if err != nil {
return "", err
}
if len(securityGroups) >= 1 {
if len(securityGroups) > 1 {
glog.Warning("Found multiple security groups with name:", name)
}
return orEmpty(securityGroups[0].GroupID), nil
}
createRequest := &ec2.CreateSecurityGroupInput{}
createRequest.VPCID = &vpcID
createRequest.GroupName = &name
createRequest.Description = &description
createResponse, err := s.ec2.CreateSecurityGroup(createRequest)
if err != nil {
ignore := false
if awsError, ok := err.(awserr.Error); ok {
if awsError.Code() == "InvalidGroup.Duplicate" && attempt < MaxReadThenCreateRetries {
glog.V(2).Infof("Got InvalidGroup.Duplicate while creating security group (race?); will retry")
ignore = true
}
}
if !ignore {
glog.Error("error creating security group: ", err)
return "", err
}
time.Sleep(1 * time.Second)
} else {
groupID = orEmpty(createResponse.GroupID)
break
}
return orEmpty(securityGroups[0].GroupID), nil
}
createRequest := &ec2.CreateSecurityGroupInput{}
createRequest.VPCID = &vpcID
createRequest.GroupName = &name
createRequest.Description = &description
createResponse, err := s.ec2.CreateSecurityGroup(createRequest)
if err != nil {
glog.Error("error creating security group: ", err)
return "", err
}
groupID := orEmpty(createResponse.GroupID)
if groupID == "" {
return "", fmt.Errorf("created security group, but id was not returned: %s", name)
}
@ -1702,7 +1725,7 @@ func (s *AWSCloud) ensureSecurityGroup(name string, description string, vpcID st
tagRequest := &ec2.CreateTagsInput{}
tagRequest.Resources = []*string{&groupID}
tagRequest.Tags = tags
_, err = s.createTags(tagRequest)
_, err := s.createTags(tagRequest)
if err != nil {
// Not clear how to recover fully from this; we're OK because we don't match on tags, but that is a little odd
return "", fmt.Errorf("error tagging security group: %v", err)