From 88a99b5beeaa51eaf646eb92d8f546f65f654008 Mon Sep 17 00:00:00 2001 From: Daniel Wallace Date: Wed, 25 Apr 2018 11:13:15 -0500 Subject: [PATCH] Fix for EC2 Rate Limit Failures Fix for ec2 rate limit failures described here: https://bugzilla.suse.com/show_bug.cgi?id=1088888 --- salt/utils/aws.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/salt/utils/aws.py b/salt/utils/aws.py index 059450e7ca..912f1466ba 100644 --- a/salt/utils/aws.py +++ b/salt/utils/aws.py @@ -20,6 +20,7 @@ import hmac import logging import salt.config import re +import random from salt.ext import six # Import Salt libs @@ -442,8 +443,9 @@ def query(params=None, setname=None, requesturl=None, location=None, ) headers = {} - attempts = 5 - while attempts > 0: + MAX_RETRIES = 6 + attempts = 0 + while attempts < MAX_RETRIES: log.debug('AWS Request: %s', requesturl) log.trace('AWS Request Parameters: %s', params_with_headers) try: @@ -461,15 +463,23 @@ def query(params=None, setname=None, requesturl=None, location=None, # check to see if we should retry the query err_code = data.get('Errors', {}).get('Error', {}).get('Code', '') - if attempts > 0 and err_code and err_code in AWS_RETRY_CODES: - attempts -= 1 + if attempts < MAX_RETRIES and err_code and err_code in AWS_RETRY_CODES: + attempts += 1 log.error( 'AWS Response Status Code and Error: [%s %s] %s; ' 'Attempts remaining: %s', exc.response.status_code, exc, data, attempts ) - # Wait a bit before continuing to prevent throttling - time.sleep(2) + # backoff an exponential amount of time to throttle requests + # during "API Rate Exceeded" failures as suggested by the AWS documentation here: + # https://docs.aws.amazon.com/AWSEC2/latest/APIReference/query-api-troubleshooting.html + # and also here: + # https://docs.aws.amazon.com/general/latest/gr/api-retries.html + # Failure to implement this approach results in a failure rate of >30% when using salt-cloud with + # "--parallel" when creating 50 or more instances with a fixed delay of 2 seconds. + # A failure rate of >10% is observed when using the salt-api with an asyncronous client + # specified (runner_async). + time.sleep(random.uniform(1, 2**attempts)) continue log.error( -- 2.13.7