#!/usr/bin/python

# Copyright: (c) 2018, Rob White (@wimnat)
# GNU General Public License v3.0+ (see COPYING or https://www.gnu.org/licenses/gpl-3.0.txt)

# Provenance: commit 38c13259b34d2e33c6f06ee89139ebf9bfc44931,
# "[AWS] New module: aws_glue_job (#39493)",
# file lib/ansible/modules/cloud/amazon/aws_glue_job.py

ANSIBLE_METADATA = {'metadata_version': '1.1',
                    'status': ['preview'],
                    'supported_by': 'community'}

DOCUMENTATION = '''
---
module: aws_glue_job
short_description: Manage an AWS Glue job
description:
    - Manage an AWS Glue job. See U(https://aws.amazon.com/glue/) for details.
version_added: "2.6"
requirements: [ boto3 ]
author: "Rob White (@wimnat)"
options:
  allocated_capacity:
    description:
      - The number of AWS Glue data processing units (DPUs) to allocate to this Job. From 2 to 100 DPUs
        can be allocated; the default is 10. A DPU is a relative measure of processing power that consists
        of 4 vCPUs of compute capacity and 16 GB of memory.
    required: false
  command_name:
    description:
      - The name of the job command. This must be 'glueetl'.
    required: false
    default: glueetl
  command_script_location:
    description:
      - The S3 path to a script that executes a job.
      - Required when C(state=present).
    required: false
  connections:
    description:
      - A list of Glue connections used for this job.
    required: false
  default_arguments:
    description:
      - A dict of default arguments for this job. You can specify arguments here that your own job-execution
        script consumes, as well as arguments that AWS Glue itself consumes.
    required: false
  description:
    description:
      - Description of the job being defined.
    required: false
  max_concurrent_runs:
    description:
      - The maximum number of concurrent runs allowed for the job. The default is 1. An error is returned when
        this threshold is reached. The maximum value you can specify is controlled by a service limit.
    required: false
  max_retries:
    description:
      - The maximum number of times to retry this job if it fails.
    required: false
  name:
    description:
      - The name you assign to this job definition. It must be unique in your account.
    required: true
  role:
    description:
      - The name or ARN of the IAM role associated with this job.
      - Required when C(state=present).
    required: false
  state:
    description:
      - Create or delete the AWS Glue job.
    required: true
    choices: [ 'present', 'absent' ]
  timeout:
    description:
      - The job timeout in minutes.
    required: false
extends_documentation_fragment:
    - aws
    - ec2
'''

EXAMPLES = '''
# Note: These examples do not set authentication details, see the AWS Guide for details.

# Create an AWS Glue job
- aws_glue_job:
    command_script_location: s3bucket/script.py
    name: my-glue-job
    role: my-iam-role
    state: present

# Delete an AWS Glue job
- aws_glue_job:
    name: my-glue-job
    state: absent

'''

RETURN = '''
allocated_capacity:
    description: The number of AWS Glue data processing units (DPUs) allocated to runs of this job. From 2 to
                 100 DPUs can be allocated; the default is 10. A DPU is a relative measure of processing power
                 that consists of 4 vCPUs of compute capacity and 16 GB of memory.
    returned: when state is present
    type: int
    sample: 10
command:
    description: The JobCommand that executes this job.
    returned: when state is present
    type: complex
    contains:
        name:
            description: The name of the job command.
            returned: when state is present
            type: string
            sample: glueetl
        script_location:
            description: Specifies the S3 path to a script that executes a job.
            returned: when state is present
            type: string
            sample: mybucket/myscript.py
connections:
    description: The connections used for this job.
    returned: when state is present
    type: dict
    sample: "{ Connections: [ 'list', 'of', 'connections' ] }"
created_on:
    description: The time and date that this job definition was created.
    returned: when state is present
    type: string
    sample: "2018-04-21T05:19:58.326000+00:00"
default_arguments:
    description: The default arguments for this job, specified as name-value pairs.
    returned: when state is present
    type: dict
    sample: "{ 'mykey1': 'myvalue1' }"
description:
    description: Description of the job being defined.
    returned: when state is present
    type: string
    sample: My first Glue job
job_name:
    description: The name of the AWS Glue job.
    returned: always
    type: string
    sample: my-glue-job
execution_property:
    description: An ExecutionProperty specifying the maximum number of concurrent runs allowed for this job.
    returned: always
    type: complex
    contains:
        max_concurrent_runs:
            description: The maximum number of concurrent runs allowed for the job. The default is 1. An error is
                         returned when this threshold is reached. The maximum value you can specify is controlled by
                         a service limit.
            returned: when state is present
            type: int
            sample: 1
last_modified_on:
    description: The last point in time when this job definition was modified.
    returned: when state is present
    type: string
    sample: "2018-04-21T05:19:58.326000+00:00"
max_retries:
    description: The maximum number of times to retry this job after a JobRun fails.
    returned: when state is present
    type: int
    sample: 5
name:
    description: The name assigned to this job definition.
    returned: when state is present
    type: string
    sample: my-glue-job
role:
    description: The name or ARN of the IAM role associated with this job.
    returned: when state is present
    type: string
    sample: my-iam-role
timeout:
    description: The job timeout in minutes.
    returned: when state is present
    type: int
    sample: 300
'''

from ansible.module_utils.aws.core import AnsibleAWSModule
from ansible.module_utils.ec2 import camel_dict_to_snake_dict

# Non-ansible imports
import copy
try:
    from botocore.exceptions import BotoCoreError, ClientError
except ImportError:
    pass


def _get_glue_job(connection, module, glue_job_name):
    """
    Get an AWS Glue job based on name. If not found, return None.

    Any error other than 'EntityNotFoundException' is reported through
    module.fail_json_aws().

    :param connection: AWS boto3 glue connection
    :param module: Ansible module
    :param glue_job_name: Name of Glue job to get
    :return: boto3 Glue job dict or None if not found
    """

    try:
        return connection.get_job(JobName=glue_job_name)['Job']
    except ClientError as e:
        # Only ClientError carries a service response with an error code, so the
        # "not found" check must not be attempted on a BotoCoreError.
        if e.response['Error']['Code'] == 'EntityNotFoundException':
            return None
        module.fail_json_aws(e)
    except BotoCoreError as e:
        module.fail_json_aws(e)


def _get_connection_names(connections):
    """
    Extract the connection names from a Glue 'Connections' structure.

    :param connections: a dict of the form {'Connections': [names]}, or None
    :return: set of connection names (empty if there are none)
    """

    if not connections:
        return set()
    return set(connections.get('Connections') or [])


def _compare_glue_job_params(user_params, current_params):
    """
    Compare Glue job params. If there is a difference, return True immediately else return False

    Only keys present in user_params are compared. A key missing from current_params is treated
    as a difference rather than an error. Note that current_params is modified in place:
    'Description' and 'DefaultArguments' are added with empty values when absent.

    :param user_params: the Glue job parameters passed by the user
    :param current_params: the Glue job parameters currently configured
    :return: True if any parameter is mismatched else False
    """

    # Weirdly, boto3 doesn't return some keys if the value is empty e.g. Description
    # To counter this, add the key if it's missing with a blank value.
    # These defaults are written into the caller's dict and therefore show up in the
    # module result when the job is left unchanged.
    if 'Description' not in current_params:
        current_params['Description'] = ""
    if 'DefaultArguments' not in current_params:
        current_params['DefaultArguments'] = dict()

    # NOTE(review): 'Role' and the command 'Name' are not compared here, so changing only
    # those options does not trigger an update - confirm whether that is intended.

    # Parameters that can be compared directly, values included
    for key in ('AllocatedCapacity', 'DefaultArguments', 'Description', 'MaxRetries', 'Timeout'):
        if key in user_params and user_params[key] != current_params.get(key):
            return True

    if 'Command' in user_params:
        current_command = current_params.get('Command') or dict()
        if user_params['Command']['ScriptLocation'] != current_command.get('ScriptLocation'):
            return True

    if 'Connections' in user_params:
        # Both sides are {'Connections': [names]}; compare the names themselves, ignoring order
        user_connections = _get_connection_names(user_params['Connections'])
        current_connections = _get_connection_names(current_params.get('Connections'))
        if user_connections != current_connections:
            return True

    if 'ExecutionProperty' in user_params:
        current_execution = current_params.get('ExecutionProperty') or dict()
        if user_params['ExecutionProperty']['MaxConcurrentRuns'] != current_execution.get('MaxConcurrentRuns'):
            return True

    return False


def create_or_update_glue_job(connection, module, glue_job):
    """
    Create or update an AWS Glue job

    Builds the boto3 parameters from the module options, creates the job when glue_job is
    empty, otherwise updates it only if a compared parameter differs. Always ends the module
    run through module.exit_json() or module.fail_json_aws().

    :param connection: AWS boto3 glue connection
    :param module: Ansible module
    :param glue_job: a dict of AWS Glue job parameters or None
    :return:
    """

    changed = False

    params = dict()
    params['Name'] = module.params.get("name")
    params['Role'] = module.params.get("role")

    # Options passed straight through under their boto3 key, when the user set them
    simple_options = (
        ('allocated_capacity', 'AllocatedCapacity'),
        ('default_arguments', 'DefaultArguments'),
        ('description', 'Description'),
        ('max_retries', 'MaxRetries'),
        ('timeout', 'Timeout'),
    )
    for option_name, boto_key in simple_options:
        option_value = module.params.get(option_name)
        if option_value is not None:
            params[boto_key] = option_value

    # Options that boto3 expects wrapped in a nested structure
    if module.params.get("command_script_location") is not None:
        params['Command'] = {'Name': module.params.get("command_name"),
                             'ScriptLocation': module.params.get("command_script_location")}
    if module.params.get("connections") is not None:
        params['Connections'] = {'Connections': module.params.get("connections")}
    if module.params.get("max_concurrent_runs") is not None:
        params['ExecutionProperty'] = {'MaxConcurrentRuns': module.params.get("max_concurrent_runs")}

    # If glue_job is not None then check if it needs to be modified, else create it
    if glue_job:
        if _compare_glue_job_params(params, glue_job):
            try:
                # Update job needs slightly modified params: the name moves to 'JobName'
                # and everything else goes under 'JobUpdate'
                update_params = {'JobName': params['Name'], 'JobUpdate': copy.deepcopy(params)}
                del update_params['JobUpdate']['Name']
                connection.update_job(**update_params)
                changed = True
            except (BotoCoreError, ClientError) as e:
                module.fail_json_aws(e)
    else:
        try:
            connection.create_job(**params)
            changed = True
        except (BotoCoreError, ClientError) as e:
            module.fail_json_aws(e)

    # If changed, get the Glue job again
    if changed:
        glue_job = _get_glue_job(connection, module, params['Name'])

    module.exit_json(changed=changed, **camel_dict_to_snake_dict(glue_job))


def delete_glue_job(connection, module, glue_job):
    """
    Delete an AWS Glue job

    Does nothing (changed=False) when glue_job is empty. Always ends the module run
    through module.exit_json() or module.fail_json_aws().

    :param connection: AWS boto3 glue connection
    :param module: Ansible module
    :param glue_job: a dict of AWS Glue job parameters or None
    :return:
    """

    changed = False

    if glue_job:
        try:
            connection.delete_job(JobName=glue_job['Name'])
            changed = True
        except (BotoCoreError, ClientError) as e:
            module.fail_json_aws(e)

    module.exit_json(changed=changed)


def main():
    """
    Module entry point: parse the options, look up the job and apply the requested state.
    """

    argument_spec = (
        dict(
            allocated_capacity=dict(type='int'),
            command_name=dict(type='str', default='glueetl'),
            command_script_location=dict(type='str'),
            connections=dict(type='list'),
            default_arguments=dict(type='dict'),
            description=dict(type='str'),
            max_concurrent_runs=dict(type='int'),
            max_retries=dict(type='int'),
            name=dict(required=True, type='str'),
            role=dict(type='str'),
            state=dict(required=True, choices=['present', 'absent'], type='str'),
            timeout=dict(type='int')
        )
    )

    # role and command_script_location are only needed to create or update a job
    module = AnsibleAWSModule(argument_spec=argument_spec,
                              required_if=[
                                  ('state', 'present', ['role', 'command_script_location'])
                              ]
                              )

    connection = module.client('glue')

    state = module.params.get("state")

    glue_job = _get_glue_job(connection, module, module.params.get("name"))

    if state == 'present':
        create_or_update_glue_job(connection, module, glue_job)
    else:
        delete_glue_job(connection, module, glue_job)


if __name__ == '__main__':
    main()