From ef65d3dbf6304adf92cdceaec17152d812ebd3a7 Mon Sep 17 00:00:00 2001 From: Marcus Cobden Date: Sat, 9 Aug 2014 23:47:08 +0100 Subject: [PATCH 1/5] Optimise string handling in ansible.utils._clean_data --- lib/ansible/utils/__init__.py | 98 ++++++++++++++++------------------- 1 file changed, 46 insertions(+), 52 deletions(-) diff --git a/lib/ansible/utils/__init__.py b/lib/ansible/utils/__init__.py index 5da39acab7..9a861c86ea 100644 --- a/lib/ansible/utils/__init__.py +++ b/lib/ansible/utils/__init__.py @@ -33,7 +33,7 @@ from ansible.module_utils.splitter import split_args, unquote import ansible.constants as C import ast import time -import StringIO +import cStringIO import stat import termios import tty @@ -46,6 +46,7 @@ import getpass import sys import json import subprocess +import contextlib from vault import VaultLib @@ -55,7 +56,9 @@ MAX_FILE_SIZE_FOR_DIFF=1*1024*1024 # caching the compilation of the regex used # to check for lookup calls within data -LOOKUP_REGEX=re.compile(r'lookup\s*\(') +LOOKUP_REGEX = re.compile(r'lookup\s*\(') +PRINT_CODE_REGEX = re.compile(r'(?:{[{%]|[%}]})') +CODE_REGEX = re.compile(r'(?:{%|%})') try: import json @@ -355,64 +358,55 @@ def _clean_data(orig_data, from_remote=False, from_inventory=False): if not isinstance(orig_data, basestring): return orig_data - data = StringIO.StringIO("") - # when the data is marked as having come from a remote, we always # replace any print blocks (ie. {{var}}), however when marked as coming # from inventory we only replace print blocks that contain a call to # a lookup plugin (ie. {{lookup('foo','bar'))}}) replace_prints = from_remote or (from_inventory and '{{' in orig_data and LOOKUP_REGEX.search(orig_data) is not None) - # these variables keep track of opening block locations, as we only - # want to replace matched pairs of print/block tags - print_openings = [] - block_openings = [] + regex = PRINT_CODE_REGEX if replace_prints else ONLY_CODE_REGEX - for idx,c in enumerate(orig_data): - # if the current character is an opening brace, check to - # see if this is a jinja2 token. Otherwise, if the current - # character is a closing brace, we backup one character to - # see if we have a closing. - if c == '{' and idx < len(orig_data) - 1: - token = orig_data[idx:idx+2] - # if so, and we want to replace this block, push - # this token's location onto the appropriate array - if token == '{{' and replace_prints: - print_openings.append(idx) - elif token == '{%': - block_openings.append(idx) - # finally we write the data to the buffer and write - data.seek(0, os.SEEK_END) - data.write(c) - elif c == '}' and idx > 0: - token = orig_data[idx-1:idx+1] - prev_idx = -1 - if token == '%}' and len(block_openings) > 0: - prev_idx = block_openings.pop() - elif token == '}}' and len(print_openings) > 0: - prev_idx = print_openings.pop() - # if we have a closing token, and we have previously found - # the opening to the same kind of block represented by this - # token, replace both occurrences, otherwise we just write - # the current character to the buffer - if prev_idx != -1: - # replace the opening - data.seek(prev_idx, os.SEEK_SET) - data.write('{#') - # replace the closing - data.seek(-1, os.SEEK_END) - data.write('#}') + with contextlib.closing(cStringIO.StringIO()) as data: + # these variables keep track of opening block locations, as we only + # want to replace matched pairs of print/block tags + last_pos = 0 + print_openings = [] + block_openings = [] + for mo in regex.finditer(orig_data): + token = mo.group(0) + token_start = mo.start(0) + token_end = mo.end(0) + + if token[0] == '{': + if token == '{%': + block_openings.append(token_start) + elif token == '{{': + print_openings.append(token_start) + data.write(orig_data[last_pos:token_end]) + elif token[1] == '}': + prev_idx = None + if token == '%}' and block_openings: + prev_idx = block_openings.pop() + elif token == '}}' and print_openings: + prev_idx = print_openings.pop() + + data.write(orig_data[last_pos:token_start]) + if prev_idx is not None: + # replace the opening + data.seek(prev_idx, os.SEEK_SET) + data.write('{#') + # replace the closing + data.seek(0, os.SEEK_END) + data.write('#}') + else: + data.write(token) else: - data.seek(0, os.SEEK_END) - data.write(c) - else: - # not a jinja2 token, so we just write the current char - # to the output buffer - data.seek(0, os.SEEK_END) - data.write(c) - return_data = data.getvalue() - data.close() - return return_data + assert False, 'Unhandled regex match' + last_pos = token_end + + data.write(orig_data[last_pos:]) + + return data.getvalue() def _clean_data_struct(orig_data, from_remote=False, from_inventory=False): ''' From 84114e5c0b9badfda59e889649aaeb3d15e4ce20 Mon Sep 17 00:00:00 2001 From: Marcus Cobden Date: Mon, 11 Aug 2014 08:43:40 +0100 Subject: [PATCH 2/5] Fix copy/paste error --- lib/ansible/utils/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/ansible/utils/__init__.py b/lib/ansible/utils/__init__.py index 9a861c86ea..44207134bc 100644 --- a/lib/ansible/utils/__init__.py +++ b/lib/ansible/utils/__init__.py @@ -364,7 +364,7 @@ def _clean_data(orig_data, from_remote=False, from_inventory=False): # a lookup plugin (ie. {{lookup('foo','bar'))}}) replace_prints = from_remote or (from_inventory and '{{' in orig_data and LOOKUP_REGEX.search(orig_data) is not None) - regex = PRINT_CODE_REGEX if replace_prints else ONLY_CODE_REGEX + regex = PRINT_CODE_REGEX if replace_prints else CODE_REGEX with contextlib.closing(cStringIO.StringIO()) as data: # these variables keep track of opening block locations, as we only From e3dbca937860e070c556d510f60448f30281886d Mon Sep 17 00:00:00 2001 From: Marcus Cobden Date: Mon, 11 Aug 2014 08:46:47 +0100 Subject: [PATCH 3/5] Test escaping strings with two variables --- test/units/TestUtils.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/units/TestUtils.py b/test/units/TestUtils.py index b091603626..a33e2080e7 100644 --- a/test/units/TestUtils.py +++ b/test/units/TestUtils.py @@ -729,6 +729,10 @@ class TestUtils(unittest.TestCase): ansible.utils._clean_data('this string has a {{variable}}', from_remote=True), 'this string has a {#variable#}' ) + self.assertEqual( + ansible.utils._clean_data('this string {{has}} two {{variables}} in it', from_remote=True), + 'this string {#has#} two {#variables#} in it' + ) self.assertEqual( ansible.utils._clean_data('this string has a {{variable with a\nnewline}}', from_remote=True), 'this string has a {#variable with a\nnewline#}' From c8bfd157f87e804ae69e2dbb3b0b56ee647b94eb Mon Sep 17 00:00:00 2001 From: Marcus Cobden Date: Mon, 11 Aug 2014 08:48:37 +0100 Subject: [PATCH 4/5] Switch cStringIO to StringIO for unicode support The performance difference isn't too bad --- lib/ansible/utils/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/ansible/utils/__init__.py b/lib/ansible/utils/__init__.py index 44207134bc..cd11318f9c 100644 --- a/lib/ansible/utils/__init__.py +++ b/lib/ansible/utils/__init__.py @@ -33,7 +33,7 @@ from ansible.module_utils.splitter import split_args, unquote import ansible.constants as C import ast import time -import cStringIO +import StringIO import stat import termios import tty @@ -366,7 +366,7 @@ def _clean_data(orig_data, from_remote=False, from_inventory=False): regex = PRINT_CODE_REGEX if replace_prints else CODE_REGEX - with contextlib.closing(cStringIO.StringIO()) as data: + with contextlib.closing(StringIO.StringIO()) as data: # these variables keep track of opening block locations, as we only # want to replace matched pairs of print/block tags last_pos = 0 From c47d1f526584722729b3cd31c556f3b8928bfc65 Mon Sep 17 00:00:00 2001 From: Marcus Cobden Date: Mon, 11 Aug 2014 09:07:21 +0100 Subject: [PATCH 5/5] Pre-load whole string and use seek to alter tags --- lib/ansible/utils/__init__.py | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/lib/ansible/utils/__init__.py b/lib/ansible/utils/__init__.py index cd11318f9c..ab1493d023 100644 --- a/lib/ansible/utils/__init__.py +++ b/lib/ansible/utils/__init__.py @@ -366,23 +366,21 @@ def _clean_data(orig_data, from_remote=False, from_inventory=False): regex = PRINT_CODE_REGEX if replace_prints else CODE_REGEX - with contextlib.closing(StringIO.StringIO()) as data: + with contextlib.closing(StringIO.StringIO(orig_data)) as data: # these variables keep track of opening block locations, as we only # want to replace matched pairs of print/block tags - last_pos = 0 print_openings = [] block_openings = [] for mo in regex.finditer(orig_data): token = mo.group(0) token_start = mo.start(0) - token_end = mo.end(0) if token[0] == '{': if token == '{%': block_openings.append(token_start) elif token == '{{': print_openings.append(token_start) - data.write(orig_data[last_pos:token_end]) + elif token[1] == '}': prev_idx = None if token == '%}' and block_openings: @@ -390,21 +388,16 @@ def _clean_data(orig_data, from_remote=False, from_inventory=False): elif token == '}}' and print_openings: prev_idx = print_openings.pop() - data.write(orig_data[last_pos:token_start]) if prev_idx is not None: # replace the opening data.seek(prev_idx, os.SEEK_SET) data.write('{#') # replace the closing - data.seek(0, os.SEEK_END) + data.seek(token_start, os.SEEK_SET) data.write('#}') - else: - data.write(token) + else: assert False, 'Unhandled regex match' - last_pos = token_end - - data.write(orig_data[last_pos:]) return data.getvalue()