From ef65d3dbf6304adf92cdceaec17152d812ebd3a7 Mon Sep 17 00:00:00 2001
From: Marcus Cobden <mcobden@cisco.com>
Date: Sat, 9 Aug 2014 23:47:08 +0100
Subject: [PATCH 1/5] Optimise string handling in ansible.utils._clean_data

---
 lib/ansible/utils/__init__.py | 98 ++++++++++++++++-------------------
 1 file changed, 46 insertions(+), 52 deletions(-)

diff --git a/lib/ansible/utils/__init__.py b/lib/ansible/utils/__init__.py
index 5da39acab7..9a861c86ea 100644
--- a/lib/ansible/utils/__init__.py
+++ b/lib/ansible/utils/__init__.py
@@ -33,7 +33,7 @@ from ansible.module_utils.splitter import split_args, unquote
 import ansible.constants as C
 import ast
 import time
-import StringIO
+import cStringIO
 import stat
 import termios
 import tty
@@ -46,6 +46,7 @@ import getpass
 import sys
 import json
 import subprocess
+import contextlib
 
 from vault import VaultLib
 
@@ -55,7 +56,9 @@ MAX_FILE_SIZE_FOR_DIFF=1*1024*1024
 
 # caching the compilation of the regex used
 # to check for lookup calls within data
-LOOKUP_REGEX=re.compile(r'lookup\s*\(')
+LOOKUP_REGEX = re.compile(r'lookup\s*\(')
+PRINT_CODE_REGEX = re.compile(r'(?:{[{%]|[%}]})')
+CODE_REGEX = re.compile(r'(?:{%|%})')
 
 try:
     import json
@@ -355,64 +358,55 @@ def _clean_data(orig_data, from_remote=False, from_inventory=False):
     if not isinstance(orig_data, basestring):
         return orig_data
 
-    data = StringIO.StringIO("")
-
     # when the data is marked as having come from a remote, we always
     # replace any print blocks (ie. {{var}}), however when marked as coming
     # from inventory we only replace print blocks that contain a call to
     # a lookup plugin (ie. {{lookup('foo','bar'))}})
     replace_prints = from_remote or (from_inventory and '{{' in orig_data and LOOKUP_REGEX.search(orig_data) is not None)
 
-    # these variables keep track of opening block locations, as we only
-    # want to replace matched pairs of print/block tags
-    print_openings = []
-    block_openings = []
+    regex = PRINT_CODE_REGEX if replace_prints else ONLY_CODE_REGEX
 
-    for idx,c in enumerate(orig_data):
-        # if the current character is an opening brace, check to
-        # see if this is a jinja2 token. Otherwise, if the current
-        # character is a closing brace, we backup one character to
-        # see if we have a closing.
-        if c == '{' and idx < len(orig_data) - 1:
-            token = orig_data[idx:idx+2]
-            # if so, and we want to replace this block, push
-            # this token's location onto the appropriate array
-            if token == '{{' and replace_prints:
-                print_openings.append(idx)
-            elif token == '{%':
-                block_openings.append(idx)
-            # finally we write the data to the buffer and write
-            data.seek(0, os.SEEK_END)
-            data.write(c)
-        elif c == '}' and idx > 0:
-            token = orig_data[idx-1:idx+1]
-            prev_idx = -1
-            if token == '%}' and len(block_openings) > 0:
-                prev_idx = block_openings.pop()
-            elif token == '}}' and len(print_openings) > 0:
-                prev_idx = print_openings.pop()
-            # if we have a closing token, and we have previously found
-            # the opening to the same kind of block represented by this
-            # token, replace both occurrences, otherwise we just write
-            # the current character to the buffer
-            if prev_idx != -1:
-                # replace the opening
-                data.seek(prev_idx, os.SEEK_SET)
-                data.write('{#')
-                # replace the closing
-                data.seek(-1, os.SEEK_END)
-                data.write('#}')
+    with contextlib.closing(cStringIO.StringIO()) as data:
+        # these variables keep track of opening block locations, as we only
+        # want to replace matched pairs of print/block tags
+        last_pos = 0
+        print_openings = []
+        block_openings = []
+        for mo in regex.finditer(orig_data):
+            token = mo.group(0)
+            token_start = mo.start(0)
+            token_end = mo.end(0)
+
+            if token[0] == '{':
+                if token == '{%':
+                    block_openings.append(token_start)
+                elif token == '{{':
+                    print_openings.append(token_start)
+                data.write(orig_data[last_pos:token_end])
+            elif token[1] == '}':
+                prev_idx = None
+                if token == '%}' and block_openings:
+                    prev_idx = block_openings.pop()
+                elif token == '}}' and print_openings:
+                    prev_idx = print_openings.pop()
+
+                data.write(orig_data[last_pos:token_start])
+                if prev_idx is not None:
+                    # replace the opening
+                    data.seek(prev_idx, os.SEEK_SET)
+                    data.write('{#')
+                    # replace the closing
+                    data.seek(0, os.SEEK_END)
+                    data.write('#}')
+                else:
+                    data.write(token)
             else:
-                data.seek(0, os.SEEK_END)
-                data.write(c)
-        else:
-            # not a jinja2 token, so we just write the current char
-            # to the output buffer
-            data.seek(0, os.SEEK_END)
-            data.write(c)
-    return_data = data.getvalue()
-    data.close()
-    return return_data
+                assert False, 'Unhandled regex match'
+            last_pos = token_end
+
+        data.write(orig_data[last_pos:])
+
+        return data.getvalue()
 
 def _clean_data_struct(orig_data, from_remote=False, from_inventory=False):
     '''

From 84114e5c0b9badfda59e889649aaeb3d15e4ce20 Mon Sep 17 00:00:00 2001
From: Marcus Cobden <mcobden@cisco.com>
Date: Mon, 11 Aug 2014 08:43:40 +0100
Subject: [PATCH 2/5] Fix copy/paste error

---
 lib/ansible/utils/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/ansible/utils/__init__.py b/lib/ansible/utils/__init__.py
index 9a861c86ea..44207134bc 100644
--- a/lib/ansible/utils/__init__.py
+++ b/lib/ansible/utils/__init__.py
@@ -364,7 +364,7 @@ def _clean_data(orig_data, from_remote=False, from_inventory=False):
     # a lookup plugin (ie. {{lookup('foo','bar'))}})
     replace_prints = from_remote or (from_inventory and '{{' in orig_data and LOOKUP_REGEX.search(orig_data) is not None)
 
-    regex = PRINT_CODE_REGEX if replace_prints else ONLY_CODE_REGEX
+    regex = PRINT_CODE_REGEX if replace_prints else CODE_REGEX
 
     with contextlib.closing(cStringIO.StringIO()) as data:
         # these variables keep track of opening block locations, as we only

From e3dbca937860e070c556d510f60448f30281886d Mon Sep 17 00:00:00 2001
From: Marcus Cobden <mcobden@cisco.com>
Date: Mon, 11 Aug 2014 08:46:47 +0100
Subject: [PATCH 3/5] Test escaping strings with two variables

---
 test/units/TestUtils.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/test/units/TestUtils.py b/test/units/TestUtils.py
index b091603626..a33e2080e7 100644
--- a/test/units/TestUtils.py
+++ b/test/units/TestUtils.py
@@ -729,6 +729,10 @@ class TestUtils(unittest.TestCase):
             ansible.utils._clean_data('this string has a {{variable}}', from_remote=True),
             'this string has a {#variable#}'
         )
+        self.assertEqual(
+            ansible.utils._clean_data('this string {{has}} two {{variables}} in it', from_remote=True),
+            'this string {#has#} two {#variables#} in it'
+        )
         self.assertEqual(
             ansible.utils._clean_data('this string has a {{variable with a\nnewline}}', from_remote=True),
             'this string has a {#variable with a\nnewline#}'

From c8bfd157f87e804ae69e2dbb3b0b56ee647b94eb Mon Sep 17 00:00:00 2001
From: Marcus Cobden <mcobden@cisco.com>
Date: Mon, 11 Aug 2014 08:48:37 +0100
Subject: [PATCH 4/5] Switch cStringIO to StringIO for unicode support

The performance difference isn't too bad
---
 lib/ansible/utils/__init__.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/ansible/utils/__init__.py b/lib/ansible/utils/__init__.py
index 44207134bc..cd11318f9c 100644
--- a/lib/ansible/utils/__init__.py
+++ b/lib/ansible/utils/__init__.py
@@ -33,7 +33,7 @@ from ansible.module_utils.splitter import split_args, unquote
 import ansible.constants as C
 import ast
 import time
-import cStringIO
+import StringIO
 import stat
 import termios
 import tty
@@ -366,7 +366,7 @@ def _clean_data(orig_data, from_remote=False, from_inventory=False):
 
     regex = PRINT_CODE_REGEX if replace_prints else CODE_REGEX
 
-    with contextlib.closing(cStringIO.StringIO()) as data:
+    with contextlib.closing(StringIO.StringIO()) as data:
         # these variables keep track of opening block locations, as we only
         # want to replace matched pairs of print/block tags
         last_pos = 0

From c47d1f526584722729b3cd31c556f3b8928bfc65 Mon Sep 17 00:00:00 2001
From: Marcus Cobden <mcobden@cisco.com>
Date: Mon, 11 Aug 2014 09:07:21 +0100
Subject: [PATCH 5/5] Pre-load whole string and use seek to alter tags

---
 lib/ansible/utils/__init__.py | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

diff --git a/lib/ansible/utils/__init__.py b/lib/ansible/utils/__init__.py
index cd11318f9c..ab1493d023 100644
--- a/lib/ansible/utils/__init__.py
+++ b/lib/ansible/utils/__init__.py
@@ -366,23 +366,21 @@ def _clean_data(orig_data, from_remote=False, from_inventory=False):
 
     regex = PRINT_CODE_REGEX if replace_prints else CODE_REGEX
 
-    with contextlib.closing(StringIO.StringIO()) as data:
+    with contextlib.closing(StringIO.StringIO(orig_data)) as data:
         # these variables keep track of opening block locations, as we only
         # want to replace matched pairs of print/block tags
-        last_pos = 0
         print_openings = []
         block_openings = []
         for mo in regex.finditer(orig_data):
             token = mo.group(0)
             token_start = mo.start(0)
-            token_end = mo.end(0)
 
             if token[0] == '{':
                 if token == '{%':
                     block_openings.append(token_start)
                 elif token == '{{':
                     print_openings.append(token_start)
-                data.write(orig_data[last_pos:token_end])
+
             elif token[1] == '}':
                 prev_idx = None
                 if token == '%}' and block_openings:
@@ -390,21 +388,16 @@ def _clean_data(orig_data, from_remote=False, from_inventory=False):
                 elif token == '}}' and print_openings:
                     prev_idx = print_openings.pop()
 
-                data.write(orig_data[last_pos:token_start])
                 if prev_idx is not None:
                     # replace the opening
                     data.seek(prev_idx, os.SEEK_SET)
                     data.write('{#')
                     # replace the closing
-                    data.seek(0, os.SEEK_END)
+                    data.seek(token_start, os.SEEK_SET)
                     data.write('#}')
-                else:
-                    data.write(token)
+
             else:
                 assert False, 'Unhandled regex match'
-            last_pos = token_end
-
-        data.write(orig_data[last_pos:])
 
         return data.getvalue()