New filter plugin - unicode_normalize (#3359)

* Initial commit * Adding maintainer in BOTMETA * Adding changelog fragment * Updating filter_guide * Applying initial review suggestions
2024-09-14 20:13:21 +02:00 · 2021-09-12 07:46:53 -04:00 · 2021-09-12 07:46:53 -04:00 · 29e4066944
commit 29e4066944
parent 612543919e
7 changed files with 122 additions and 0 deletions
--- a/.github/BOTMETA.yml
+++ b/.github/BOTMETA.yml
@ -131,6 +131,8 @@ files:
  $filters/random_mac.py: {}
  $filters/time.py:
    maintainers: resmo
+  $filters/unicode_normalize.py:
+    maintainers: Ajpantuso
  $filters/version_sort.py:
    maintainers: ericzolf
  $inventories/:
--- a/changelogs/fragments/3359-add-unicode_normalize-filter.yml
+++ b/changelogs/fragments/3359-add-unicode_normalize-filter.yml
@ -0,0 +1,4 @@
+---
+add plugin.filter:
+  - name: unicode_normalize
+    description: Normalizes unicode strings to facilitate comparison of characters with normalized forms
--- a/docs/docsite/rst/filter_guide.rst
+++ b/docs/docsite/rst/filter_guide.rst
@ -751,3 +751,34 @@ To extract ports from all clusters with name containing 'server1':
        server_name_query: "domain.server[?contains(name,'server1')].port"

 .. note:: While using ``starts_with`` and ``contains``, you have to use the ``to_json | from_json`` filter for correct parsing of the data structure.
+
+Working with Unicode
+---------------------
+
+`Unicode <https://unicode.org/main.html>`_ makes it possible to produce two strings which may be visually equivalent, but are composed of distinctly different characters/character sequences. To address this, ``Unicode`` defines `normalization forms <https://unicode.org/reports/tr15/>`_ which avoid these distinctions by choosing a unique character sequence for a given visual representation.
+
+You can use the ``community.general.unicode_normalize`` filter to normalize ``Unicode`` strings within your playbooks.
+
+.. code-block:: yaml+jinja
+
+    - name: Compare Unicode representations
+      debug:
+        msg: "{{ with_combining_character | community.general.unicode_normalize == without_combining_character }}"
+      vars:
+        with_combining_character: "{{ 'Mayagu\u0308ez' }}"
+        without_combining_character: Mayagüez
+
+This produces:
+
+.. code-block:: ansible-output
+
+    TASK [Compare Unicode representations] ********************************************************
+    ok: [localhost] => {
+        "msg": true
+    }
+
+The ``community.general.unicode_normalize`` filter accepts a keyword argument to select the ``Unicode`` form used to normalize the input string.
+
+:form: One of ``'NFC'`` (default), ``'NFD'``, ``'NFKC'``, or ``'NFKD'``. See the `Unicode reference <https://unicode.org/reports/tr15/>`_ for more information.
+
+.. versionadded:: 3.7.0
--- a/plugins/filter/unicode_normalize.py
+++ b/plugins/filter/unicode_normalize.py
@ -0,0 +1,40 @@
+# -*- coding: utf-8 -*-
+
+# Copyright: (c) 2021, Andrew Pantuso (@ajpantuso) <ajpantuso@gmail.com>
+# GNU General Public License v3.0+ (see COPYING or https://www.gnu.org/licenses/gpl-3.0.txt)
+
+from __future__ import absolute_import, division, print_function
+__metaclass__ = type
+
+from unicodedata import normalize
+
+from ansible.errors import AnsibleFilterError, AnsibleFilterTypeError
+from ansible.module_utils.six import text_type
+
+
+def unicode_normalize(data, form='NFC'):
+    """Applies normalization to 'unicode' strings.
+
+    Args:
+        data: A unicode string piped into the Jinja filter
+        form: One of ('NFC', 'NFD', 'NFKC', 'NFKD').
+              See https://docs.python.org/3/library/unicodedata.html#unicodedata.normalize for more information.
+
+    Returns:
+        A normalized unicode string of the specified 'form'.
+    """
+
+    if not isinstance(data, text_type):
+        raise AnsibleFilterTypeError("%s is not a valid input type" % type(data))
+
+    if form not in ('NFC', 'NFD', 'NFKC', 'NFKD'):
+        raise AnsibleFilterError("%s is not a valid form" % form)
+
+    return normalize(form, data)
+
+
+class FilterModule(object):
+    def filters(self):
+        return {
+            'unicode_normalize': unicode_normalize,
+        }
--- a/tests/integration/targets/filter_unicode_normalize/aliases
+++ b/tests/integration/targets/filter_unicode_normalize/aliases
@ -0,0 +1,2 @@
+shippable/posix/group2
+skip/python2.6  # filters are controller only, and we no longer support Python 2.6 on the controller
--- a/tests/integration/targets/filter_unicode_normalize/tasks/main.yml
+++ b/tests/integration/targets/filter_unicode_normalize/tasks/main.yml
@ -0,0 +1,39 @@
+####################################################################
+# WARNING: These are designed specifically for Ansible tests       #
+# and should not be used as examples of how to write Ansible roles #
+####################################################################
+
+- name: Test 'NFC' normalization
+  assert:
+    that:
+      - u_umlaut != u_umlaut_combining
+      - u_umlaut_combining != (u_umlaut_combining | community.general.unicode_normalize)
+      - u_umlaut == (u_umlaut_combining | community.general.unicode_normalize)
+
+- name: Test 'NFKC' normalization
+  assert:
+    that:
+      - latin_capital_i != roman_numeral_one
+      - latin_capital_i == (roman_numeral_one | community.general.unicode_normalize(form='NFKC'))
+
+- name: Register invalid input type
+  debug:
+    msg: "{{ 1 | community.general.unicode_normalize }}"
+  ignore_errors: true
+  register: invalid_input_type
+
+- name: Assert an invalid input type causes failure
+  assert:
+    that:
+      - invalid_input_type is failed
+
+- name: Register invalid form selection
+  debug:
+    msg: "{{ 'arbitrary text' | community.general.unicode_normalize(form='invalid') }}"
+  ignore_errors: true
+  register: invalid_form_selection
+
+- name: Assert invalid form selection causes failure
+  assert:
+    that:
+      - invalid_form_selection is failed
--- a/tests/integration/targets/filter_unicode_normalize/vars/main.yml
+++ b/tests/integration/targets/filter_unicode_normalize/vars/main.yml
@ -0,0 +1,4 @@
+u_umlaut: "{{ '\u00fc' }}"
+u_umlaut_combining: "{{ 'u' + '\u0308' }}"
+roman_numeral_one: "{{ '\u2160' }}"
+latin_capital_i: "{{ '\u0049' }}"