diff --git a/.github/BOTMETA.yml b/.github/BOTMETA.yml index 3dfca22e73..df2520e263 100644 --- a/.github/BOTMETA.yml +++ b/.github/BOTMETA.yml @@ -131,6 +131,8 @@ files: $filters/random_mac.py: {} $filters/time.py: maintainers: resmo + $filters/unicode_normalize.py: + maintainers: Ajpantuso $filters/version_sort.py: maintainers: ericzolf $inventories/: diff --git a/changelogs/fragments/3359-add-unicode_normalize-filter.yml b/changelogs/fragments/3359-add-unicode_normalize-filter.yml new file mode 100644 index 0000000000..33aa06dc92 --- /dev/null +++ b/changelogs/fragments/3359-add-unicode_normalize-filter.yml @@ -0,0 +1,4 @@ +--- +add plugin.filter: + - name: unicode_normalize + description: Normalizes unicode strings to facilitate comparison of characters with normalized forms diff --git a/docs/docsite/rst/filter_guide.rst b/docs/docsite/rst/filter_guide.rst index 201b275aae..dab8464439 100644 --- a/docs/docsite/rst/filter_guide.rst +++ b/docs/docsite/rst/filter_guide.rst @@ -751,3 +751,34 @@ To extract ports from all clusters with name containing 'server1': server_name_query: "domain.server[?contains(name,'server1')].port" .. note:: while using ``starts_with`` and ``contains``, you have to use `` to_json | from_json `` filter for correct parsing of data structure. + +Working with Unicode +--------------------- + +`Unicode <https://unicode.org/main.html>`_ makes it possible to produce two strings which may be visually equivalent, but are composed of distinctly different characters/character sequences. To address this, ``Unicode`` defines `normalization forms <https://unicode.org/reports/tr15/>`_ which avoid these distinctions by choosing a unique character sequence for a given visual representation. + +You can use the ``community.general.unicode_normalize`` filter to normalize ``Unicode`` strings within your playbooks. + +.. 
code-block:: yaml+jinja + + - name: Compare Unicode representations + debug: + msg: "{{ with_combining_character | community.general.unicode_normalize == without_combining_character }}" + vars: + with_combining_character: "{{ 'Mayagu\u0308ez' }}" + without_combining_character: Mayagüez + +This produces: + +.. code-block:: ansible-output + + TASK [Compare Unicode representations] ******************************************************** + ok: [localhost] => { + "msg": true + } + +The ``community.general.unicode_normalize`` filter accepts a keyword argument to select the ``Unicode`` form used to normalize the input string. + +:form: One of ``'NFC'`` (default), ``'NFD'``, ``'NFKC'``, or ``'NFKD'``. See the `Unicode reference <https://unicode.org/reports/tr15/>`_ for more information. + +.. versionadded:: 3.7.0
diff --git a/plugins/filter/unicode_normalize.py b/plugins/filter/unicode_normalize.py
new file mode 100644
index 0000000000..9afbf29e3f
--- /dev/null
+++ b/plugins/filter/unicode_normalize.py
@@ -0,0 +1,56 @@
+# -*- coding: utf-8 -*-
+
+# Copyright: (c) 2021, Andrew Pantuso (@ajpantuso)
+# GNU General Public License v3.0+ (see COPYING or https://www.gnu.org/licenses/gpl-3.0.txt)
+
+from __future__ import absolute_import, division, print_function
+__metaclass__ = type
+
+from unicodedata import normalize
+
+from ansible.errors import AnsibleFilterError
+from ansible.module_utils.six import text_type
+
+try:
+    from ansible.errors import AnsibleFilterTypeError
+except ImportError:
+    # NOTE(review): older Ansible releases (2.9) appear to lack
+    # AnsibleFilterTypeError - confirm; fall back so the plugin still loads.
+    from ansible.errors import AnsibleFilterError as AnsibleFilterTypeError
+
+
+def unicode_normalize(data, form='NFC'):
+    """Applies normalization to 'unicode' strings.
+
+    Args:
+        data: A unicode string piped into the Jinja filter
+        form: One of ('NFC', 'NFD', 'NFKC', 'NFKD').
+            See https://docs.python.org/3/library/unicodedata.html#unicodedata.normalize for more information.
+
+    Returns:
+        A normalized unicode string of the specified 'form'.
+
+    Raises:
+        AnsibleFilterTypeError: If 'data' is not a text (unicode) string.
+        AnsibleFilterError: If 'form' is not one of the supported forms.
+    """
+
+    if not isinstance(data, text_type):
+        raise AnsibleFilterTypeError("%s is not a valid input type" % type(data))
+
+    if form not in ('NFC', 'NFD', 'NFKC', 'NFKD'):
+        # Wrap in a 1-tuple so a tuple-valued 'form' is reported in the message
+        # instead of breaking the %-formatting itself with a TypeError.
+        raise AnsibleFilterError("%s is not a valid form" % (form,))
+
+    return normalize(form, data)
+
+
+class FilterModule(object):
+    """Registers the unicode_normalize filter."""
+
+    def filters(self):
+        """Returns the mapping of filter names to their implementations."""
+        return {
+            'unicode_normalize': unicode_normalize,
+        }
diff --git a/tests/integration/targets/filter_unicode_normalize/aliases b/tests/integration/targets/filter_unicode_normalize/aliases new file mode 100644 index 0000000000..f04737b845 --- /dev/null +++ b/tests/integration/targets/filter_unicode_normalize/aliases @@ -0,0 +1,2 @@ +shippable/posix/group2 +skip/python2.6 # filters are controller only, and we no longer support Python 2.6 on the controller diff --git a/tests/integration/targets/filter_unicode_normalize/tasks/main.yml b/tests/integration/targets/filter_unicode_normalize/tasks/main.yml new file mode 100644 index 0000000000..948ca74b4b --- /dev/null +++ b/tests/integration/targets/filter_unicode_normalize/tasks/main.yml @@ -0,0 +1,39 @@ +#################################################################### +# WARNING: These are designed specifically for Ansible tests # +# and should not be used as examples of how to write Ansible roles # +#################################################################### + +- name: Test 'NFC' normalization + assert: + that: + - u_umlaut != u_umlaut_combining + - u_umlaut_combining != (u_umlaut_combining | community.general.unicode_normalize) + - u_umlaut == (u_umlaut_combining | community.general.unicode_normalize) + +- name: Test 'NFKC' normalization + assert: + that: + - latin_capital_i != roman_numeral_one + - latin_capital_i == (roman_numeral_one | community.general.unicode_normalize(form='NFKC')) + +- name: Register invalid input type + debug: + msg: "{{ 1 | community.general.unicode_normalize }}" + ignore_errors: true + register: invalid_input_type + +- name: Assert an invalid input type causes 
failure + assert: + that: + - invalid_input_type is failed + +- name: Register invalid form selection + debug: + msg: "{{ 'arbitrary text' | community.general.unicode_normalize(form='invalid') }}" + ignore_errors: true + register: invalid_form_selection + +- name: Assert invalid form selection causes failure + assert: + that: + - invalid_form_selection is failed diff --git a/tests/integration/targets/filter_unicode_normalize/vars/main.yml b/tests/integration/targets/filter_unicode_normalize/vars/main.yml new file mode 100644 index 0000000000..88d19b20db --- /dev/null +++ b/tests/integration/targets/filter_unicode_normalize/vars/main.yml @@ -0,0 +1,4 @@ +u_umlaut: "{{ '\u00fc' }}" +u_umlaut_combining: "{{ 'u' + '\u0308' }}" +roman_numeral_one: "{{ '\u2160' }}" +latin_capital_i: "{{ '\u0049' }}"