mirror of
https://github.com/ansible-collections/community.general.git
synced 2024-09-14 20:13:21 +02:00
New filter plugin - unicode_normalization (#3359)
* Initial commit * Adding maintainer in BOTMETA * Adding changelog fragment * Updating filter_guide * Applying initial review suggestions
This commit is contained in:
parent
612543919e
commit
29e4066944
7 changed files with 122 additions and 0 deletions
2
.github/BOTMETA.yml
vendored
2
.github/BOTMETA.yml
vendored
|
@ -131,6 +131,8 @@ files:
|
|||
$filters/random_mac.py: {}
|
||||
$filters/time.py:
|
||||
maintainers: resmo
|
||||
$filters/unicode_normalize.py:
|
||||
maintainers: Ajpantuso
|
||||
$filters/version_sort.py:
|
||||
maintainers: ericzolf
|
||||
$inventories/:
|
||||
|
|
|
@ -0,0 +1,4 @@
|
|||
---
|
||||
add plugin.filter:
|
||||
- name: unicode_normalize
|
||||
description: Normalizes unicode strings to facilitate comparison of characters with normalized forms
|
|
@ -751,3 +751,34 @@ To extract ports from all clusters with name containing 'server1':
|
|||
server_name_query: "domain.server[?contains(name,'server1')].port"
|
||||
|
||||
.. note:: While using ``starts_with`` and ``contains``, you have to use the ``to_json | from_json`` filter for correct parsing of the data structure.
|
||||
|
||||
Working with Unicode
|
||||
---------------------
|
||||
|
||||
`Unicode <https://unicode.org/main.html>`_ makes it possible to produce two strings which may be visually equivalent, but are composed of distinctly different characters/character sequences. To address this, ``Unicode`` defines `normalization forms <https://unicode.org/reports/tr15/>`_ which avoid these distinctions by choosing a unique character sequence for a given visual representation.
|
||||
|
||||
You can use the ``community.general.unicode_normalize`` filter to normalize ``Unicode`` strings within your playbooks.
|
||||
|
||||
.. code-block:: yaml+jinja
|
||||
|
||||
- name: Compare Unicode representations
|
||||
debug:
|
||||
msg: "{{ with_combining_character | community.general.unicode_normalize == without_combining_character }}"
|
||||
vars:
|
||||
with_combining_character: "{{ 'Mayagu\u0308ez' }}"
|
||||
without_combining_character: Mayagüez
|
||||
|
||||
This produces:
|
||||
|
||||
.. code-block:: ansible-output
|
||||
|
||||
TASK [Compare Unicode representations] ********************************************************
|
||||
ok: [localhost] => {
|
||||
"msg": true
|
||||
}
|
||||
|
||||
The ``community.general.unicode_normalize`` filter accepts a keyword argument to select the ``Unicode`` form used to normalize the input string.
|
||||
|
||||
:form: One of ``'NFC'`` (default), ``'NFD'``, ``'NFKC'``, or ``'NFKD'``. See the `Unicode reference <https://unicode.org/reports/tr15/>`_ for more information.
|
||||
|
||||
.. versionadded:: 3.7.0
|
||||
|
|
40
plugins/filter/unicode_normalize.py
Normal file
40
plugins/filter/unicode_normalize.py
Normal file
|
@ -0,0 +1,40 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright: (c) 2021, Andrew Pantuso (@ajpantuso) <ajpantuso@gmail.com>
|
||||
# GNU General Public License v3.0+ (see COPYING or https://www.gnu.org/licenses/gpl-3.0.txt)
|
||||
|
||||
from __future__ import absolute_import, division, print_function
|
||||
__metaclass__ = type
|
||||
|
||||
from unicodedata import normalize
|
||||
|
||||
from ansible.errors import AnsibleFilterError, AnsibleFilterTypeError
|
||||
from ansible.module_utils.six import text_type
|
||||
|
||||
|
||||
def unicode_normalize(data, form='NFC'):
    """Applies normalization to 'unicode' strings.

    Args:
        data: A unicode string piped into the Jinja filter
        form: One of ('NFC', 'NFD', 'NFKC', 'NFKD'). Defaults to 'NFC'.
            See https://docs.python.org/3/library/unicodedata.html#unicodedata.normalize for more information.

    Returns:
        A normalized unicode string of the specified 'form'.

    Raises:
        AnsibleFilterTypeError: If 'data' is not a text (unicode) string.
        AnsibleFilterError: If 'form' is not one of the supported forms.
    """

    if not isinstance(data, text_type):
        raise AnsibleFilterTypeError("%s is not a valid input type" % type(data))

    if form not in ('NFC', 'NFD', 'NFKC', 'NFKD'):
        # Wrap 'form' in a 1-tuple: a tuple-valued 'form' would otherwise be
        # unpacked as multiple format arguments and raise TypeError instead
        # of the intended AnsibleFilterError.
        raise AnsibleFilterError("%s is not a valid form" % (form,))

    return normalize(form, data)
|
||||
|
||||
|
||||
class FilterModule(object):
    """Filter plugin class that exposes the unicode_normalize filter."""

    def filters(self):
        """Return a dict mapping filter names to the callables implementing them."""
        filter_map = dict(unicode_normalize=unicode_normalize)
        return filter_map
|
|
@ -0,0 +1,2 @@
|
|||
shippable/posix/group2
|
||||
skip/python2.6 # filters are controller only, and we no longer support Python 2.6 on the controller
|
|
@ -0,0 +1,39 @@
|
|||
####################################################################
|
||||
# WARNING: These are designed specifically for Ansible tests #
|
||||
# and should not be used as examples of how to write Ansible roles #
|
||||
####################################################################
|
||||
|
||||
- name: Test 'NFC' normalization
|
||||
assert:
|
||||
that:
|
||||
- u_umlaut != u_umlaut_combining
|
||||
- u_umlaut_combining != (u_umlaut_combining | community.general.unicode_normalize)
|
||||
- u_umlaut == (u_umlaut_combining | community.general.unicode_normalize)
|
||||
|
||||
- name: Test 'NFKC' normalization
|
||||
assert:
|
||||
that:
|
||||
- latin_capital_i != roman_numeral_one
|
||||
- latin_capital_i == (roman_numeral_one | community.general.unicode_normalize(form='NFKC'))
|
||||
|
||||
- name: Register invalid input type
|
||||
debug:
|
||||
msg: "{{ 1 | community.general.unicode_normalize }}"
|
||||
ignore_errors: true
|
||||
register: invalid_input_type
|
||||
|
||||
- name: Assert an invalid input type causes failure
|
||||
assert:
|
||||
that:
|
||||
- invalid_input_type is failed
|
||||
|
||||
- name: Register invalid form selection
|
||||
debug:
|
||||
msg: "{{ 'arbitrary text' | community.general.unicode_normalize(form='invalid') }}"
|
||||
ignore_errors: true
|
||||
register: invalid_form_selection
|
||||
|
||||
- name: Assert invalid form selection causes failure
|
||||
assert:
|
||||
that:
|
||||
- invalid_form_selection is failed
|
|
@ -0,0 +1,4 @@
|
|||
u_umlaut: "{{ '\u00fc' }}"
|
||||
u_umlaut_combining: "{{ 'u' + '\u0308' }}"
|
||||
roman_numeral_one: "{{ '\u2160' }}"
|
||||
latin_capital_i: "{{ '\u0049' }}"
|
Loading…
Reference in a new issue