# (c) 2012-2014, Toshio Kuratomi
#
# This file is part of Ansible
#
# Ansible is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Ansible is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Ansible.  If not, see <http://www.gnu.org/licenses/>.

# Make coding more python3-ish
from __future__ import (absolute_import, division, print_function)
__metaclass__ = type

# to_bytes and to_unicode were written by Toshio Kuratomi for the
# python-kitchen library https://pypi.python.org/pypi/kitchen
# They are licensed in kitchen under the terms of the GPLv2+
# They were copied and modified for use in ansible by Toshio in Jan 2015
# (simply removing the deprecated features)

#: Aliases for the utf-8 codec
_UTF8_ALIASES = frozenset(('utf-8', 'UTF-8', 'utf8', 'UTF8', 'utf_8', 'UTF_8',
    'utf', 'UTF', 'u8', 'U8'))
#: Aliases for the latin-1 codec
_LATIN1_ALIASES = frozenset(('latin-1', 'LATIN-1', 'latin1', 'LATIN1',
    'latin', 'LATIN', 'l1', 'L1', 'cp819', 'CP819', '8859', 'iso8859-1',
    'ISO8859-1', 'iso-8859-1', 'ISO-8859-1'))

# Text/bytes type shim.  Python 2 spells the text type ``unicode`` and has
# a common ``basestring`` base class; Python 3 has neither name.
try:
    _TEXT_TYPE = unicode
    _STRING_TYPES = basestring
except NameError:
    _TEXT_TYPE = str
    _STRING_TYPES = (str, bytes)
#: Byte string type (``bytes`` is an alias of ``str`` on Python 2.6+)
_BYTES_TYPE = bytes


def _native_str(value, encoding):
    '''Return a text or byte string as the interpreter's native ``str``

    Used only to build exception messages so that they never embed
    a ``b'...'`` repr on Python 3 or mix string types on Python 2.

    :arg value: text or byte string to convert
    :arg encoding: encoding used when a conversion is needed; characters or
        bytes that cannot be converted are replaced
    :returns: :class:`str` of the running interpreter
    '''
    if isinstance(value, str):
        return value
    if str is bytes:
        # Python 2: native str is bytes, so value must be unicode text
        return value.encode(encoding, 'replace')
    # Python 3: native str is text, so value must be bytes
    return value.decode(encoding, 'replace')


def to_unicode(obj, encoding='utf-8', errors='replace', nonstring=None):
    '''Convert an object into a :class:`unicode` string

    :arg obj: Object to convert to a :class:`unicode` string.  This should
        normally be a byte :class:`str`
    :kwarg encoding: What encoding to try converting the byte :class:`str` as.
        Defaults to :term:`utf-8`
    :kwarg errors: If errors are found while decoding, perform this action.
        Defaults to ``replace`` which replaces the invalid bytes with
        a character that means the bytes were unable to be decoded.  Other
        values are the same as the error handling schemes in the `codec base
        classes
        <https://docs.python.org/library/codecs.html#codec-base-classes>`_.
        For instance ``strict`` which raises an exception and ``ignore`` which
        simply omits the non-decodable characters.
    :kwarg nonstring: How to treat nonstring values.  Possible values are:

        :simplerepr: Attempt to call the object's "simple representation"
            method and return that value.  Python-2.3+ has two methods that
            try to return a simple representation: :meth:`object.__unicode__`
            and :meth:`object.__str__`.  We first try to get a usable value
            from :meth:`object.__unicode__`.  If that fails we try the same
            with :meth:`object.__str__`.
        :empty: Return an empty :class:`unicode` string
        :strict: Raise a :exc:`TypeError`
        :passthru: Return the object unchanged
        :repr: Attempt to return a :class:`unicode` string of the repr of the
            object

        Default is ``simplerepr``

    :raises TypeError: if :attr:`nonstring` is ``strict`` and
        a non-:class:`basestring` object is passed in or if :attr:`nonstring`
        is set to an unknown value
    :raises UnicodeDecodeError: if :attr:`errors` is ``strict`` and
        :attr:`obj` is not decodable using the given encoding
    :returns: :class:`unicode` string or the original object depending on the
        value of :attr:`nonstring`.

    Usually this should be used on a byte :class:`str` but it can take both
    byte :class:`str` and :class:`unicode` strings intelligently.  Nonstring
    objects are handled in different ways depending on the setting of the
    :attr:`nonstring` parameter.

    The default values of this function are set so as to always return
    a :class:`unicode` string and never raise an error when converting from
    a byte :class:`str` to a :class:`unicode` string.  However, when you do
    not pass validly encoded text (or a nonstring object), you may end up with
    output that you don't expect.  Be sure you understand the requirements of
    your data, not just ignore errors by passing it through this function.

    On Python 3 the text type is :class:`str` and the byte type is
    :class:`bytes`; the same rules apply with those names.
    '''
    # Plain isinstance checks are used (rather than helper functions) because
    # we want this code to be as fast as possible
    if isinstance(obj, _STRING_TYPES):
        if isinstance(obj, _TEXT_TYPE):
            return obj
        # Fast paths: skip the codec alias lookup for the two common codecs
        if encoding in _UTF8_ALIASES:
            return _TEXT_TYPE(obj, 'utf-8', errors)
        if encoding in _LATIN1_ALIASES:
            return _TEXT_TYPE(obj, 'latin-1', errors)
        return obj.decode(encoding, errors)

    if not nonstring:
        nonstring = 'simplerepr'

    if nonstring == 'empty':
        return u''

    if nonstring == 'passthru':
        return obj

    if nonstring == 'simplerepr':
        try:
            simple = obj.__unicode__()
        except (AttributeError, UnicodeError):
            simple = None
        if not simple:
            try:
                simple = str(obj)
            except UnicodeError:
                try:
                    simple = obj.__str__()
                except (UnicodeError, AttributeError):
                    simple = u''
        # str() yields bytes on Python 2; decode so callers always get text
        if isinstance(simple, _BYTES_TYPE):
            return _TEXT_TYPE(simple, encoding, errors)
        return simple

    if nonstring in ('repr', 'strict'):
        obj_repr = repr(obj)
        if isinstance(obj_repr, _BYTES_TYPE):
            obj_repr = _TEXT_TYPE(obj_repr, encoding, errors)
        if nonstring == 'repr':
            return obj_repr
        raise TypeError('to_unicode was given "%(obj)s" which is neither'
                ' a byte string (str) or a unicode string' %
                {'obj': _native_str(obj_repr, encoding)})

    raise TypeError('nonstring value, %(param)s, is not set to a valid'
            ' action' % {'param': nonstring})


def to_bytes(obj, encoding='utf-8', errors='replace', nonstring=None):
    '''Convert an object into a byte :class:`str`

    :arg obj: Object to convert to a byte :class:`str`.  This should normally
        be a :class:`unicode` string.
    :kwarg encoding: Encoding to use to convert the :class:`unicode` string
        into a byte :class:`str`.  Defaults to :term:`utf-8`.
    :kwarg errors: If errors are found while encoding, perform this action.
        Defaults to ``replace`` which replaces the invalid bytes with
        a character that means the bytes were unable to be encoded.  Other
        values are the same as the error handling schemes in the `codec base
        classes
        <https://docs.python.org/library/codecs.html#codec-base-classes>`_.
        For instance ``strict`` which raises an exception and ``ignore`` which
        simply omits the non-encodable characters.
    :kwarg nonstring: How to treat nonstring values.  Possible values are:

        :simplerepr: Attempt to call the object's "simple representation"
            method and return that value.  Python-2.3+ has two methods that
            try to return a simple representation: :meth:`object.__unicode__`
            and :meth:`object.__str__`.  We first try to get a usable value
            from :meth:`object.__str__`.  If that fails we try the same
            with :meth:`object.__unicode__`.  A text result on this path is
            always encoded with the ``replace`` error handler, regardless of
            :attr:`errors`.
        :empty: Return an empty byte :class:`str`
        :strict: Raise a :exc:`TypeError`
        :passthru: Return the object unchanged
        :repr: Attempt to return a byte :class:`str` of the :func:`repr` of the
            object

        Default is ``simplerepr``.

    :raises TypeError: if :attr:`nonstring` is ``strict`` and
        a non-:class:`basestring` object is passed in or if :attr:`nonstring`
        is set to an unknown value.
    :raises UnicodeEncodeError: if :attr:`errors` is ``strict`` and
        :attr:`obj` contains characters that are unable to be encoded using
        :attr:`encoding`.
    :returns: byte :class:`str` or the original object depending on the value
        of :attr:`nonstring`.

    .. warning::

        If you pass a byte :class:`str` into this function the byte
        :class:`str` is returned unmodified.  It is **not** re-encoded with
        the specified :attr:`encoding`.  The easiest way to achieve that is::

            to_bytes(to_unicode(text), encoding='utf-8')

        The initial :func:`to_unicode` call will ensure text is
        a :class:`unicode` string.  Then, :func:`to_bytes` will turn that into
        a byte :class:`str` with the specified encoding.

    Usually, this should be used on a :class:`unicode` string but it can take
    either a byte :class:`str` or a :class:`unicode` string intelligently.
    Nonstring objects are handled in different ways depending on the setting
    of the :attr:`nonstring` parameter.

    The default values of this function are set so as to always return a byte
    :class:`str` and never raise an error when converting from unicode to
    bytes.  However, when you do not pass an encoding that can validly encode
    the object (or a non-string object), you may end up with output that you
    don't expect.  Be sure you understand the requirements of your data, not
    just ignore errors by passing it through this function.

    On Python 3 the text type is :class:`str` and the byte type is
    :class:`bytes`; the same rules apply with those names.
    '''
    # Plain isinstance checks are used (rather than helper functions) because
    # we want this to be as fast as possible
    if isinstance(obj, _STRING_TYPES):
        if isinstance(obj, _BYTES_TYPE):
            return obj
        return obj.encode(encoding, errors)

    if not nonstring:
        nonstring = 'simplerepr'

    if nonstring == 'empty':
        return b''

    if nonstring == 'passthru':
        return obj

    if nonstring == 'simplerepr':
        try:
            simple = str(obj)
        except UnicodeError:
            try:
                simple = obj.__str__()
            except (AttributeError, UnicodeError):
                simple = None
        if not simple:
            try:
                simple = obj.__unicode__()
            except (AttributeError, UnicodeError):
                simple = b''
        if isinstance(simple, _TEXT_TYPE):
            # NOTE: 'replace' is used here unconditionally (not ``errors``)
            simple = simple.encode(encoding, 'replace')
        return simple

    if nonstring in ('repr', 'strict'):
        try:
            obj_repr = obj.__repr__()
        except (AttributeError, UnicodeError):
            obj_repr = ''
        if not isinstance(obj_repr, _TEXT_TYPE):
            # __repr__ called directly may hand back a non-string
            obj_repr = str(obj_repr)
        if isinstance(obj_repr, _TEXT_TYPE):
            obj_repr = obj_repr.encode(encoding, errors)
        if nonstring == 'repr':
            return obj_repr
        raise TypeError('to_bytes was given "%(obj)s" which is neither'
                ' a unicode string or a byte string (str)' %
                {'obj': _native_str(obj_repr, encoding)})

    raise TypeError('nonstring value, %(param)s, is not set to a valid'
            ' action' % {'param': nonstring})


# force the return value of a function to be unicode.  Use with partial to
# ensure that a filter will return unicode values.
def unicode_wrap(func, *args, **kwargs):
    '''Call ``func(*args, **kwargs)`` and convert a string result to unicode

    :arg func: callable to invoke
    :returns: the result of :func:`to_unicode` on the return value; because
        ``nonstring='passthru'`` is used, non-string results are returned
        unchanged
    '''
    return to_unicode(func(*args, **kwargs), nonstring='passthru')