Georgian Script Casing in Python 3.7 and Unicode 11
The problem affects every Python version starting with 3.7. Let's go back a little and follow what happened. Unicode 11 (released June 5, 2018) introduced major changes for the Georgian script. Until then, Georgian was considered a monocameral (non-casing) script, so Georgian letters had gc=Lo (Letter, Other); starting with Unicode 11.0, those Georgian letters are gc=Ll (Letter, Lowercase). The first release of Python 3.7 (June 27, 2018) implements Unicode 11, and casing operations on Georgian text (capitalize, titlecase, uppercase) now produce strange symbols in the output.
An example is shown below:
Python 3.7.5 (default, Oct 17 2019, 12:21:00)
[GCC 8.3.1 20190223 (Red Hat 8.3.1-2)] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> 'ა'
'ა'
>>> 'ა'.upper()
'Ა'
>>> print('ა'.upper())
Ა
>>> print('ლაშა'.upper())
ᲚᲐᲨᲐ
>>> print('ლაშა'.capitalize())
Ლაშა
Casing issue on the Python issue tracker
The bug is in the capfirst (link) and upper (link) methods.
These are Django built-in template filters: capfirst capitalizes the first letter of a string, and upper converts a string into all uppercase. When I used these filters in a template, they produced strange symbols in the output.
This bug also affects the Django admin, because this filter is used in model forms and in the templates of the Django admin.
Bug fix in Django
The existing implementation of the capfirst method in Django core:
@keep_lazy_text
def capfirst(x):
    """
    Capitalize the first letter of a string.

    Falsy input is returned as-is; otherwise ``x`` is converted with ``str()``
    and only its first character is passed through ``str.upper()``.
    """
    # ``x and ...`` short-circuits, so an empty value never reaches the [0] index.
    return x and str(x)[0].upper() + str(x)[1:]
It should be replaced with:
@keep_lazy_text
def capfirst(x):
    """
    Capitalize the first letter of a string, leaving Georgian text unchanged.

    Since Unicode 11.0 the Georgian Mkhedruli letters are lowercase (gc=Ll)
    and ``str.upper()`` maps them to Mtavruli capitals. The primary Georgian
    orthography does not use titlecasing, which is unique among bicameral
    systems in the Unicode Standard, so a string that starts with a Georgian
    letter is returned without any case change.

    See https://www.unicode.org/versions/Unicode11.0.0/#Migration for the
    Unicode 11.0 migration notes and https://bugs.python.org/issue37121 for
    the related Python casing issue.

    Falsy input is returned as-is. Any other input is converted with
    ``str()`` first, so both branches below return a ``str``.
    """
    if not x:
        return x
    text = str(x)
    # unicodedata.name() raises ValueError for code points without a name
    # (e.g. "\n"); the '' default keeps such input on the normal path.
    if unicodedata.name(text[0], '').startswith('GEORGIAN LETTER'):
        return text
    return text[0].upper() + text[1:]
Add a unit test for this method:
class TestUtilsText(SimpleTestCase):
    """Checks for ``text.capfirst``."""

    def test_capfirst(self):
        # (input, expected) pairs: Latin input gets its first letter
        # uppercased, Georgian and empty input are expected back unchanged.
        cases = (
            ('english', 'English'),
            ('სიახლეები', 'სიახლეები'),
            ('a', 'A'),
            ('ა', 'ა'),
            ('', ''),
            ('\xeb', '\xcb'),
        )
        for given, expected in cases:
            self.assertEqual(text.capfirst(given), expected)
The same logic is implemented in defaultfilters as a template filter, and it should be replaced too.
The existing implementation of the capfirst and upper filters in defaultfilters (Django core):
@register.filter(is_safe=True)
@stringfilter
def capfirst(value):
    """
    Capitalize the first character of the value.

    An empty value is returned unchanged; otherwise only ``value[0]`` is
    passed through ``str.upper()`` and the remainder is kept as-is.
    """
    # ``value and ...`` short-circuits so an empty string is never indexed.
    return value and value[0].upper() + value[1:]
@register.filter(is_safe=False)
@stringfilter
def upper(value):
    """
    Convert a string into all uppercase.

    Delegates directly to ``str.upper()`` on the whole value.
    """
    return value.upper()
They should be replaced with:
@register.filter(is_safe=True)
@stringfilter
def capfirst(value):
    """
    Capitalize the first character of the value, leaving Georgian text unchanged.

    Since Unicode 11.0 the Georgian Mkhedruli letters are lowercase (gc=Ll)
    and ``str.upper()`` maps them to Mtavruli capitals. The primary Georgian
    orthography does not use titlecasing, which is unique among bicameral
    systems in the Unicode Standard, so a value that starts with a Georgian
    letter is returned without any case change.

    See https://www.unicode.org/versions/Unicode11.0.0/#Migration for the
    Unicode 11.0 migration notes and https://bugs.python.org/issue37121 for
    the related Python casing issue.

    An empty value is returned unchanged.
    """
    if not value:
        return value
    # unicodedata.name() raises ValueError for code points without a name
    # (e.g. "\n"); the '' default keeps such input on the normal path.
    if unicodedata.name(value[0], '').startswith('GEORGIAN LETTER'):
        return value
    return value[0].upper() + value[1:]
@register.filter(is_safe=False)
@stringfilter
def upper(value):
    """
    Convert a string into all uppercase, leaving Georgian letters unchanged.

    Since Unicode 11.0 the Georgian Mkhedruli letters are lowercase (gc=Ll)
    and ``str.upper()`` maps them to Mtavruli capitals, which the primary
    Georgian orthography does not use. Each character is therefore uppercased
    on its own unless its Unicode name starts with ``GEORGIAN LETTER``, so
    mixed text such as Latin words next to Georgian words still has its
    non-Georgian part uppercased.

    See https://www.unicode.org/versions/Unicode11.0.0/#Migration for the
    Unicode 11.0 migration notes and https://bugs.python.org/issue37121 for
    the related Python casing issue.
    """
    # unicodedata.name() raises ValueError for code points without a name
    # (e.g. "\n" or "\t"); the '' default lets those fall through to upper().
    return ''.join(
        char if unicodedata.name(char, '').startswith('GEORGIAN LETTER') else char.upper()
        for char in value
    )
Add unit tests for these changes:
class CapfirstTests(SimpleTestCase):
    """Template-level tests for the ``capfirst`` filter, with and without autoescaping."""

    @setup({'capfirst01': '{% autoescape off %}{{ a|capfirst }} {{ b|capfirst }}{% endautoescape %}'})
    def test_capfirst01(self):
        # Autoescape is off, so ``a`` is emitted raw and the entity already
        # present in the safe string ``b`` passes through untouched.
        output = self.engine.render_to_string('capfirst01', {'a': 'fred>', 'b': mark_safe('fred&gt;')})
        self.assertEqual(output, 'Fred> Fred&gt;')

    @setup({'capfirst02': '{{ a|capfirst }} {{ b|capfirst }}'})
    def test_capfirst02(self):
        # Autoescape is on: the unsafe ``a`` has ">" escaped to "&gt;", while
        # the safe string ``b`` is not escaped a second time.
        output = self.engine.render_to_string('capfirst02', {'a': 'fred>', 'b': mark_safe('fred&gt;')})
        self.assertEqual(output, 'Fred&gt; Fred&gt;')

    @setup({'capfirst03': '{{ a|capfirst }} {{ b|capfirst }}'})
    def test_capfirst03(self):
        # Georgian input is expected to come back without any case change.
        output = self.engine.render_to_string('capfirst03', {'a': 'სიახლე', 'b': mark_safe('სიახლეები;')})
        self.assertEqual(output, 'სიახლე სიახლეები;')
class FunctionTests(SimpleTestCase):
    """Direct calls to the ``capfirst`` filter function."""

    def test_capfirst(self):
        # (input, expected) pairs; the Georgian word is expected back unchanged.
        cases = (
            ('hello world', 'Hello world'),
            ('საქართველო', 'საქართველო'),
        )
        for given, expected in cases:
            self.assertEqual(capfirst(given), expected)
class UpperTests(SimpleTestCase):
    """
    The "upper" filter messes up entities (which are case-sensitive),
    so it's not safe for non-escaping purposes.
    """

    @setup({'upper01': '{% autoescape off %}{{ a|upper }} {{ b|upper }}{% endautoescape %}'})
    def test_upper01(self):
        # Autoescape is off: nothing is escaped, and the entity inside the
        # safe string ``b`` is uppercased to "&AMP;" along with the rest.
        output = self.engine.render_to_string('upper01', {'a': 'a & b', 'b': mark_safe('a &amp; b')})
        self.assertEqual(output, 'A & B A &AMP; B')

    @setup({'upper02': '{{ a|upper }} {{ b|upper }}'})
    def test_upper02(self):
        # Autoescape is on and the filter is registered with is_safe=False,
        # so both results are escaped; the mangled "&AMP;" becomes "&amp;AMP;".
        output = self.engine.render_to_string('upper02', {'a': 'a & b', 'b': mark_safe('a &amp; b')})
        self.assertEqual(output, 'A &amp; B A &amp;AMP; B')

    @setup({'upper03': '{{ a|upper }} {{ b|upper }}'})
    def test_upper03(self):
        # Georgian input is expected to come back without any case change.
        output = self.engine.render_to_string('upper03', {'a': 'სიახლე', 'b': mark_safe('სიახლეები;')})
        self.assertEqual(output, 'სიახლე სიახლეები;')
class FunctionTests(SimpleTestCase):
    """Direct calls to the ``upper`` filter function."""

    def test_upper(self):
        # (input, expected) pairs; the Georgian word is expected back unchanged.
        cases = (
            ('Mixed case input', 'MIXED CASE INPUT'),
            ('საქართველო', 'საქართველო'),
        )
        for given, expected in cases:
            self.assertEqual(upper(given), expected)