Last active
August 29, 2015 13:58
-
-
Save LeslieK/10013426 to your computer and use it in GitHub Desktop.
Python 3.3: Windows Console Behavior with Unicode
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "Python3.3WindowsConsoleUnicode" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "heading", | |
"level": 1, | |
"metadata": {}, | |
"source": "Python 3.3 Windows console behavior: Issue Summary" | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "import sys\nsys.platform", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 9, | |
"text": "'win32'" | |
} | |
], | |
"prompt_number": 9 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "sys.version", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 10, | |
"text": "'3.3.3 |Anaconda 1.8.0 (64-bit)| (default, Dec 13 2013, 10:31:18) [MSC v.1600 64 bit (AMD64)]'" | |
} | |
], | |
"prompt_number": 10 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "Python 3.3 Windows console behavior explained: should be included in python documentation\n\nWindows console uses 'cp437' to decode bytes -- This cannot be overridden. PYTHONIOENCODING does not override this.\nPYTHONIOENCODING is set to UTF-8\nIn [86]: sys.getdefaultencoding(), locale.getpreferredencoding(), sys.stdout.encoding, os.device_encoding(1)\nOut[86]: ('utf-8', 'cp1252', 'UTF-8', 'cp437')\n\nos.device_encoding(0), os.device_encoding(1), os.device_encoding(2) => None, 'cp437', 'cp437' # 0: stdin 1: stdout 2: stderr\n\nBEHAVIOR\nThe print encodes to bytes using 'utf-8' correctly\nconsole decodes bytes to glyph using 'cp437' => wrong glyph\n\nEXAMPLE\nIn [75]: shalom = '\\u05dd\\u05dc\\u05e9'\n\nIn [76]: shalom.encode()\nOut[76]: b'\\xd7\\x9d\\xd7\\x9c\\xd7\\xa9'\n\nIn [77]: shalom.encode().decode()\nOut[77]: '???'\n\nIn [78]: shalom.encode().decode('cp437')\nOut[78]: '\u256b\u00a5\u256b\u00a3\u256b\u2310'\n\nIn [79]: print(shalom)\n\u256b\u00a5\u256b\u00a3\u256b\u2310\n\nIn [80]: shalom\nOut[80]: '???'", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "print('\\u05dd\\u05dc\\u05e9') # works in ipython notebook", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "\u05dd\u05dc\u05e9\n" | |
} | |
], | |
"prompt_number": 32 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 1, | |
"metadata": {}, | |
"source": "Another Example with More Details" | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "import locale\nlocale.getpreferredencoding() # value returned is platform dependent", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 13, | |
"text": "'cp1252'" | |
} | |
], | |
"prompt_number": 13 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "# windows uses **console** encodings instead of locale encoding for the console\n# windows default console encodings\nsys.stdout.encoding\nsys.stdin.encoding\nsys.stderr.encoding\n'cp437'", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "In [1]: print('\\u00A9')\n---------------------------------------------------------------------------\nUnicodeEncodeError Traceback (most recent call last)\n<ipython-input-1-427f3468b115> in <module>()\n----> 1 print('\\u00A9')\n\nC:\\Anaconda\\envs\\py33\\lib\\encodings\\cp437.py in encode(self, input, final)\n 17 class IncrementalEncoder(codecs.IncrementalEncoder):\n 18 def encode(self, input, final=False):\n---> 19 return codecs.charmap_encode(input,self.errors,encoding_map)[0]\n 20\n 21 class IncrementalDecoder(codecs.IncrementalDecoder):\n\nUnicodeEncodeError: 'charmap' codec can't encode character '\\xa9' in position 0: character maps to <undefined>\n\nIn [2]:", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "# to override IO encodings (on all platforms) set ENVIRONMENT variable PYTHONIOENCODING\n# after setting system ENVIRONMENT variable: PYTHONIOENCODING UTF-8\n\nprint(\"sys.stdout.encoding\", sys.stdout.encoding) \nprint(\"sys.stdin.encoding \", sys.stdin.encoding)\nprint(\"sys.stderr.encoding\", sys.stdout.encoding)", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "sys.stdout.encoding UTF-8\nsys.stdin.encoding " | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": " UTF-8\nsys.stderr.encoding" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": " UTF-8\n" | |
} | |
], | |
"prompt_number": 7 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "# But I think python uses these device_encoding settings, not the ones that were overridden\n\nos.device_encoding(0), os.device_encoding(1), os.device_encoding(2) # 0: stdin, 1: stdout, 2: stderr", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 18, | |
"text": "(None, 'cp437', 'cp437')" | |
} | |
], | |
"prompt_number": 18 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "# This time ipython console doesn't crash but it prints out gibberish\n\nIn [11]: print('\\u00A9') \n\u252c\u2310\n\nIn [12]:\n\n# print encodes str to bytes using sys default ('utf-8') => b'\\xc2\\xa9' \n# then it decodes bytes with 'cp437' to glyph\n# calling 'cp437' is a bug", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "print('\\u00A9') # works correctly in ipython notebook", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "\u00a9\n" | |
} | |
], | |
"prompt_number": 20 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "'\\u00A9' # repr works", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 33, | |
"text": "'\u00a9'" | |
} | |
], | |
"prompt_number": 33 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "copyutf8.decode? # this shows that encoding='utf-8' is the default encoding", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 29 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "copyutf8.decode() # again, this displays correctly", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 31, | |
"text": "'\u00a9'" | |
} | |
], | |
"prompt_number": 31 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "copyutf8 = '\\u00a9'.encode('utf-8')\ncopyutf8 # displays correctly", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 26, | |
"text": "b'\\xc2\\xa9'" | |
} | |
], | |
"prompt_number": 26 | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment