diff --git a/PRESUBMIT.py b/PRESUBMIT.py index 50aa3850c..6232c191b 100644 --- a/PRESUBMIT.py +++ b/PRESUBMIT.py @@ -9,6 +9,7 @@ details on the presubmit API built into gcl. """ UNIT_TESTS = [ + 'tests.fix_encoding_test', 'tests.gcl_unittest', 'tests.gclient_scm_test', 'tests.gclient_smoketest', diff --git a/fix_encoding.py b/fix_encoding.py new file mode 100644 index 000000000..11dfd6c3d --- /dev/null +++ b/fix_encoding.py @@ -0,0 +1,356 @@ +# Copyright (c) 2011 The Chromium Authors. All rights reserved. +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. + +"""Collection of functions and classes to fix various encoding problems on +multiple platforms with python. +""" + +import codecs +import locale +import os +import sys + + +# Prevents initializing multiple times. +_SYS_ARGV_PROCESSED = False + + +def complain(message): + """If any exception occurs in this file, we'll probably try to print it + on stderr, which makes for frustrating debugging if stderr is directed + to our wrapper. So be paranoid about catching errors and reporting them + to sys.__stderr__, so that the user has a higher chance to see them. + """ + print >> sys.__stderr__, ( + isinstance(message, str) and message or repr(message)) + + +def fix_default_encoding(): + """Forces utf8 solidly on all platforms. + + By default python execution environment is lazy and defaults to ascii + encoding. + + http://uucode.com/blog/2007/03/23/shut-up-you-dummy-7-bit-python/ + """ + if sys.getdefaultencoding() == 'utf-8': + return False + + # Regenerate setdefaultencoding. 
+ reload(sys) + # Module 'sys' has no 'setdefaultencoding' member + # pylint: disable=E1101 + sys.setdefaultencoding('utf-8') + for attr in dir(locale): + if attr[0:3] != 'LC_': + continue + aref = getattr(locale, attr) + locale.setlocale(aref, '') + try: + lang = locale.getlocale(aref)[0] + except TypeError: + lang = None + if lang: + try: + locale.setlocale(aref, (lang, 'UTF-8')) + except locale.Error: + os.environ[attr] = lang + '.UTF-8' + locale.setlocale(locale.LC_ALL, '') + return True + + +############################### +# Windows specific + + +def fix_win_sys_argv(encoding): + """Converts sys.argv to 'encoding' encoded string. + + utf-8 is recommended. + + Works around . + """ + global _SYS_ARGV_PROCESSED + if _SYS_ARGV_PROCESSED: + return False + + from ctypes import byref, c_int, POINTER, windll, WINFUNCTYPE + from ctypes.wintypes import LPCWSTR, LPWSTR + + # + GetCommandLineW = WINFUNCTYPE(LPWSTR)(('GetCommandLineW', windll.kernel32)) + # + CommandLineToArgvW = WINFUNCTYPE(POINTER(LPWSTR), LPCWSTR, POINTER(c_int))( + ('CommandLineToArgvW', windll.shell32)) + + argc = c_int(0) + argv_unicode = CommandLineToArgvW(GetCommandLineW(), byref(argc)) + argv = [ + argv_unicode[i].encode(encoding, 'replace') + for i in xrange(0, argc.value)] + + if not hasattr(sys, 'frozen'): + # If this is an executable produced by py2exe or bbfreeze, then it + # will have been invoked directly. Otherwise, unicode_argv[0] is the + # Python interpreter, so skip that. + argv = argv[1:] + + # Also skip option arguments to the Python interpreter. + while len(argv) > 0: + arg = argv[0] + if not arg.startswith(u'-') or arg == u'-': + break + argv = argv[1:] + if arg == u'-m': + # sys.argv[0] should really be the absolute path of the + # module source, but never mind. 
+ break + if arg == u'-c': + argv[0] = u'-c' + break + sys.argv = argv + _SYS_ARGV_PROCESSED = True + return True + + +def fix_win_codec(): + """Works around .""" + # + try: + codecs.lookup('cp65001') + return False + except LookupError: + codecs.register( + lambda name: name == 'cp65001' and codecs.lookup('utf-8') or None) + return True + + +class WinUnicodeOutputBase(object): + """Base class to adapt sys.stdout or sys.stderr to behave correctly on + Windows. + + Setting encoding to utf-8 is recommended. + """ + def __init__(self, fileno, name, encoding): + # Corresponding file handle. + self._fileno = fileno + self.encoding = encoding + self.name = name + + self.closed = False + self.softspace = False + self.mode = 'w' + + @staticmethod + def isatty(): + return False + + def close(self): + # Don't really close the handle, that would only cause problems. + self.closed = True + + def fileno(self): + return self._fileno + + def flush(self): + raise NotImplementedError() + + def write(self, text): + raise NotImplementedError() + + def writelines(self, lines): + try: + for line in lines: + self.write(line) + except Exception, e: + complain('%s.writelines: %r' % (self.name, e)) + raise + + +class WinUnicodeConsoleOutput(WinUnicodeOutputBase): + """Output adapter to a Windows Console. + + Understands how to use the win32 console API. + """ + def __init__(self, console_handle, fileno, stream_name, encoding): + super(WinUnicodeConsoleOutput, self).__init__( + fileno, '' % stream_name, encoding) + # Handle to use for WriteConsoleW + self._console_handle = console_handle + + # Loads the necessary function. 
+ from ctypes import byref, GetLastError, POINTER, windll, WINFUNCTYPE + from ctypes.wintypes import BOOL, DWORD, HANDLE, LPVOID, LPWSTR + + self._DWORD = DWORD + self._byref = byref + + # + self._WriteConsoleW = WINFUNCTYPE( + BOOL, HANDLE, LPWSTR, DWORD, POINTER(DWORD), LPVOID)( + ('WriteConsoleW', windll.kernel32)) + self._GetLastError = GetLastError + + def flush(self): + # No need to flush the console since it's immediate. + pass + + def write(self, text): + try: + if not isinstance(text, unicode): + # Convert to unicode. + text = str(text).decode(self.encoding, 'replace') + remaining = len(text) + while remaining > 0: + n = self._DWORD(0) + # There is a shorter-than-documented limitation on the length of the + # string passed to WriteConsoleW. See + # . + retval = self._WriteConsoleW( + self._console_handle, text, + min(remaining, 10000), + self._byref(n), None) + if retval == 0 or n.value == 0: + raise IOError( + 'WriteConsoleW returned %r, n.value = %r, last error = %r' % ( + retval, n.value, self._GetLastError())) + remaining -= n.value + if not remaining: + break + text = text[n.value:] + except Exception, e: + complain('%s.write: %r' % (self.name, e)) + raise + + +class WinUnicodeOutput(WinUnicodeOutputBase): + """Output adapter to a file output on Windows. + + If the standard FileWrite function is used, it will be encoded in the current + code page. WriteConsoleW() permits writing any character. + """ + def __init__(self, stream, fileno, encoding): + super(WinUnicodeOutput, self).__init__( + fileno, '' % stream.name, encoding) + # Output stream + self._stream = stream + + # Flush right now. + self.flush() + + def flush(self): + try: + self._stream.flush() + except Exception, e: + complain('%s.flush: %r from %r' % (self.name, e, self._stream)) + raise + + def write(self, text): + try: + if isinstance(text, unicode): + # Replace characters that cannot be printed instead of failing. 
+ text = text.encode(self.encoding, 'replace') + self._stream.write(text) + except Exception, e: + complain('%s.write: %r' % (self.name, e)) + raise + + +def win_handle_is_a_console(handle): + """Returns True if a Windows file handle is a handle to a console.""" + from ctypes import byref, POINTER, windll, WINFUNCTYPE + from ctypes.wintypes import BOOL, DWORD, HANDLE + + FILE_TYPE_CHAR = 0x0002 + FILE_TYPE_REMOTE = 0x8000 + INVALID_HANDLE_VALUE = DWORD(-1).value + + # + GetConsoleMode = WINFUNCTYPE(BOOL, HANDLE, POINTER(DWORD))( + ('GetConsoleMode', windll.kernel32)) + # + GetFileType = WINFUNCTYPE(DWORD, DWORD)(('GetFileType', windll.kernel32)) + + # GetStdHandle returns INVALID_HANDLE_VALUE, NULL, or a valid handle. + if handle == INVALID_HANDLE_VALUE or handle is None: + return False + return ( + (GetFileType(handle) & ~FILE_TYPE_REMOTE) == FILE_TYPE_CHAR and + GetConsoleMode(handle, byref(DWORD()))) + + +def win_get_unicode_stream(stream, excepted_fileno, output_handle, encoding): + """Returns a unicode-compatible stream. + + This function will return a direct-Console writing object only if: + - the file number is the expected console file number + - the handle is the expected file handle + - the 'real' handle is in fact a handle to a console. + """ + old_fileno = getattr(stream, 'fileno', lambda: None)() + if old_fileno == excepted_fileno: + from ctypes import windll, WINFUNCTYPE + from ctypes.wintypes import DWORD, HANDLE + + # + GetStdHandle = WINFUNCTYPE(HANDLE, DWORD)(('GetStdHandle', windll.kernel32)) + + real_output_handle = GetStdHandle(DWORD(output_handle)) + if win_handle_is_a_console(real_output_handle): + # It's a console. + return WinUnicodeConsoleOutput( + real_output_handle, old_fileno, stream.name, encoding) + + # It's something else. Create an auto-encoding stream. + return WinUnicodeOutput(stream, old_fileno, encoding) + + +def fix_win_console(encoding): + """Makes Unicode console output work independently of the current code page. 
+ + This also fixes . + Credit to Michael Kaplan + and + TZOmegaTZIOY + . + """ + if (isinstance(sys.stdout, WinUnicodeOutputBase) or + isinstance(sys.stderr, WinUnicodeOutputBase)): + return False + + try: + # SetConsoleCP and SetConsoleOutputCP could be used to change the code page + # but it's not really useful since the code here is using WriteConsoleW(). + # Also, changing the code page is 'permanent' to the console and needs to be + # reverted manually. + # In practice one needs to set the console font to a TTF font to be able to + # see all the characters but it failed for me in practice. In any case, it + # won't throw any exception when printing, which is the important part. + # -11 and -12 are defined in stdio.h + sys.stdout = win_get_unicode_stream(sys.stdout, 1, -11, encoding) + sys.stderr = win_get_unicode_stream(sys.stderr, 2, -12, encoding) + # TODO(maruel): Do sys.stdin with ReadConsoleW(). Albeit the limitation is + # "It doesn't appear to be possible to read Unicode characters in UTF-8 + # mode" and this appears to be a limitation of cmd.exe. + except Exception, e: + complain('exception %r while fixing up sys.stdout and sys.stderr' % e) + return True + + +def fix_encoding(): + """Fixes various encoding problems on all platforms. + + Should be called at the very beginning of the process. + """ + ret = True + if sys.platform == 'win32': + ret &= fix_win_codec() + + ret &= fix_default_encoding() + + if sys.platform == 'win32': + encoding = sys.getdefaultencoding() + ret &= fix_win_sys_argv(encoding) + ret &= fix_win_console(encoding) + return ret diff --git a/gcl.py b/gcl.py index 20762d1be..ace9f6f3c 100755 --- a/gcl.py +++ b/gcl.py @@ -40,6 +40,7 @@ import breakpad # pylint: disable=W0611 # gcl now depends on gclient. 
from scm import SVN +import fix_encoding import gclient_utils import owners import presubmit_support @@ -1473,4 +1474,5 @@ def main(argv): if __name__ == "__main__": + fix_encoding.fix_encoding() sys.exit(main(sys.argv[1:])) diff --git a/gclient.py b/gclient.py index 4b9d94cac..c33973dfd 100644 --- a/gclient.py +++ b/gclient.py @@ -64,6 +64,7 @@ import urllib import breakpad # pylint: disable=W0611 +import fix_encoding import gclient_scm import gclient_utils from third_party.repo.progress import Progress @@ -1266,6 +1267,7 @@ def Main(argv): if '__main__' == __name__: + fix_encoding.fix_encoding() sys.exit(Main(sys.argv[1:])) # vim: ts=2:sw=2:tw=80:et: diff --git a/presubmit_support.py b/presubmit_support.py index f1f70d9c4..e9a2dbd49 100755 --- a/presubmit_support.py +++ b/presubmit_support.py @@ -50,6 +50,7 @@ except ImportError: import simplejson as json # pylint: disable=F0401 # Local imports. +import fix_encoding import gclient_utils import owners import presubmit_canned_checks @@ -142,18 +143,18 @@ class OutputApi(object): def handle(self, output): output.write(self._message) output.write('\n') - if len(self._items) > 0: - output.write(' ' + ' \\\n '.join(map(str, self._items)) + '\n') + for index, item in enumerate(self._items): + output.write(' ') + # Write separately in case it's unicode. + output.write(item) + if index < len(self._items) - 1: + output.write(' \\') + output.write('\n') if self._long_text: - # Sometimes self._long_text is a ascii string, a codepage string - # (on windows), or a unicode object. - try: - long_text = self._long_text.decode() - except UnicodeDecodeError: - long_text = self._long_text.decode('ascii', 'replace') - - output.write('\n***************\n%s\n***************\n' % - long_text) + output.write('\n***************\n') + # Write separately in case it's unicode. 
+ output.write(self._long_text) + output.write('\n***************\n') if self.fatal: output.fail() @@ -1192,4 +1193,5 @@ def Main(argv): if __name__ == '__main__': + fix_encoding.fix_encoding() sys.exit(Main(None)) diff --git a/tests/fix_encoding_test.py b/tests/fix_encoding_test.py new file mode 100755 index 000000000..a6ee18627 --- /dev/null +++ b/tests/fix_encoding_test.py @@ -0,0 +1,60 @@ +#!/usr/bin/python +# coding=utf8 +# Copyright (c) 2011 The Chromium Authors. All rights reserved. +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. + +"""Unit tests for fix_encoding.py.""" + +import os +import sys +import unittest + +ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +sys.path.insert(0, ROOT_DIR) + +import fix_encoding + + +class FixEncodingTest(unittest.TestCase): + # Nice mix of latin, hebrew, arabic and chinese. Doesn't mean anything. + text = u'Héllô 偉大 سيد' + + def test_code_page(self): + # Make sure printing garbage won't throw. + print self.text.encode() + '\xff' + print >> sys.stderr, self.text.encode() + '\xff' + + def test_utf8(self): + # Make sure printing utf-8 works. + print self.text.encode('utf-8') + print >> sys.stderr, self.text.encode('utf-8') + + def test_unicode(self): + # Make sure printing unicode works. + print self.text + print >> sys.stderr, self.text + + def test_default_encoding(self): + self.assertEquals('utf-8', sys.getdefaultencoding()) + + def test_win_console(self): + if sys.platform != 'win32': + return + # This should fail if redirected. Can be checked with: + # python fix_encoding_test.py > a + self.assertEquals( + sys.stdout.__class__, fix_encoding.WinUnicodeConsoleOutput) + self.assertEquals( + sys.stderr.__class__, fix_encoding.WinUnicodeConsoleOutput) + self.assertEquals(sys.stdout.encoding, sys.getdefaultencoding()) + self.assertEquals(sys.stderr.encoding, sys.getdefaultencoding()) + + def test_multiple_calls(self): + # Shouldn't do anything. 
+ self.assertEquals(False, fix_encoding.fix_encoding()) + + +if __name__ == '__main__': + assert fix_encoding.fix_encoding() + unittest.main() diff --git a/tests/gcl_unittest.py b/tests/gcl_unittest.py index 4ab2b6fc1..322586987 100755 --- a/tests/gcl_unittest.py +++ b/tests/gcl_unittest.py @@ -90,7 +90,8 @@ class GclUnittest(GclTestsBase): 'OptionallyDoPresubmitChecks', 'REPOSITORY_ROOT', 'REVIEWERS_REGEX', 'RunShell', 'RunShellWithReturnCode', 'SVN', 'TryChange', 'UnknownFiles', 'Warn', - 'attrs', 'breakpad', 'defer_attributes', 'gclient_utils', 'getpass', + 'attrs', 'breakpad', 'defer_attributes', 'fix_encoding', + 'gclient_utils', 'getpass', 'json', 'main', 'need_change', 'need_change_and_args', 'no_args', 'optparse', 'os', 'owners', 'presubmit_support', 'random', 're', 'string', 'subprocess', 'suggest_reviewers', 'sys', 'tempfile', diff --git a/tests/presubmit_unittest.py b/tests/presubmit_unittest.py index b2cb19083..c7ad8191d 100755 --- a/tests/presubmit_unittest.py +++ b/tests/presubmit_unittest.py @@ -141,7 +141,8 @@ class PresubmitUnittest(PresubmitTestsBase): 'NotImplementedException', 'OutputApi', 'ParseFiles', 'PresubmitExecuter', 'PresubmitOutput', 'ScanSubDirs', 'SvnAffectedFile', 'SvnChange', 'cPickle', 'cStringIO', - 'exceptions', 'fnmatch', 'gclient_utils', 'glob', 'json', 'load_files', + 'exceptions', 'fix_encoding', 'fnmatch', 'gclient_utils', 'glob', 'json', + 'load_files', 'logging', 'marshal', 'normpath', 'optparse', 'os', 'owners', 'pickle', 'presubmit_canned_checks', 'random', 're', 'scm', 'subprocess', 'sys', 'tempfile', 'time', 'traceback', 'types', 'unittest', 'urllib2', diff --git a/tests/trychange_unittest.py b/tests/trychange_unittest.py index c57cceea6..f2b52633e 100755 --- a/tests/trychange_unittest.py +++ b/tests/trychange_unittest.py @@ -45,7 +45,8 @@ class TryChangeUnittest(TryChangeTestsBase): 'EPILOG', 'Escape', 'GIT', 'GuessVCS', 'GetMungedDiff', 'HELP_STRING', 'InvalidScript', 'NoTryServerAccess', 'PrintSuccess', 'SCM', 
'SVN', 'TryChange', 'USAGE', - 'breakpad', 'datetime', 'errno', 'gcl', 'gclient_utils', 'getpass', + 'breakpad', 'datetime', 'errno', 'fix_encoding', 'gcl', 'gclient_utils', + 'getpass', 'json', 'logging', 'optparse', 'os', 'posixpath', 're', 'scm', 'shutil', 'sys', 'tempfile', 'urllib', ] diff --git a/trychange.py b/trychange.py index fb5e935c3..a848c5b24 100755 --- a/trychange.py +++ b/trychange.py @@ -39,6 +39,7 @@ try: except ImportError: gcl = None +import fix_encoding import gclient_utils import scm @@ -769,4 +770,5 @@ def TryChange(argv, if __name__ == "__main__": + fix_encoding.fix_encoding() sys.exit(TryChange(None, [], False))