[lit] Support parsing scripts with inconsistent or invalid encodings.

- For whatever reason, we have a lot of test files with bogus Unicode characters. This patch allows those scripts to still be parsed on Python 3 by changing the parsing logic to read the files in binary mode, and only requires the actual script commands to be convertible to ASCII.
- This patch has been tweaked to ensure that the command strings are not of the unicode type on Python 2.6-2.7.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@188398 91177308-0d34-0410-b5e6-96231b3b80d8
author: Daniel Dunbar <daniel@zuster.org> 2013-08-14 18:22:41 +0000
committer: Daniel Dunbar <daniel@zuster.org> 2013-08-14 18:22:41 +0000
commit: 35d5e9044c580c843eb6e825d87816619e6d7f8f (patch)
tree: 79149218fd7a688f6e3a740de9385d01eb4a6eb0 /utils/lit
parent: c97db8dfdd854430c28db74825a6bb7593b5ec05 (diff)
download: llvm-35d5e9044c580c843eb6e825d87816619e6d7f8f.tar.gz
llvm-35d5e9044c580c843eb6e825d87816619e6d7f8f.tar.bz2
llvm-35d5e9044c580c843eb6e825d87816619e6d7f8f.tar.xz
2 files changed, 48 insertions, 12 deletions
diff --git a/utils/lit/lit/TestRunner.py b/utils/lit/lit/TestRunner.py
index 068e4991b2..3c658eafcd 100644
--- a/utils/lit/lit/TestRunner.py
+++ b/utils/lit/lit/TestRunner.py
@@ -305,24 +305,57 @@ def isExpectedFail(test, xfails):
 
     return False
 
-def parseIntegratedTestScriptCommands(sourcepath):
+def parseIntegratedTestScriptCommands(source_path):
     """
     parseIntegratedTestScriptCommands(source_path) -> commands
 
     Parse the commands in an integrated test script file into a list of
     (line_number, command_type, line).
     """
-    line_number = 0
-    for ln in open(sourcepath):
-        line_number += 1
-        if 'RUN:' in ln:
-            yield (line_number, 'RUN', ln[ln.index('RUN:')+4:])
-        elif 'XFAIL:' in ln:
-            yield (line_number, 'XFAIL', ln[ln.index('XFAIL:') + 6:])
-        elif 'REQUIRES:' in ln:
-            yield (line_number, 'REQUIRES', ln[ln.index('REQUIRES:') + 9:])
-        elif 'END.' in ln:
-            yield (line_number, 'END', ln[ln.index('END.') + 4:])
+
+    # This code is carefully written to be dual compatible with Python 2.5+ and
+    # Python 3 without requiring input files to always have valid codings. The
+    # trick we use is to open the file in binary mode and use the regular
+    # expression library to find the commands, with it scanning strings in
+    # Python2 and bytes in Python3.
+    #
+    # Once we find a match, we do require each script line to be decodable to
+    # ascii, so we convert the outputs to ascii before returning. This way the
+    # remaining code can work with "strings" agnostic of the executing Python
+    # version.
+    
+    def to_bytes(str):
+        # Encode to Latin1 to get binary data.
+        return str.encode('ISO-8859-1')
+    keywords = ('RUN:', 'XFAIL:', 'REQUIRES:', 'END.')
+    keywords_re = re.compile(
+        to_bytes("(%s)(.*)\n" % ("|".join(k for k in keywords),)))
+
+    f = open(source_path, 'rb')
+    try:
+        # Read the entire file contents.
+        data = f.read()
+
+        # Iterate over the matches.
+        line_number = 1
+        last_match_position = 0
+        for match in keywords_re.finditer(data):
+            # Compute the updated line number by counting the intervening
+            # newlines.
+            match_position = match.start()
+            line_number += data.count(to_bytes('\n'), last_match_position,
+                                      match_position)
+            last_match_position = match_position
+
+            # Convert the keyword and line to ascii strings and yield the
+            # command. Note that we take care to return regular strings in
+            # Python 2, to avoid other code having to differentiate between the
+            # str and unicode types.
+            keyword,ln = match.groups()
+            yield (line_number, str(keyword[:-1].decode('ascii')),
+                   str(ln.decode('ascii')))
+    finally:
+        f.close()
 
 def parseIntegratedTestScript(test, normalize_slashes=False,
                               extra_substitutions=[]):
diff --git a/utils/lit/tests/shtest-encoding.py b/utils/lit/tests/shtest-encoding.py
new file mode 100644
index 0000000000..dfc987f6df
--- /dev/null
+++ b/utils/lit/tests/shtest-encoding.py
@@ -0,0 +1,3 @@
+# RUN: true
+
+# Here is a string that cannot be decoded in line mode: Ā.
author	Daniel Dunbar <daniel@zuster.org>	2013-08-14 18:22:41 +0000
committer	Daniel Dunbar <daniel@zuster.org>	2013-08-14 18:22:41 +0000
commit	35d5e9044c580c843eb6e825d87816619e6d7f8f (patch)
tree	79149218fd7a688f6e3a740de9385d01eb4a6eb0 /utils/lit
parent	c97db8dfdd854430c28db74825a6bb7593b5ec05 (diff)
download	llvm-35d5e9044c580c843eb6e825d87816619e6d7f8f.tar.gz llvm-35d5e9044c580c843eb6e825d87816619e6d7f8f.tar.bz2 llvm-35d5e9044c580c843eb6e825d87816619e6d7f8f.tar.xz