summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--bindings/python/llvm/disassembler.py564
-rw-r--r--bindings/python/llvm/tests/test_disassembler.py62
2 files changed, 626 insertions, 0 deletions
diff --git a/bindings/python/llvm/disassembler.py b/bindings/python/llvm/disassembler.py
new file mode 100644
index 0000000000..d1fd789dba
--- /dev/null
+++ b/bindings/python/llvm/disassembler.py
@@ -0,0 +1,564 @@
+#===- disassembler.py - Python LLVM Bindings -----------------*- python -*--===#
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+#===------------------------------------------------------------------------===#
+
+from abc import ABCMeta
+from abc import abstractmethod
+
+from ctypes import CFUNCTYPE
+from ctypes import POINTER
+from ctypes import byref
+from ctypes import c_char_p
+from ctypes import c_int
+from ctypes import c_ubyte
+from ctypes import c_uint64
+from ctypes import c_uint
+from ctypes import c_void_p
+from ctypes import memmove
+
+from .common import CachedProperty
+from .common import LLVMObject
+from .common import c_object_p
+from .common import get_library
+
+__all__ = [
+ 'DisassemblerByteArraySource',
+ 'DisassemblerFileSource',
+ 'DisassemblerSource',
+ 'Disassembler',
+ 'Instruction',
+ 'Operand',
+ 'Token',
+]
+
+callbacks = {}
+
+class DisassemblerSource:
+ """Abstract base class for disassembler input.
+
+ This defines the interface to which inputs to the disassembler must
+ conform.
+
+ Basically, the disassembler input is a read-only sequence of a finite
+ length.
+ """
+ __metaclass__ = ABCMeta
+
+ @abstractmethod
+ def __len__(self):
+ """Returns the number of bytes that are available for input."""
+ pass
+
+ @abstractmethod
+ def get_byte(self, address):
+ """Returns the byte at the specified address."""
+ pass
+
+ @abstractmethod
+ def start_address(self):
+ """Returns the address at which to start fetch bytes, as a long."""
+ pass
+
+class DisassemblerByteArraySource(DisassemblerSource):
+ """A disassembler source for byte arrays."""
+
+ def __init__(self, b):
+ self._array = b
+
+ def __len__(self):
+ return len(self._array)
+
+ def get_byte(self, address):
+ return self._array[address]
+
+ def start_address(self):
+ return 0
+
+class DisassemblerFileSource(DisassemblerSource):
+ """A disassembler source for file segments.
+
+ This allows you to feed in segments of a file into a Disassembler.
+ """
+
+ def __init__(self, filename, start_offset, length=None, end_offset=None,
+ start_address=None):
+ """Create a new source from a file.
+
+ A source begins at a specified byte offset and can be defined in terms
+ of byte length of the end byte offset.
+ """
+ if length is None and end_offset is None:
+ raise Exception('One of length or end_offset must be defined.')
+
+ self._start_address = start_address
+ if self._start_address is None:
+ self._start_address = 0
+
+ count = length
+ if length is None:
+ count = end_offset - start_offset
+
+ with open(filename, 'rb') as fh:
+ fh.seek(start_offset)
+
+ # FIXME handle case where read bytes != requested
+ self._buf = fh.read(count)
+
+ def __len__(self):
+ return len(self._buf)
+
+ def get_byte(self, address):
+ return self._buf[address - self._start_address]
+
+ def start_address(self):
+ return self._start_address
+
+class Disassembler(LLVMObject):
+ """Interface to LLVM's enhanced disassembler.
+
+ The API is slightly different from the C API in that we tightly couple a
+ disassembler instance to an input source. This saves an extra level of
+ abstraction and makes the Python implementation easier.
+ """
+
+ SYNTAX_X86_INTEL = 0
+ SYNTAX_X86_ATT = 1
+ SYNTAX_ARM_UAL = 2
+
+ def __init__(self, triple, source, syntax=0):
+ """Create a new disassembler instance.
+
+ Arguments:
+
+ triple -- str target type (e.g. x86_64-apple-darwin10)
+ source -- DisassemblerSource instance to be fed into this disassembler.
+ syntax -- The assembly syntax to use. One of the SYNTAX_* class
+ constants. e.g. EnhancedDisassembler.SYNTAX_X86_INTEL
+ """
+ assert isinstance(source, DisassemblerSource)
+
+ ptr = c_object_p()
+ result = lib.EDGetDisassembler(byref(ptr), c_char_p(triple),
+ c_int(syntax))
+ if result != 0:
+ raise Exception('Non-0 return code.')
+
+ LLVMObject.__init__(self, ptr)
+
+ self._source = source
+
+ def get_instructions(self):
+ """Obtain the instructions from the input.
+
+ This is a generator for Instruction instances.
+
+ By default, this will return instructions for the entire source which
+ has been defined. It does this by querying the source's start_address()
+ method and continues to request instructions until len(source) is
+ exhausted.
+ """
+
+ # We currently obtain 1 instruction at a time because it is easiest.
+
+ # This serves as our EDByteReaderCallback. It is a proxy between C and
+ # the Python DisassemblerSource.
+ def byte_reader(dest, address, arg):
+ try:
+ byte = self._source.get_byte(address)
+ memmove(dest, byte, 1)
+
+ return 0
+ except:
+ return -1
+
+ address = self._source.start_address()
+ end_address = address + len(self._source)
+ cb = callbacks['byte_reader'](byte_reader)
+ while address < end_address:
+ ptr = c_object_p()
+
+ result = lib.EDCreateInsts(byref(ptr), c_uint(1), self, cb,
+ address, c_void_p(None))
+
+ if result != 1:
+ raise Exception('Error obtaining instruction at address %d' %
+ address)
+
+ instruction = Instruction(ptr, self)
+ yield instruction
+
+ address += instruction.byte_size
+
+
+class Instruction(LLVMObject):
+ """Represents an individual instruction.
+
+ Instruction instances are obtained from Disassembler.get_instructions().
+ """
+ def __init__(self, ptr, disassembler):
+ """Create a new instruction.
+
+ Instructions are created from within this module. You should have no
+ need to call this from outside this module.
+ """
+ assert isinstance(ptr, c_object_p)
+ assert isinstance(disassembler, Disassembler)
+
+ LLVMObject.__init__(self, ptr, disposer=lib.EDReleaseInst)
+ self._disassembler = disassembler
+
+ def __str__(self):
+ s = c_char_p(None)
+ result = lib.EDGetInstString(byref(s), self)
+ if result != 0:
+ raise Exception('Non-0 return code.')
+
+ return s.value
+
+ @CachedProperty
+ def byte_size(self):
+ result = lib.EDInstByteSize(self)
+ if result == -1:
+ raise Exception('Error code returned.')
+
+ return result
+
+ @CachedProperty
+ def id(self):
+ i = c_uint()
+ result = lib.EDInstID(byref(i), self)
+ if result != 0:
+ raise Exception('Non-0 return code.')
+
+ return i.value
+
+ @CachedProperty
+ def is_branch(self):
+ result = lib.EDInstIsBranch(self)
+ if result == -1:
+ raise Exception('Error code returned.')
+
+ return result > 0
+
+ @CachedProperty
+ def is_move(self):
+ result = lib.EDInstIsMove(self)
+ if result == -1:
+ raise Exception('Error code returned.')
+
+ return result > 0
+
+ @CachedProperty
+ def branch_target_id(self):
+ result = lib.EDBranchTargetID(self)
+ if result == -1:
+ raise Exception('Error code returned.')
+
+ return result
+
+ @CachedProperty
+ def move_source_id(self):
+ result = lib.EDMoveSourceID(self)
+ if result == -1:
+ raise Exception('Error code returned.')
+
+ return result
+
+ def get_tokens(self):
+ """Obtain the tokens in this instruction.
+
+ This is a generator for Token instances.
+ """
+ count = lib.EDNumTokens(self)
+ if count == -1:
+ raise Exception('Error code returned.')
+
+ for i in range(0, count):
+ ptr = c_object_p()
+ result = lib.EDGetToken(byref(ptr), self, c_int(i))
+ if result != 0:
+ raise Exception('Non-0 return code.')
+
+ yield Token(ptr, self)
+
+ def get_operands(self):
+ """Obtain the operands in this instruction.
+
+ This is a generator for Operand instances.
+ """
+ count = lib.EDNumOperands(self)
+ if count == -1:
+ raise Exception('Error code returned.')
+
+ for i in range(0, count):
+ ptr = c_object_p()
+ result = lib.EDGetOperand(byref(ptr), self, c_int(i))
+ if result != 0:
+ raise Exception('Non-0 return code.')
+
+ yield Operand(ptr, self)
+
+class Token(LLVMObject):
+ def __init__(self, ptr, instruction):
+ assert isinstance(ptr, c_object_p)
+ assert isinstance(instruction, Instruction)
+
+ LLVMObject.__init__(self, ptr)
+
+ self._instruction = instruction
+
+ def __str__(self):
+ s = c_char_p(None)
+ result = lib.EDGetTokenString(byref(s), self)
+ if result != 0:
+ raise Exception('Non-0 return code.')
+
+ return s.value
+
+ @CachedProperty
+ def operand_index(self):
+ result = lib.EDOperandIndexForToken(self)
+ if result == -1:
+ raise Exception('Error code returned.')
+
+ return result
+
+ @CachedProperty
+ def is_whitespace(self):
+ result = lib.EDTokenIsWhitespace(self)
+ if result == -1:
+ raise Exception('Error code returned.')
+
+ return result > 0
+
+ @CachedProperty
+ def is_punctuation(self):
+ result = lib.EDTokenIsPunctuation(self)
+ if result == -1:
+ raise Exception('Error code returned.')
+
+ return result > 0
+
+ @CachedProperty
+ def is_opcode(self):
+ result = lib.EDTokenIsOpcode(self)
+ if result == -1:
+ raise Exception('Error code returned.')
+
+ return result > 0
+
+ @CachedProperty
+ def is_literal(self):
+ result = lib.EDTokenIsLiteral(self)
+ if result == -1:
+ raise Exception('Error code returned.')
+
+ return result > 0
+
+ @CachedProperty
+ def is_register(self):
+ result = lib.EDTokenIsRegister(self)
+ if result == -1:
+ raise Exception('Error code returned.')
+
+ return result > 0
+
+ @CachedProperty
+ def is_negative_literal(self):
+ result = lib.EDTokenIsNegativeLiteral(self)
+ if result == -1:
+ raise Exception('Error code returned.')
+
+ return result > 0
+
+ @CachedProperty
+ def absolute_value(self):
+ value = c_uint64()
+ result = lib.EDLiteralTokenAbsoluteValue(byref(value), self)
+ if result != 0:
+ raise Exception('Non-0 return code.')
+
+ return value
+
+ @CachedProperty
+ def register_value(self):
+ value = c_uint()
+ result = lib.EDRegisterTokenValue(byref(value), self)
+ if result != 0:
+ raise Exception('Non-0 return code.')
+
+ return value
+
+class Operand(LLVMObject):
+ """Represents an operand in an instruction.
+
+ FIXME support register evaluation.
+ """
+ def __init__(self, ptr, instruction):
+ assert isinstance(ptr, c_object_p)
+ assert isinstance(instruction, Instruction)
+
+ LLVMObject.__init__(self, ptr)
+
+ self._instruction = instruction
+
+ @CachedProperty
+ def is_register(self):
+ result = lib.EDOperandIsRegister(self)
+ if result == -1:
+ raise Exception('Error code returned.')
+
+ return result > 0
+
+ @CachedProperty
+ def is_immediate(self):
+ result = lib.EDOperandIsImmediate(self)
+ if result == -1:
+ raise Exception('Error code returned.')
+
+ return result > 0
+
+ @CachedProperty
+ def is_memory(self):
+ result = lib.EDOperandIsMemory(self)
+ if result == -1:
+ raise Exception('Error code returned.')
+
+ return result > 0
+
+ @CachedProperty
+ def register_value(self):
+ value = c_uint()
+ result = lib.EDRegisterOperandValue(byref(value), self)
+ if result != 0:
+ raise Exception('Non-0 return code.')
+
+ return value
+
+ @CachedProperty
+ def immediate_value(self):
+ value = c_uint64()
+ result = lib.EDImmediateOperandValue(byref(value), self)
+ if result != 0:
+ raise Exception('Non-0 return code.')
+
+ return value
+
+def register_library(library):
+ library.EDGetDisassembler.argtypes = [POINTER(c_object_p), c_char_p, c_int]
+ library.EDGetDisassembler.restype = c_int
+
+ library.EDGetRegisterName.argtypes = [POINTER(c_char_p), Disassembler,
+ c_uint]
+ library.EDGetRegisterName.restype = c_int
+
+ library.EDRegisterIsStackPointer.argtypes = [Disassembler, c_uint]
+ library.EDRegisterIsStackPointer.restype = c_int
+
+ library.EDRegisterIsProgramCounter.argtypes = [Disassembler, c_uint]
+ library.EDRegisterIsProgramCounter.restype = c_int
+
+ library.EDCreateInsts.argtypes = [POINTER(c_object_p), c_uint,
+ Disassembler, callbacks['byte_reader'], c_uint64, c_void_p]
+ library.EDCreateInsts.restype = c_uint
+
+ library.EDReleaseInst.argtypes = [Instruction]
+
+ library.EDInstByteSize.argtypes = [Instruction]
+ library.EDInstByteSize.restype = c_int
+
+ library.EDGetInstString.argtypes = [POINTER(c_char_p), Instruction]
+ library.EDGetInstString.restype = c_int
+
+ library.EDInstID.argtypes = [POINTER(c_uint), Instruction]
+ library.EDInstID.restype = c_int
+
+ library.EDInstIsBranch.argtypes = [Instruction]
+ library.EDInstIsBranch.restype = c_int
+
+ library.EDInstIsMove.argtypes = [Instruction]
+ library.EDInstIsMove.restype = c_int
+
+ library.EDBranchTargetID.argtypes = [Instruction]
+ library.EDBranchTargetID.restype = c_int
+
+ library.EDMoveSourceID.argtypes = [Instruction]
+ library.EDMoveSourceID.restype = c_int
+
+ library.EDMoveTargetID.argtypes = [Instruction]
+ library.EDMoveTargetID.restype = c_int
+
+ library.EDNumTokens.argtypes = [Instruction]
+ library.EDNumTokens.restype = c_int
+
+ library.EDGetToken.argtypes = [POINTER(c_object_p), Instruction, c_int]
+ library.EDGetToken.restype = c_int
+
+ library.EDGetTokenString.argtypes = [POINTER(c_char_p), Token]
+ library.EDGetTokenString.restype = c_int
+
+ library.EDOperandIndexForToken.argtypes = [Token]
+ library.EDOperandIndexForToken.restype = c_int
+
+ library.EDTokenIsWhitespace.argtypes = [Token]
+ library.EDTokenIsWhitespace.restype = c_int
+
+ library.EDTokenIsPunctuation.argtypes = [Token]
+ library.EDTokenIsPunctuation.restype = c_int
+
+ library.EDTokenIsOpcode.argtypes = [Token]
+ library.EDTokenIsOpcode.restype = c_int
+
+ library.EDTokenIsLiteral.argtypes = [Token]
+ library.EDTokenIsLiteral.restype = c_int
+
+ library.EDTokenIsRegister.argtypes = [Token]
+ library.EDTokenIsRegister.restype = c_int
+
+ library.EDTokenIsNegativeLiteral.argtypes = [Token]
+ library.EDTokenIsNegativeLiteral.restype = c_int
+
+ library.EDLiteralTokenAbsoluteValue.argtypes = [POINTER(c_uint64), Token]
+ library.EDLiteralTokenAbsoluteValue.restype = c_int
+
+ library.EDRegisterTokenValue.argtypes = [POINTER(c_uint), Token]
+ library.EDRegisterTokenValue.restype = c_int
+
+ library.EDNumOperands.argtypes = [Instruction]
+ library.EDNumOperands.restype = c_int
+
+ library.EDGetOperand.argtypes = [POINTER(c_object_p), Instruction, c_int]
+ library.EDGetOperand.restype = c_int
+
+ library.EDOperandIsRegister.argtypes = [Operand]
+ library.EDOperandIsRegister.restype = c_int
+
+ library.EDOperandIsImmediate.argtypes = [Operand]
+ library.EDOperandIsImmediate.restype = c_int
+
+ library.EDOperandIsMemory.argtypes = [Operand]
+ library.EDOperandIsMemory.restype = c_int
+
+ library.EDRegisterOperandValue.argtypes = [POINTER(c_uint), Operand]
+ library.EDRegisterOperandValue.restype = c_int
+
+ library.EDImmediateOperandValue.argtypes = [POINTER(c_uint64), Operand]
+ library.EDImmediateOperandValue.restype = c_int
+
+ library.EDEvaluateOperand.argtypes = [c_uint64, Operand,
+ callbacks['register_reader'], c_void_p]
+ library.EDEvaluateOperand.restype = c_int
+
+# Enhanced disassembler.
+callbacks['byte_reader'] = CFUNCTYPE(c_int, POINTER(c_ubyte), c_uint64,
+ c_void_p)
+callbacks['register_reader'] = CFUNCTYPE(c_int, POINTER(c_uint64), c_uint,
+ c_void_p)
+
+lib = get_library()
+register_library(lib)
diff --git a/bindings/python/llvm/tests/test_disassembler.py b/bindings/python/llvm/tests/test_disassembler.py
new file mode 100644
index 0000000000..6eb11a23dc
--- /dev/null
+++ b/bindings/python/llvm/tests/test_disassembler.py
@@ -0,0 +1,62 @@
+from unittest import expectedFailure
+from unittest import skip
+
+from .base import TestBase
+from ..disassembler import DisassemblerByteArraySource
+from ..disassembler import DisassemblerFileSource
+from ..disassembler import Disassembler
+from ..object import ObjectFile
+
+class TestDisassembler(TestBase):
+ def test_simple(self):
+ sequence = '\x67\xe3\x81' # jcxz -127
+ triple = 'i686-apple-darwin9'
+
+ source = DisassemblerByteArraySource(sequence)
+
+ disassembler = Disassembler(triple, source)
+ instructions = list(disassembler.get_instructions())
+
+ self.assertEqual(len(instructions), 1)
+
+ i = instructions[0]
+ self.assertEqual(str(i), '\tjcxz\t-127\n')
+ self.assertEqual(i.byte_size, 3)
+ self.assertEqual(i.id, 1032)
+ self.assertTrue(i.is_branch)
+ self.assertFalse(i.is_move)
+ self.assertEqual(i.branch_target_id, 0)
+
+ tokens = list(i.get_tokens())
+ self.assertEqual(len(tokens), 4)
+ token = tokens[0]
+ self.assertEqual(str(token), 'jcxz')
+ self.assertFalse(token.is_whitespace)
+ self.assertFalse(token.is_punctuation)
+ self.assertTrue(token.is_opcode)
+ self.assertFalse(token.is_literal)
+ self.assertFalse(token.is_register)
+
+ self.assertTrue(tokens[1].is_whitespace)
+
+ operands = list(i.get_operands())
+ self.assertEqual(len(operands), 1)
+
+ # TODO implement operand tests
+
+ @skip('This test is horribly broken and probably not even correct.')
+ def test_read_instructions(self):
+ filename = self.get_test_binary()
+ o = ObjectFile(filename=filename)
+
+ for symbol in o.get_symbols():
+ address = symbol.address
+ offset = symbol.file_offset
+ size = symbol.size
+
+ source = DisassemblerFileSource(filename, offset, length=size,
+ start_address=address)
+
+ disassembler = Disassembler('x86-generic-gnu-linux', source)
+ for instruction in disassembler.get_instructions():
+ print instruction