shaka-packager/tools/find_runtime_symbols/static_symbols.py

# Copyright (c) 2012 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import bisect
import re


# All patterns below are raw strings: sequences such as "\(" or "\s" are not
# valid Python string escapes and trigger warnings in non-raw literals.

# One innermost parenthesized argument list, optionally followed by "const".
_ARGUMENT_TYPE_PATTERN = re.compile(r'\([^()]*\)(\s*const)?')
# One innermost <...> template argument list.
_TEMPLATE_ARGUMENT_PATTERN = re.compile(r'<[^<>]*>')
# A leading prefix that ends in whitespace followed by "name::"; only the
# "name::" part is captured (group 1).
_LEADING_TYPE_PATTERN = re.compile(r'^.*\s+(\w+::)')
# One row of a section header table, captured as eleven groups: bracketed
# number, name (may be empty), type, three hexadecimal columns, one decimal
# column, a flags column and three more decimal columns.
# The misspelled name (missing the final "N") is kept because other code in
# this file refers to it.
_READELF_SECTION_HEADER_PATTER = re.compile(
    r'^\s*\[\s*(Nr|\d+)\]\s+(|\S+)\s+([A-Z_]+)\s+([0-9a-f]+)\s+'
    r'([0-9a-f]+)\s+([0-9a-f]+)\s+([0-9]+)\s+([WAXMSILGxOop]*)\s+'
    r'([0-9]+)\s+([0-9]+)\s+([0-9]+)')


class ParsingException(Exception):
  """Raised when tool output cannot be parsed."""

  def __str__(self):
    """Returns the repr() of the first argument, or '' if there is none."""
    # An exception built without arguments has an empty args tuple; indexing
    # it would raise IndexError while the error is being reported.
    if not self.args:
      return ''
    return repr(self.args[0])


class AddressMapping(object):
  """Exact-match table from a start address to the entry stored for it."""

  def __init__(self):
    self._symbol_map = dict()

  def append(self, start, entry):
    """Stores |entry| under |start|, replacing any entry already there."""
    self._symbol_map.update({start: entry})

  def find(self, address):
    """Returns the entry stored under exactly |address|, or None."""
    try:
      return self._symbol_map[address]
    except KeyError:
      return None


class RangeAddressMapping(AddressMapping):
  """Maps an address to the entry whose start address precedes it.

  Start addresses are kept in a list that is sorted lazily on lookup, so
  entries may be appended in any order.
  """

  def __init__(self):
    super(RangeAddressMapping, self).__init__()
    self._sorted_start_list = []
    # Stays True as long as starts have been appended in ascending order.
    self._is_sorted = True

  def append(self, start, entry):
    """Registers |entry| as beginning at |start|.

    If |start| equals the most recently appended start, the call is ignored
    and the earlier entry is kept.
    """
    if self._sorted_start_list:
      if self._sorted_start_list[-1] > start:
        # Out-of-order append: defer the sort until the next find().
        self._is_sorted = False
      elif self._sorted_start_list[-1] == start:
        return
    self._sorted_start_list.append(start)
    self._symbol_map[start] = entry

  def find(self, address):
    """Returns the entry with the greatest start strictly below |address|.

    Returns:
        The matching entry, or None if the mapping is empty or no start
        address is smaller than |address|.
    """
    if not self._sorted_start_list:
      return None
    if not self._is_sorted:
      self._sorted_start_list.sort()
      self._is_sorted = True
    # bisect_left counts the starts strictly smaller than |address|, so an
    # address equal to a start resolves to the entry before it.
    # NOTE(review): presumably intended for return-address style lookups --
    # confirm before switching to bisect_right.
    found_index = bisect.bisect_left(self._sorted_start_list, address)
    if found_index == 0:
      # No start lies below |address|. Without this guard the index -1 below
      # would wrap around and return the entry with the highest start.
      return None
    found_start_address = self._sorted_start_list[found_index - 1]
    return self._symbol_map[found_start_address]


class Procedure(object):
  """A procedure symbol together with the address range recorded for it.

  Attributes:
      start: Start address of the range (formatted as hex by __str__).
      end: End address of the range (formatted as hex by __str__).
      name: Name of the procedure.
  """

  def __init__(self, start, end, name):
    self.start = start
    self.end = end
    self.name = name

  def __eq__(self, other):
    """Returns True if |other| is a Procedure with equal start, end and name."""
    # Defer to the other operand for foreign types so that a comparison such
    # as `procedure == None` is False instead of raising AttributeError.
    if not isinstance(other, Procedure):
      return NotImplemented
    return (self.start == other.start and
            self.end == other.end and
            self.name == other.name)

  def __ne__(self, other):
    equal = self.__eq__(other)
    # NotImplemented must be passed through untouched; negating it would
    # yield a bogus boolean answer.
    if equal is NotImplemented:
      return NotImplemented
    return not equal

  def __str__(self):
    return '%x-%x: %s' % (self.start, self.end, self.name)


class ElfSection(object):
  """One row of an ELF section header table.

  The eleven constructor arguments are stored unmodified under the same
  names. __str__ formats address, size and offset as hexadecimal, so those
  three must be integers for it to work.
  """

  def __init__(
      self, number, name, stype, address, offset, size, es, flg, lk, inf, al):
    self.number = number
    self.name = name
    self.stype = stype
    self.address = address
    self.offset = offset
    self.size = size
    self.es = es
    self.flg = flg
    self.lk = lk
    self.inf = inf
    self.al = al

  def __eq__(self, other):
    """Returns True if |other| is an ElfSection with all fields equal."""
    # Defer to the other operand for foreign types so that a comparison such
    # as `section == None` is False instead of raising AttributeError.
    if not isinstance(other, ElfSection):
      return NotImplemented
    return (self.number == other.number and
            self.name == other.name and
            self.stype == other.stype and
            self.address == other.address and
            self.offset == other.offset and
            self.size == other.size and
            self.es == other.es and
            self.flg == other.flg and
            self.lk == other.lk and
            self.inf == other.inf and
            self.al == other.al)

  def __ne__(self, other):
    equal = self.__eq__(other)
    # NotImplemented must be passed through untouched; negating it would
    # yield a bogus boolean answer.
    if equal is NotImplemented:
      return NotImplemented
    return not equal

  def __str__(self):
    return '%x+%x(%x) %s' % (self.address, self.size, self.offset, self.name)


class StaticSymbolsInFile(object):
  """Represents static symbol information in a binary file.

  Holds the ELF section headers of the file plus three lookup tables
  (procedures, source files and typeinfo symbols). The load_* methods fill
  them and the find_*_by_runtime_address methods query them.
  """

  def __init__(self, my_name):
    """Creates an empty symbol table.

    Args:
        my_name: Name compared against vma.name when a runtime address is
            resolved.
    """
    self.my_name = my_name
    self._elf_sections = []
    self._procedures = RangeAddressMapping()
    self._sourcefiles = RangeAddressMapping()
    self._typeinfos = AddressMapping()

  def _append_elf_section(self, elf_section):
    self._elf_sections.append(elf_section)

  def _append_procedure(self, start, procedure):
    self._procedures.append(start, procedure)

  def _append_sourcefile(self, start, sourcefile):
    self._sourcefiles.append(start, sourcefile)

  def _append_typeinfo(self, start, typeinfo):
    self._typeinfos.append(start, typeinfo)

  def _find_symbol_by_runtime_address(self, address, vma, target):
    """Translates a runtime address and looks it up in |target|.

    Args:
        address: Runtime address to resolve.
        vma: Object with begin, end, offset and name attributes describing
            the memory region that |address| is expected to lie in.
        target: Mapping object whose find() is called with the translated
            address.

    Returns:
        The result of target.find(), or None if |address| is outside
        [vma.begin, vma.end), vma.name differs from my_name, or no loaded
        section covers the corresponding file offset.
    """
    if not (vma.begin <= address < vma.end):
      return None

    if vma.name != self.my_name:
      return None

    file_offset = address - (vma.begin - vma.offset)
    elf_address = None
    # The loop does not stop at the first hit: if several sections cover
    # |file_offset|, the last one in the list determines the result.
    for section in self._elf_sections:
      if section.offset <= file_offset < (section.offset + section.size):
        elf_address = section.address + file_offset - section.offset
    # A translated address of 0 is treated like "no section matched".
    if not elf_address:
      return None

    return target.find(elf_address)

  def find_procedure_by_runtime_address(self, address, vma):
    """Looks up |address| in the procedure table; see the helper above."""
    return self._find_symbol_by_runtime_address(address, vma, self._procedures)

  def find_sourcefile_by_runtime_address(self, address, vma):
    """Looks up |address| in the source file table."""
    return self._find_symbol_by_runtime_address(address, vma, self._sourcefiles)

  def find_typeinfo_by_runtime_address(self, address, vma):
    """Looks up |address| in the typeinfo table (exact match only)."""
    return self._find_symbol_by_runtime_address(address, vma, self._typeinfos)

  def load_readelf_ew(self, f):
    """Loads ELF section headers from an iterable of text lines.

    Lines are skipped until one reads 'Section Headers:'. Every following
    line that matches the section header pattern is stored as an ElfSection;
    reading stops at 'Key to Flags:' or 'Program Headers:'.
    NOTE(review): the name suggests the input is `readelf -e -W` output --
    confirm against the caller.

    Args:
        f: Iterable yielding lines. It is consumed in two passes over the
            same iterator, so it must resume where the first loop stopped.

    Returns:
        None.
    """
    found_header = False
    for line in f:
      if line.rstrip() == 'Section Headers:':
        found_header = True
        break
    if not found_header:
      return None

    for line in f:
      line = line.rstrip()
      matched = _READELF_SECTION_HEADER_PATTER.match(line)
      if matched:
        self._append_elf_section(ElfSection(
            int(matched.group(1), 10), # number
            matched.group(2), # name
            matched.group(3), # stype
            int(matched.group(4), 16), # address
            int(matched.group(5), 16), # offset
            int(matched.group(6), 16), # size
            matched.group(7), # es
            matched.group(8), # flg
            matched.group(9), # lk
            matched.group(10), # inf
            matched.group(11) # al
            ))
      else:
        if line in ('Key to Flags:', 'Program Headers:'):
          break

  def load_readelf_debug_decodedline_file(self, input_file):
    """Loads "<hex address> <source file> ..." lines into the source table.

    Args:
        input_file: Iterable yielding lines. The first whitespace-separated
            field is parsed as a hexadecimal address, the second is stored
            as the source file; anything after that is ignored.

    Raises:
        ParsingException: A non-blank line has fewer than two fields.
        ValueError: The first field is not a hexadecimal number.
    """
    for line in input_file:
      splitted = line.rstrip().split(None, 2)
      if not splitted:
        # Blank line (e.g. a trailing newline at the end of the input).
        continue
      if len(splitted) < 2:
        raise ParsingException('Invalid decodedline output.')
      self._append_sourcefile(int(splitted[0], 16), splitted[1])

  @staticmethod
  def _parse_nm_bsd_line(line):
    """Splits one nm line into (value, type, name) by fixed columns.

    Two layouts are accepted: an 8-character value field followed by a space
    at index 8, or a 16-character value field followed by a space at index
    16. The type is the single character after that space and the name
    starts two characters later.

    Raises:
        ParsingException: The line matches neither layout, including lines
            that are too short to contain a type character.
    """
    # The length checks turn short or blank lines into ParsingException
    # rather than letting the indexing below raise IndexError.
    if len(line) > 9 and line[8] == ' ':
      return line[0:8], line[9], line[11:]
    elif len(line) > 17 and line[16] == ' ':
      return line[0:16], line[17], line[19:]
    raise ParsingException('Invalid nm output.')

  @staticmethod
  def _get_short_function_name(function):
    """Strips argument lists, template arguments and a leading type.

    Parenthesized and angle-bracketed groups are removed repeatedly, from
    the innermost outwards, until none is left. Then any prefix ending in
    whitespace before a "name::" qualifier is dropped.
    """
    while True:
      function, number = _ARGUMENT_TYPE_PATTERN.subn('', function)
      if not number:
        break
    while True:
      function, number = _TEMPLATE_ARGUMENT_PATTERN.subn('', function)
      if not number:
        break
    # Raw string: "\g" is not a valid escape in a regular string literal.
    return _LEADING_TYPE_PATTERN.sub(r'\g<1>', function)

  def load_nm_bsd(self, f, mangled=False):
    """Loads procedure and typeinfo symbols from nm output lines.

    Each address change closes the pending symbol: it is stored as a
    Procedure spanning from its own start to the new address. Before the
    first symbol the pending name is '' with start 0, and the final symbol
    is stored with end equal to its start.
    NOTE(review): presumably the input must be sorted by address (e.g.
    `nm -n`) -- confirm against the caller.

    Args:
        f: Iterable yielding nm lines.
        mangled: If False, names are shortened with
            _get_short_function_name() and symbols whose name starts with
            'typeinfo' are also recorded in the typeinfo table. If True,
            names are stored as-is, including the "<address>" tag added
            below.

    Raises:
        ParsingException: A line cannot be split into nm columns.
    """
    last_start = 0
    routine = ''

    for line in f:
      line = line.rstrip()
      sym_value, sym_type, sym_name = self._parse_nm_bsd_line(line)

      # Symbols without a value have a blank value column; skip them.
      if sym_value[0] == ' ':
        continue

      start_val = int(sym_value, 16)

      if (sym_type in ('r', 'R', 'D', 'U', 'd', 'V') and
          (not mangled and sym_name.startswith('typeinfo'))):
        self._append_typeinfo(start_val, sym_name)

      # It's possible for two symbols to share the same address, if
      # one is a zero-length variable (like __start_google_malloc) or
      # one symbol is a weak alias to another (like __libc_malloc).
      # In such cases, we want to ignore all values except for the
      # actual symbol, which in nm-speak has type "T".  The logic
      # below does this, though it's a bit tricky: what happens when
      # we have a series of lines with the same address, is the first
      # one gets queued up to be processed.  However, it won't
      # *actually* be processed until later, when we read a line with
      # a different address.  That means that as long as we're reading
      # lines with the same address, we have a chance to replace that
      # item in the queue, which we do whenever we see a 'T' entry --
      # that is, a line with type 'T'.  If we never see a 'T' entry,
      # we'll just go ahead and process the first entry (which never
      # got touched in the queue), and ignore the others.
      if start_val == last_start and (sym_type == 't' or sym_type == 'T'):
        # We are the 'T' symbol at this address, replace previous symbol.
        routine = sym_name
        continue
      elif start_val == last_start:
        # We're not the 'T' symbol at this address, so ignore us.
        continue

      # Tag this routine with the starting address in case the image
      # has multiple occurrences of this routine.  We use a syntax
      # that resembles template parameters that are automatically
      # stripped out by _get_short_function_name().
      sym_name += "<%016x>" % start_val

      if not mangled:
        routine = self._get_short_function_name(routine)
      self._append_procedure(
          last_start, Procedure(last_start, start_val, routine))

      last_start = start_val
      routine = sym_name

    # Flush the symbol that is still pending after the last line.
    if not mangled:
      routine = self._get_short_function_name(routine)
    self._append_procedure(
        last_start, Procedure(last_start, last_start, routine))
Start with media/mp4, media/webm and base codes from Chromium. 2013-09-24 01:35:40 +00:00			`# Copyright (c) 2012 The Chromium Authors. All rights reserved.`
			`# Use of this source code is governed by a BSD-style license that can be`
			`# found in the LICENSE file.`

			`import bisect`
			`import re`


			`_ARGUMENT_TYPE_PATTERN = re.compile('\([^()]\)(\sconst)?')`
			`_TEMPLATE_ARGUMENT_PATTERN = re.compile('<[^<>]*>')`
			`_LEADING_TYPE_PATTERN = re.compile('^.*\s+(\w+::)')`
			`_READELF_SECTION_HEADER_PATTER = re.compile(`
			`'^\s\[\s(Nr\|\d+)\]\s+(\|\S+)\s+([A-Z_]+)\s+([0-9a-f]+)\s+'`
			`'([0-9a-f]+)\s+([0-9a-f]+)\s+([0-9]+)\s+([WAXMSILGxOop]*)\s+'`
			`'([0-9]+)\s+([0-9]+)\s+([0-9]+)')`


			`class ParsingException(Exception):`
			`def __str__(self):`
			`return repr(self.args[0])`


			`class AddressMapping(object):`
			`def __init__(self):`
			`self._symbol_map = {}`

			`def append(self, start, entry):`
			`self._symbol_map[start] = entry`

			`def find(self, address):`
			`return self._symbol_map.get(address)`


			`class RangeAddressMapping(AddressMapping):`
			`def __init__(self):`
			`super(RangeAddressMapping, self).__init__()`
			`self._sorted_start_list = []`
			`self._is_sorted = True`

			`def append(self, start, entry):`
			`if self._sorted_start_list:`
			`if self._sorted_start_list[-1] > start:`
			`self._is_sorted = False`
			`elif self._sorted_start_list[-1] == start:`
			`return`
			`self._sorted_start_list.append(start)`
			`self._symbol_map[start] = entry`

			`def find(self, address):`
			`if not self._sorted_start_list:`
			`return None`
			`if not self._is_sorted:`
			`self._sorted_start_list.sort()`
			`self._is_sorted = True`
			`found_index = bisect.bisect_left(self._sorted_start_list, address)`
			`found_start_address = self._sorted_start_list[found_index - 1]`
			`return self._symbol_map[found_start_address]`


			`class Procedure(object):`
			`"""A class for a procedure symbol and an address range for the symbol."""`

			`def __init__(self, start, end, name):`
			`self.start = start`
			`self.end = end`
			`self.name = name`

			`def __eq__(self, other):`
			`return (self.start == other.start and`
			`self.end == other.end and`
			`self.name == other.name)`

			`def __ne__(self, other):`
			`return not self.__eq__(other)`

			`def __str__(self):`
			`return '%x-%x: %s' % (self.start, self.end, self.name)`


			`class ElfSection(object):`
			`"""A class for an elf section header."""`

			`def __init__(`
			`self, number, name, stype, address, offset, size, es, flg, lk, inf, al):`
			`self.number = number`
			`self.name = name`
			`self.stype = stype`
			`self.address = address`
			`self.offset = offset`
			`self.size = size`
			`self.es = es`
			`self.flg = flg`
			`self.lk = lk`
			`self.inf = inf`
			`self.al = al`

			`def __eq__(self, other):`
			`return (self.number == other.number and`
			`self.name == other.name and`
			`self.stype == other.stype and`
			`self.address == other.address and`
			`self.offset == other.offset and`
			`self.size == other.size and`
			`self.es == other.es and`
			`self.flg == other.flg and`
			`self.lk == other.lk and`
			`self.inf == other.inf and`
			`self.al == other.al)`

			`def __ne__(self, other):`
			`return not self.__eq__(other)`

			`def __str__(self):`
			`return '%x+%x(%x) %s' % (self.address, self.size, self.offset, self.name)`


			`class StaticSymbolsInFile(object):`
			`"""Represents static symbol information in a binary file."""`

			`def __init__(self, my_name):`
			`self.my_name = my_name`
			`self._elf_sections = []`
			`self._procedures = RangeAddressMapping()`
			`self._sourcefiles = RangeAddressMapping()`
			`self._typeinfos = AddressMapping()`

			`def _append_elf_section(self, elf_section):`
			`self._elf_sections.append(elf_section)`

			`def _append_procedure(self, start, procedure):`
			`self._procedures.append(start, procedure)`

			`def _append_sourcefile(self, start, sourcefile):`
			`self._sourcefiles.append(start, sourcefile)`

			`def _append_typeinfo(self, start, typeinfo):`
			`self._typeinfos.append(start, typeinfo)`

			`def _find_symbol_by_runtime_address(self, address, vma, target):`
			`if not (vma.begin <= address < vma.end):`
			`return None`

			`if vma.name != self.my_name:`
			`return None`

			`file_offset = address - (vma.begin - vma.offset)`
			`elf_address = None`
			`for section in self._elf_sections:`
			`if section.offset <= file_offset < (section.offset + section.size):`
			`elf_address = section.address + file_offset - section.offset`
			`if not elf_address:`
			`return None`

			`return target.find(elf_address)`

			`def find_procedure_by_runtime_address(self, address, vma):`
			`return self._find_symbol_by_runtime_address(address, vma, self._procedures)`

			`def find_sourcefile_by_runtime_address(self, address, vma):`
			`return self._find_symbol_by_runtime_address(address, vma, self._sourcefiles)`

			`def find_typeinfo_by_runtime_address(self, address, vma):`
			`return self._find_symbol_by_runtime_address(address, vma, self._typeinfos)`

			`def load_readelf_ew(self, f):`
			`found_header = False`
			`for line in f:`
			`if line.rstrip() == 'Section Headers:':`
			`found_header = True`
			`break`
			`if not found_header:`
			`return None`

			`for line in f:`
			`line = line.rstrip()`
			`matched = _READELF_SECTION_HEADER_PATTER.match(line)`
			`if matched:`
			`self._append_elf_section(ElfSection(`
			`int(matched.group(1), 10), # number`
			`matched.group(2), # name`
			`matched.group(3), # stype`
			`int(matched.group(4), 16), # address`
			`int(matched.group(5), 16), # offset`
			`int(matched.group(6), 16), # size`
			`matched.group(7), # es`
			`matched.group(8), # flg`
			`matched.group(9), # lk`
			`matched.group(10), # inf`
			`matched.group(11) # al`
			`))`
			`else:`
			`if line in ('Key to Flags:', 'Program Headers:'):`
			`break`

			`def load_readelf_debug_decodedline_file(self, input_file):`
			`for line in input_file:`
			`splitted = line.rstrip().split(None, 2)`
			`self._append_sourcefile(int(splitted[0], 16), splitted[1])`

			`@staticmethod`
			`def _parse_nm_bsd_line(line):`
			`if line[8] == ' ':`
			`return line[0:8], line[9], line[11:]`
			`elif line[16] == ' ':`
			`return line[0:16], line[17], line[19:]`
			`raise ParsingException('Invalid nm output.')`

			`@staticmethod`
			`def _get_short_function_name(function):`
			`while True:`
			`function, number = _ARGUMENT_TYPE_PATTERN.subn('', function)`
			`if not number:`
			`break`
			`while True:`
			`function, number = _TEMPLATE_ARGUMENT_PATTERN.subn('', function)`
			`if not number:`
			`break`
			`return _LEADING_TYPE_PATTERN.sub('\g<1>', function)`

			`def load_nm_bsd(self, f, mangled=False):`
			`last_start = 0`
			`routine = ''`

			`for line in f:`
			`line = line.rstrip()`
			`sym_value, sym_type, sym_name = self._parse_nm_bsd_line(line)`

			`if sym_value[0] == ' ':`
			`continue`

			`start_val = int(sym_value, 16)`

			`if (sym_type in ('r', 'R', 'D', 'U', 'd', 'V') and`
			`(not mangled and sym_name.startswith('typeinfo'))):`
			`self._append_typeinfo(start_val, sym_name)`

			`# It's possible for two symbols to share the same address, if`
			`# one is a zero-length variable (like __start_google_malloc) or`
			`# one symbol is a weak alias to another (like __libc_malloc).`
			`# In such cases, we want to ignore all values except for the`
			`# actual symbol, which in nm-speak has type "T". The logic`
			`# below does this, though it's a bit tricky: what happens when`
			`# we have a series of lines with the same address, is the first`
			`# one gets queued up to be processed. However, it won't`
			`# actually be processed until later, when we read a line with`
			`# a different address. That means that as long as we're reading`
			`# lines with the same address, we have a chance to replace that`
			`# item in the queue, which we do whenever we see a 'T' entry --`
			`# that is, a line with type 'T'. If we never see a 'T' entry,`
			`# we'll just go ahead and process the first entry (which never`
			`# got touched in the queue), and ignore the others.`
			`if start_val == last_start and (sym_type == 't' or sym_type == 'T'):`
			`# We are the 'T' symbol at this address, replace previous symbol.`
			`routine = sym_name`
			`continue`
			`elif start_val == last_start:`
			`# We're not the 'T' symbol at this address, so ignore us.`
			`continue`

			`# Tag this routine with the starting address in case the image`
			`# has multiple occurrences of this routine. We use a syntax`
			`# that resembles template paramters that are automatically`
			`# stripped out by ShortFunctionName()`
			`sym_name += "<%016x>" % start_val`

			`if not mangled:`
			`routine = self._get_short_function_name(routine)`
			`self._append_procedure(`
			`last_start, Procedure(last_start, start_val, routine))`

			`last_start = start_val`
			`routine = sym_name`

			`if not mangled:`
			`routine = self._get_short_function_name(routine)`
			`self._append_procedure(`
			`last_start, Procedure(last_start, last_start, routine))`