shaka-packager/tools/idl_parser/idl_lexer.py

#!/usr/bin/env python
# Copyright (c) 2013 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

""" Lexer for PPAPI IDL

The lexer uses the PLY library to build a tokenizer which understands both
WebIDL and Pepper tokens.

WebIDL, and WebIDL regular expressions can be found at:
   http://www.w3.org/TR/2012/CR-WebIDL-20120419/
PLY can be found at:
   http://www.dabeaz.com/ply/
"""

import os.path
import sys

#
# Try to load the ply module, if not, then assume it is in the third_party
# directory.
#
try:
  # Disable lint check which fails to find the ply module.
  # pylint: disable=F0401
  from ply import lex
except ImportError:
  # ply is not importable from the current sys.path: append
  # <directory of this file>/../../third_party to sys.path and retry.  A
  # second ImportError raised by the retry is not caught here.
  module_path, module_name = os.path.split(__file__)
  third_party = os.path.join(module_path, '..', '..', 'third_party')
  sys.path.append(third_party)
  # pylint: disable=F0401
  from ply import lex

#
# IDL Lexer
#
class IDLLexer(object):
  """Tokenizer for WebIDL and Pepper IDL text, built on PLY's lex module.

  Typical use: call Tokenize(data, filename) to load the text, then pull
  tokens one at a time with token() or all at once with GetTokens().

  The class-level 'tokens' and 'keywords' tables are the defaults.  __init__
  copies them into per-instance containers, so _AddKeywords/_DelKeywords on
  an instance do not modify the class-level tables.

  The underlying PLY lexer is created lazily by Lexer().  PLY reads the
  regular expression of every 't_<TYPE>' method from that method's
  docstring, so those docstrings must contain the regex and nothing else.
  """

  # 'literals' is a value expected by lex which specifies a list of valid
  # literal tokens, meaning the token type and token value are identical.
  literals = r'"*.(){}[],;:=+-/~|&^?<>'

  # 't_ignore' contains ignored characters (spaces and tabs)
  t_ignore = ' \t'

  # 'tokens' is a value required by lex which specifies the complete list
  # of valid token types.
  tokens = [
    # Data types
      'float',
      'integer',
      'string',

    # Symbol and keywords types
      'COMMENT',
      'identifier',

    # MultiChar operators
      'ELLIPSIS',
  ]

  # 'keywords' is a map of string to token type.  All tokens matching
  # KEYWORD_OR_SYMBOL are matched against keywords dictionary, to determine
  # if the token is actually a keyword.
  keywords = {
    'any' : 'ANY',
    'attribute' : 'ATTRIBUTE',
    'boolean' : 'BOOLEAN',
    'byte' : 'BYTE',
    'callback' : 'CALLBACK',
    'const' : 'CONST',
    'creator' : 'CREATOR',
    'Date' : 'DATE',
    'deleter' : 'DELETER',
    'dictionary' : 'DICTIONARY',
    'DOMString' : 'DOMSTRING',
    'double' : 'DOUBLE',
    'enum'  : 'ENUM',
    'false' : 'FALSE',
    'float' : 'FLOAT',
    'exception' : 'EXCEPTION',
    'getter': 'GETTER',
    'implements' : 'IMPLEMENTS',
    'Infinity' : 'INFINITY',
    'inherit' : 'INHERIT',
    'interface' : 'INTERFACE',
    'legacycaller' : 'LEGACYCALLER',
    'long' : 'LONG',
    # WebIDL spells this keyword 'NaN'; keyword lookup is case sensitive.
    'NaN' : 'NAN',
    'null' : 'NULL',
    'object' : 'OBJECT',
    'octet' : 'OCTET',
    'optional' : 'OPTIONAL',
    'or' : 'OR',
    'partial'  : 'PARTIAL',
    'readonly' : 'READONLY',
    'sequence' : 'SEQUENCE',
    'setter': 'SETTER',
    'short' : 'SHORT',
    'static' : 'STATIC',
    'stringifier' : 'STRINGIFIER',
    'typedef' : 'TYPEDEF',
    'true' : 'TRUE',
    'unsigned' : 'UNSIGNED',
    'unrestricted' : 'UNRESTRICTED',
    'void' : 'VOID'
  }

  # Token definitions
  #
  # Lex assumes any value or function in the form of 't_<TYPE>' represents a
  # regular expression where a match will emit a token of type <TYPE>.  In the
  # case of a function, the function is called when a match is made. These
  # definitions come from WebIDL.
  #
  # The order in which the rule functions are defined is significant to lex,
  # so do not reorder them.
  #
  # These need to be methods for lexer construction, despite not using self.
  # pylint: disable=R0201
  def t_ELLIPSIS(self, t):
    r'\.\.\.'
    return t

  # Regex needs to be in the docstring
  # pylint: disable=C0301
  def t_float(self, t):
    r'-?(([0-9]+\.[0-9]*|[0-9]*\.[0-9]+)([Ee][+-]?[0-9]+)?|[0-9]+[Ee][+-]?[0-9]+)'
    return t

  def t_integer(self, t):
    r'-?([1-9][0-9]*|0[Xx][0-9A-Fa-f]+|0[0-7]*)'
    return t


  # A line ending '\n', we use this to increment the line number.  Nothing is
  # returned, so no token is emitted for line endings.
  def t_LINE_END(self, t):
    r'\n+'
    self.AddLines(len(t.value))

  # We do not process escapes in the IDL strings.  Strings are exclusively
  # used for attributes and enums, and not used as typical 'C' constants.
  def t_string(self, t):
    r'"[^"]*"'
    # Drop the surrounding quotes; the token value is the raw contents.
    t.value = t.value[1:-1]
    # A string may span lines, so keep the line bookkeeping in step.
    self.AddLines(t.value.count('\n'))
    return t

  # A C or C++ style comment:  /* xxx */ or //
  # Consecutive '//' lines are matched as a single COMMENT token.
  def t_COMMENT(self, t):
    r'(/\*(.|\n)*?\*/)|(//.*(\n[ \t]*//.*)*)'
    self.AddLines(t.value.count('\n'))
    return t

  # A symbol or keyword.
  def t_KEYWORD_OR_SYMBOL(self, t):
    r'_?[A-Za-z][A-Za-z_0-9]*'

    # All non-keywords are assumed to be symbols.  The lookup uses the text
    # as written, so a leading underscore prevents a keyword match.
    t.type = self.keywords.get(t.value, 'identifier')

    # We strip leading underscores so that you can specify symbols with the same
    # value as a keywords (E.g. a dictionary named 'interface').
    if t.value.startswith('_'):
      t.value = t.value[1:]
    return t

  # Called by lex when no rule matches the input at the current position.
  # Writes a message to stderr and counts the error.  The lexer position is
  # not advanced here.
  def t_ANY_error(self, t):
    msg = 'Unrecognized input'
    line = self.Lexer().lineno

    # If that line has not been accounted for, then we must have hit
    # EoF, so compute the beginning of the line that caused the problem.
    # NOTE(review): 'index' starts as [0] while 'lineno' starts at 1 and both
    # grow together in AddLines, so this test looks true whenever they are in
    # step -- confirm the intended indexing before relying on the message.
    if line >= len(self.index):
      # Find the offset in the line of the first word causing the issue
      word = t.value.split()[0]
      offs = self.lines[line - 1].find(word)
      # Add the computed line's starting position
      self.index.append(self.Lexer().lexpos - offs)
      msg = 'Unexpected EoF reached after'

    pos = self.Lexer().lexpos - self.index[line]
    out = self.ErrorMessage(line, pos, msg)
    sys.stderr.write(out + '\n')
    self._lex_errors += 1


  def AddLines(self, count):
    """Advance the line counter by 'count' and record line start offsets.

    Sets the lexer position for the beginning of the next line.  In the case
    of multiple lines, tokens can not exist on any of the lines except the
    last one, so the recorded value for previous lines are unused.  We still
    fill the array however, to make sure the line count is correct.

    Args:
      count: Number of newlines consumed by the current match.
    """
    lexer = self.Lexer()
    lexer.lineno += count
    self.index.extend([lexer.lexpos] * count)

  def FileLineMsg(self, line, msg):
    """Return 'msg' prefixed with the file name and line number.

    Args:
      line: Line number; it is printed as 'line + 1'.
      msg: Text of the message.

    Returns:
      '<filename>(<line + 1>) : <msg>', or '<BuiltIn> : <msg>' when the lexer
      has no (or an empty) filename.
    """
    filename = self.Lexer().filename
    if filename:
      return "%s(%d) : %s" % (filename, line + 1, msg)
    return "<BuiltIn> : %s" % msg

  def SourceLine(self, line, pos):
    """Return the source line followed by a caret marker line.

    Args:
      line: 1-based line number; 'self.lines' is 0-based so 'line - 1' is used
          as the index.
      pos: Column at which the '^' marker is placed.
    """
    caret = ' ' * pos + '^'
    return "%s\n%s" % (self.lines[line - 1], caret)

  def ErrorMessage(self, line, pos, msg):
    """Return a full error report: file/line message, source line and caret."""
    return "\n%s\n%s" % (
        self.FileLineMsg(line, msg),
        self.SourceLine(line, pos))

  #
  # Tokenizer
  #
  # The token function returns the next token provided by IDLLexer for matching
  # against the leaf patterns.
  #
  def token(self):
    """Return the next token, or a false value when the input is exhausted.

    The most recent non-empty token is remembered in 'self.last'.
    """
    tok = self.Lexer().token()
    if tok:
      self.last = tok
    return tok


  def GetTokens(self):
    """Return a list of all remaining tokens.  'self.last' is not updated."""
    lexer = self.Lexer()
    outlist = []
    while True:
      t = lexer.token()
      if not t:
        break
      outlist.append(t)
    return outlist

  def Tokenize(self, data, filename='__no_file__'):
    """Load 'data' into the lexer so that tokens can be retrieved from it.

    Args:
      data: The IDL text to tokenize.
      filename: Name reported in error messages.
    """
    lexer = self.Lexer()
    lexer.lineno = 1
    lexer.filename = filename
    lexer.input(data)
    self.lines = data.split('\n')
    # The recorded line start offsets belong to the previous input; start
    # over so that error positions are computed against the new text.
    self.index = [0]

  def KnownTokens(self):
    """Return this instance's list of token types."""
    return self.tokens

  def Lexer(self):
    """Return the PLY lexer for this object, building it on first use."""
    if not self._lexobj:
      self._lexobj = lex.lex(object=self, lextab=None, optimize=0)
    return self._lexobj

  def _AddToken(self, token):
    """Register a token type.

    Raises:
      RuntimeError: If the token type is already registered.
    """
    if token in self.tokens:
      raise RuntimeError('Same token: ' + token)
    self.tokens.append(token)

  def _AddTokens(self, tokens):
    """Register every token type in 'tokens' (see _AddToken)."""
    for token in tokens:
      self._AddToken(token)

  def _AddKeywords(self, keywords):
    """Register keywords; each token type is the keyword in upper case.

    Only the keys of 'keywords' are used when a mapping is passed.
    """
    for key in keywords:
      value = key.upper()
      self._AddToken(value)
      self.keywords[key] = value

  def _DelKeywords(self, keywords):
    """Remove previously registered keywords and their token types."""
    for key in keywords:
      self.tokens.remove(key.upper())
      del self.keywords[key]

  def __init__(self):
    # Offset of the start of each line seen so far in the current input.
    self.index = [0]
    # Number of errors reported by t_ANY_error.
    self._lex_errors = 0
    # Not read anywhere in this class ('lines' below holds the source
    # lines); kept in case external code refers to it.
    self.linex = []
    self.filename = None
    # Per-instance copies of the class-level tables.
    self.keywords = {}
    self.tokens = []
    self._AddTokens(IDLLexer.tokens)
    self._AddKeywords(IDLLexer.keywords)
    # PLY lexer, created lazily by Lexer().
    self._lexobj = None
    # Last token returned by token().
    self.last = None
    # Source text split on '\n', set by Tokenize().
    self.lines = None

# If run by itself, attempt to build the lexer
# NOTE(review): this only constructs the IDLLexer wrapper.  The underlying PLY
# lexer is created on the first call to IDLLexer.Lexer(), which is not made
# here -- confirm whether building it was the intent.
if __name__ == '__main__':
  lexer_object = IDLLexer()
Start with media/mp4, media/webm and base codes from Chromium. 2013-09-24 01:35:40 +00:00			`#!/usr/bin/env python`
			`# Copyright (c) 2013 The Chromium Authors. All rights reserved.`
			`# Use of this source code is governed by a BSD-style license that can be`
			`# found in the LICENSE file.`

			`""" Lexer for PPAPI IDL`

			`The lexer uses the PLY library to build a tokenizer which understands both`
			`WebIDL and Pepper tokens.`

			`WebIDL, and WebIDL regular expressions can be found at:`
			`http://www.w3.org/TR/2012/CR-WebIDL-20120419/`
			`PLY can be found at:`
			`http://www.dabeaz.com/ply/`
			`"""`

			`import os.path`
			`import sys`

			`#`
			`# Try to load the ply module, if not, then assume it is in the third_party`
			`# directory.`
			`#`
			`try:`
			`# Disable lint check which fails to find the ply module.`
			`# pylint: disable=F0401`
			`from ply import lex`
			`except ImportError:`
			`module_path, module_name = os.path.split(__file__)`
			`third_party = os.path.join(module_path, '..', '..', 'third_party')`
			`sys.path.append(third_party)`
			`# pylint: disable=F0401`
			`from ply import lex`

			`#`
			`# IDL Lexer`
			`#`
			`class IDLLexer(object):`
			`# 'literals' is a value expected by lex which specifies a list of valid`
			`# literal tokens, meaning the token type and token value are identical.`
			`literals = r'"*.(){}[],;:=+-/~\|&^?<>'`

			`# 't_ignore' contains ignored characters (spaces and tabs)`
			`t_ignore = ' \t'`

			`# 'tokens' is a value required by lex which specifies the complete list`
			`# of valid token types.`
			`tokens = [`
			`# Data types`
			`'float',`
			`'integer',`
			`'string',`

			`# Symbol and keywords types`
			`'COMMENT',`
			`'identifier',`

			`# MultiChar operators`
			`'ELLIPSIS',`
			`]`

			`# 'keywords' is a map of string to token type. All tokens matching`
			`# KEYWORD_OR_SYMBOL are matched against keywords dictionary, to determine`
			`# if the token is actually a keyword.`
			`keywords = {`
			`'any' : 'ANY',`
			`'attribute' : 'ATTRIBUTE',`
			`'boolean' : 'BOOLEAN',`
			`'byte' : 'BYTE',`
			`'callback' : 'CALLBACK',`
			`'const' : 'CONST',`
			`'creator' : 'CREATOR',`
			`'Date' : 'DATE',`
			`'deleter' : 'DELETER',`
			`'dictionary' : 'DICTIONARY',`
			`'DOMString' : 'DOMSTRING',`
			`'double' : 'DOUBLE',`
			`'enum' : 'ENUM',`
			`'false' : 'FALSE',`
			`'float' : 'FLOAT',`
			`'exception' : 'EXCEPTION',`
			`'getter': 'GETTER',`
			`'implements' : 'IMPLEMENTS',`
			`'Infinity' : 'INFINITY',`
			`'inherit' : 'INHERIT',`
			`'interface' : 'INTERFACE',`
			`'legacycaller' : 'LEGACYCALLER',`
			`'long' : 'LONG',`
			`'Nan' : 'NAN',`
			`'null' : 'NULL',`
			`'object' : 'OBJECT',`
			`'octet' : 'OCTET',`
			`'optional' : 'OPTIONAL',`
			`'or' : 'OR',`
			`'partial' : 'PARTIAL',`
			`'readonly' : 'READONLY',`
			`'sequence' : 'SEQUENCE',`
			`'setter': 'SETTER',`
			`'short' : 'SHORT',`
			`'static' : 'STATIC',`
			`'stringifier' : 'STRINGIFIER',`
			`'typedef' : 'TYPEDEF',`
			`'true' : 'TRUE',`
			`'unsigned' : 'UNSIGNED',`
			`'unrestricted' : 'UNRESTRICTED',`
			`'void' : 'VOID'`
			`}`

			`# Token definitions`
			`#`
			`# Lex assumes any value or function in the form of 't_<TYPE>' represents a`
			`# regular expression where a match will emit a token of type <TYPE>. In the`
			`# case of a function, the function is called when a match is made. These`
			`# definitions come from WebIDL.`
			`#`
			`# These need to be methods for lexer construction, despite not using self.`
			`# pylint: disable=R0201`
			`def t_ELLIPSIS(self, t):`
			`r'\.\.\.'`
			`return t`

			`# Regex needs to be in the docstring`
			`# pylint: disable=C0301`
			`def t_float(self, t):`
			`r'-?(([0-9]+\.[0-9]\|[0-9]\.[0-9]+)([Ee][+-]?[0-9]+)?\|[0-9]+[Ee][+-]?[0-9]+)'`
			`return t`

			`def t_integer(self, t):`
			`r'-?([1-9][0-9]\|0[Xx][0-9A-Fa-f]+\|0[0-7])'`
			`return t`


			`# A line ending '\n', we use this to increment the line number`
			`def t_LINE_END(self, t):`
			`r'\n+'`
			`self.AddLines(len(t.value))`

			`# We do not process escapes in the IDL strings. Strings are exclusively`
			`# used for attributes and enums, and not used as typical 'C' constants.`
			`def t_string(self, t):`
			`r'"[^"]*"'`
			`t.value = t.value[1:-1]`
			`self.AddLines(t.value.count('\n'))`
			`return t`

			`# A C or C++ style comment: /* xxx */ or //`
			`def t_COMMENT(self, t):`
			`r'(/\(.\|\n)?\/)\|(//.(\n[ \t]//.)*)'`
			`self.AddLines(t.value.count('\n'))`
			`return t`

			`# A symbol or keyword.`
			`def t_KEYWORD_OR_SYMBOL(self, t):`
			`r'_?[A-Za-z][A-Za-z_0-9]*'`

			`# All non-keywords are assumed to be symbols`
			`t.type = self.keywords.get(t.value, 'identifier')`

			`# We strip leading underscores so that you can specify symbols with the same`
			`# value as a keywords (E.g. a dictionary named 'interface').`
			`if t.value[0] == '_':`
			`t.value = t.value[1:]`
			`return t`

			`def t_ANY_error(self, t):`
			`msg = 'Unrecognized input'`
			`line = self.Lexer().lineno`

			`# If that line has not been accounted for, then we must have hit`
			`# EoF, so compute the beginning of the line that caused the problem.`
			`if line >= len(self.index):`
			`# Find the offset in the line of the first word causing the issue`
			`word = t.value.split()[0]`
			`offs = self.lines[line - 1].find(word)`
			`# Add the computed line's starting position`
			`self.index.append(self.Lexer().lexpos - offs)`
			`msg = 'Unexpected EoF reached after'`

			`pos = self.Lexer().lexpos - self.index[line]`
			`out = self.ErrorMessage(line, pos, msg)`
			`sys.stderr.write(out + '\n')`
			`self._lex_errors += 1`


			`def AddLines(self, count):`
			`# Set the lexer position for the beginning of the next line. In the case`
			`# of multiple lines, tokens can not exist on any of the lines except the`
			`# last one, so the recorded value for previous lines are unused. We still`
			`# fill the array however, to make sure the line count is correct.`
			`self.Lexer().lineno += count`
			`for _ in range(count):`
			`self.index.append(self.Lexer().lexpos)`

			`def FileLineMsg(self, line, msg):`
			`# Generate a message containing the file and line number of a token.`
			`filename = self.Lexer().filename`
			`if filename:`
			`return "%s(%d) : %s" % (filename, line + 1, msg)`
			`return "<BuiltIn> : %s" % msg`

			`def SourceLine(self, line, pos):`
			`# Create a source line marker`
			`caret = ' ' * pos + '^'`
			`# We decrement the line number since the array is 0 based while the`
			`# line numbers are 1 based.`
			`return "%s\n%s" % (self.lines[line - 1], caret)`

			`def ErrorMessage(self, line, pos, msg):`
			`return "\n%s\n%s" % (`
			`self.FileLineMsg(line, msg),`
			`self.SourceLine(line, pos))`

			`#`
			`# Tokenizer`
			`#`
			`# The token function returns the next token provided by IDLLexer for matching`
			`# against the leaf paterns.`
			`#`
			`def token(self):`
			`tok = self.Lexer().token()`
			`if tok:`
			`self.last = tok`
			`return tok`


			`def GetTokens(self):`
			`outlist = []`
			`while True:`
			`t = self.Lexer().token()`
			`if not t:`
			`break`
			`outlist.append(t)`
			`return outlist`

			`def Tokenize(self, data, filename='__no_file__'):`
			`lexer = self.Lexer()`
			`lexer.lineno = 1`
			`lexer.filename = filename`
			`lexer.input(data)`
			`self.lines = data.split('\n')`

			`def KnownTokens(self):`
			`return self.tokens`

			`def Lexer(self):`
			`if not self._lexobj:`
			`self._lexobj = lex.lex(object=self, lextab=None, optimize=0)`
			`return self._lexobj`

			`def _AddToken(self, token):`
			`if token in self.tokens:`
			`raise RuntimeError('Same token: ' + token)`
			`self.tokens.append(token)`

			`def _AddTokens(self, tokens):`
			`for token in tokens:`
			`self._AddToken(token)`

			`def _AddKeywords(self, keywords):`
			`for key in keywords:`
			`value = key.upper()`
			`self._AddToken(value)`
			`self.keywords[key] = value`

			`def _DelKeywords(self, keywords):`
			`for key in keywords:`
			`self.tokens.remove(key.upper())`
			`del self.keywords[key]`

			`def __init__(self):`
			`self.index = [0]`
			`self._lex_errors = 0`
			`self.linex = []`
			`self.filename = None`
			`self.keywords = {}`
			`self.tokens = []`
			`self._AddTokens(IDLLexer.tokens)`
			`self._AddKeywords(IDLLexer.keywords)`
			`self._lexobj = None`
			`self.last = None`
			`self.lines = None`

			`# If run by itself, attempt to build the lexer`
			`if __name__ == '__main__':`
			`lexer_object = IDLLexer()`