"""
Copyright (c) 2005, Anthony Roberts
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following
disclaimer in the documentation and/or other materials provided
with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""

from array import array
from cStringIO import StringIO

import sys
import exceptions
import re
import types


def accumulator (i = 0):
  while True:
    yield i
    i += 1


class UnoptimizableException(exceptions.Exception):
  pass


class NoMatchException(exceptions.Exception):
  pass


class RuleMatch:
  def __init__ (self, text, action):
    self.text = text
    self.action = action

class Rule:
  """This class maps input onto actions according to regular expression
  patterns."""

  @staticmethod
  def ruleFactory (ruleList):
    if len(ruleList) == 0:
      return None

    try:
      return CompositeRule(ruleList)
    except UnoptimizableException:
      return SimpleRule(ruleList)

  def __init__ (self):
    # Our component of the rule list should contain patterns and actions
    # associated with those patterns. These should be 2-tuples of the form:
    # (pattern, action).  See Scanner class for info on actions as it is solely
    # responsible for providing and interpreting them.
    self.ruleList = []

    self.child = None
    self.inited = False

  def addRule (self, pattern, action):
    # make sure it's a valid regex independantly
    re.compile(pattern)
    
    self.ruleList.append((pattern, action))

  def init (self):
    if len(self.ruleList) != 0:
      self.child = Rule.ruleFactory(self.ruleList)
    else:
      self.child = None

    self.inited = True

  def match (self, incom):
    if not self.inited:
      raise exceptions.Exception("not initialized")
    if self.child == None:
      raise NoMatchException()

    return self.child.match(incom)


class SimpleRule:
  """This class provides a simple wrapper around one pattern in order to
  accomodate rules with the maximum number of groups or unsupportable
  features (eg option flags)."""

  def __init__ (self, ruleList):
    ruleList = ruleList

    self.count = 0
    self.pattern, self.action = ruleList[0]
    self.pattern = re.compile(self.pattern)
    self.child = Rule.ruleFactory(ruleList[1:])

  def match (self, incom):
    m = self.pattern.match(incom)

    if m != None:
      return RuleMatch(m.group(0), self.action)
    elif self.child != None:
      return self.child.match(incom)
    else:
      raise NoMatchException()


class CompositeRule:
  """This class provides a wrapper around multiple patterns such that all can
  be tested against a string."""

  # This is the maximum number of subgroups. This is a limitation in the
  # Python regex implementation. The actual limit is 100 but the whole pattern
  # counts as the first, so the most we have access to is 99.
  maxgroups = 99

  """This regex is used to detect unacceptable (enumerable)
  subexpressions."""
  unacceptableSubs = re.compile(
    r'(?<!\\)\((' # don't match if preceeding backslash escapes paren
    '(?P<option>' # options, these can't be in a shared pattern
    r'\?[iLmsux]+'
    ')|(?P<named>' # named group (grouping)
    r'\?P<(?P<name>[A-Za-z_][A-Za-z0-9_]*?)>'
    ')|(?P<named2>' # named group match (requires named group)
    r'\?P=(?P<name2>[A-Za-z_][A-Za-z0-9_]*?)'
    ')|(?P<yesno>' # yes/no pattern (requires groups)
    r'\?\((?P<target>([0-9]+)|([A-Za-z_][A-Za-z0-9_]*))\)'
    ')|(?P<noext>' # a paren without a following question mark (grouping)
    r'(?!\?)'
    '))')

  def __init__ (self, ruleList):
    # these serve the same function as in Rule
    self.ruleList = ruleList

    # mapping of patterns onto actions so we know what action to return on
    # a match
    self.actions = {}

    # regex patterns merged into one
    self.pattern = None

    # Fallback for unmergable patterns, or extras if there are too many
    # subgroups. This is a CompositeRule object on which match() gets called
    # recursively.
    self.child = None

    # first and last group numbers for the patterns in this object
    self.count = 0
    self.last = 0

    # merge patterns that can be merged, push the rest into child SimpleRules
    # and CompositeRules
    self.optimizeRules()

  def optimizeSubexpression (self, match, num, namenum):
    groups = match.groupdict()

    if groups['option'] != None:
      # The flags subexpression affects settings for the whole regex. It isn't
      # allowed in the merged pattern.
            
      raise UnoptimizableException()

    elif groups['noext'] != None:
      # Enumerated grouping is broken because patterns are merged. We could
      # keep it without hurting anything but convert it to non-grouping because
      # Python has a limit of 100 groups, and using one of them up without
      # getting anything out of it is a waste. Stuff that cares will fall back
      # on SimpleRule.
      
      return '(?:'

    elif groups['named'] != None:
      # Group names are broken because there might be name collisions in the
      # merged expression. Add a prefix unique to this subexpression. Increment
      # the group counter because this uses up a group.

      namenum = namenum.next()
      return '(?P<_%s_%s>' % (str(num), groups['name'])

    elif groups['named2'] != None:
      # Group names are broken because there might be name collisions in the
      # merged expression. Add a prefix unique to this subexpression. Don't
      # increment the group counter because these are non-grouping.

      return '(?P=_%s_%s' % (str(num), groups['name2'])

    elif groups['yesno'] != None:
      # These only work correctly at the top level of the regex, but I
      # automatically put every pattern in its own group so I can tell which
      # matches. Therefore, these cannot participate in a CompositeRule.

      raise UnoptimizableException()

    else:
      # Every named group in the pattern is tested, every tested group name
      # is in the regex, and the regex cannot match without one of the known
      # groups being present. This can only be encountered if the regex has
      # been edited.

      raise exceptions.Exception('no known groups matched'
                                 ' (has the CompositeRule.unacceptableSubs'
                                 ' regex been replaced?)')

  def optimizeRule (self, pattern, num):
    nameacc = accumulator(1)
    namedict = {}

    ret1 = self.unacceptableSubs.sub(lambda match:
                                     self.optimizeSubexpression(match, num,
                                                                nameacc),
                                     pattern)
    # This number would be +1 too much, but the rule gets wrapped in another
    # subexpression or alternately the 0th group is the whole pattern.
    ret2 = nameacc.next()

    return ret1, ret2
  
  def optimizeRules (self):
    # running total number of subgroups that we've assembled for this object
    total = 0

    # total number of mergable patterns so far (can include parents)
    count = accumulator(self.count)

    optimizedPatterns = []
    
    for rule in self.ruleList:
      pattern, action = rule
      cur = count.next()
      
      try:
        opt, size = self.optimizeRule(pattern, cur)
        total += size
      except UnoptimizableException, e:
        if total == 0:
          raise e
        self.child = Rule.ruleFactory(self.ruleList[cur - self.count:])
        break

      if total > self.maxgroups:
        self.child = Rule.ruleFactory(self.ruleList[cur - self.count:])
        break
      else:
        curstr = '_' + str(cur)
        optimizedPatterns.append('(?P<%s>%s)' % (curstr, opt))
        self.actions[cur] = action

    pattern = []

    for i in optimizedPatterns:
      pattern.append(i)
      pattern.append('|')
    pattern = pattern[:-1]
    self.pattern = ''.join(pattern)
    self.pattern = re.compile(self.pattern)

    self.last = cur

  def match (self, incom):
    m = self.pattern.match(incom)
    if m == None:
      if self.child == None:
        raise NoMatchException()
      return self.child.match(incom)

    groups = m.groupdict()
    for i in xrange(0, self.last + 1):
      if groups['_' + str(i)] != None:
        return RuleMatch(m.group(0), self.actions[i])


class Scanner:
  """This class matches rules against the input for the purpose of performing
  the specified action (a function) on the matching text."""

  def __init__ (self, buffLen = 65536):
    self.states = {}
    self.buffLen = buffLen

    # to be initialized by the tokenizer function
    self.buff = None
    self.incom = None
    self.state = None

  def addRule (self, pattern, action = None, state = None):
    """Adds a rule mapping a (state, pattern) pair onto an action. If the
    action is None, a default action of consume is used. If the state is None,
    it will work with the scanner's default state of None."""

    if state not in self.states:
      self.states[state] = Rule()
    self.states[state].addRule(pattern, action)

  def init (self):
    """Initializes this object such that it is ready to begin accepting
    input."""

    for i in self.states:
      self.states[i].init()

  def restoreBuff (self):
    """Restores the buffer to at least the number of characters specified to
    the constructor (default 64 kb)."""

    n = self.buffLen - len(self.buff)

    """pushbacks have us over the minimum, don't read anything"""
    if n < 0:
      n = 0

    if n:
      if type(self.incom) == array:
        self.buff.extend(self.incom[:n])
      else:
        raise exceptions.Exception("input type must be string")

  def pushback (self, text):
    """Pushes text back into the buffer to be evaluated as if it had come from
    input."""
    self.incom.insert(0, text)

  def getTokenizer (self, incom):
    self.state = None
    self.buff = array('c')

    """we need a buffer that can be efficiently shortened"""
    if type(incom) == types.StringType:
      self.incom = array('c', incom)
    else:
      raise exceptions.Exception("input type must be string")

    """Outputs tokens for the scanner."""
    while True:
      if len(self.incom) == 0:
        break

      self.restoreBuff()

      m = self.states[self.state].match(self.incom)

      """Discard the text that matched the rule."""
      for i in xrange(len(m.text)):
        self.incom.pop(0)

      yield m

  def scanInput (self, incom):
    t = self.getTokenizer(incom)

    """iterate over tokens, performing the specified actions"""
    for i in t:
      if t.action != None
        i.action(i.text.tostring())

